|
24 | 24 | from treeprofiler.src import utils |
25 | 25 | from treeprofiler.src.phylosignal import run_acr_discrete, run_acr_continuous, run_delta |
26 | 26 | from treeprofiler.src.ls import run_ls |
27 | | -from treeprofiler.src import b64pickle |
| 27 | +from treeprofiler.src import ete_format |
28 | 28 |
|
29 | 29 | from multiprocessing import Pool |
30 | 30 |
|
@@ -112,7 +112,7 @@ def populate_annotate_args(parser): |
112 | 112 | # help=("<kingdom|phylum|class|order|family|genus|species|subspecies> " |
113 | 113 | # "reference tree from taxonomic database")) |
114 | 114 | add('--taxadb', type=str.upper, |
115 | | - choices=['NCBI', 'GTDB', 'customdb'], |
| 115 | + choices=['NCBI', 'GTDB', 'MOTUS', 'customdb'], |
116 | 116 | help="<NCBI|GTDB> for taxonomic annotation or fetch taxatree") |
117 | 117 | add('--gtdb-version', type=int, |
118 | 118 | choices=[95, 202, 207, 214, 220], |
@@ -648,6 +648,17 @@ def run_tree_annotate(tree, input_annotated_tree=False, |
648 | 648 | else: |
649 | 649 | logger.info("No specific version or dump file provided; using latest GTDB data...") |
650 | 650 | GTDBTaxa().update_taxonomy_database() |
| 651 | + elif taxadb == 'MOTUS': |
| 652 | + if gtdb_version and taxa_dump: |
| 653 | + logger.error('Please specify either GTDB version or taxa dump file, not both.') |
| 654 | + sys.exit(1) |
| 655 | + if taxa_dump: |
| 656 | + logger.info(f"Loading GTDB database dump file {taxa_dump}...") |
| 657 | + GTDBTaxa().update_taxonomy_database(taxa_dump) |
| 658 | + else: |
| 659 | + logger.info("No specific version or dump file provided; using latest GTDB data...") |
| 660 | + motus_dump = download_motus_dump() |
| 661 | + GTDBTaxa().update_taxonomy_database(motus_dump) |
651 | 662 | elif taxadb == 'NCBI': |
652 | 663 | if taxa_dump: |
653 | 664 | logger.info(f"Loading NCBI database dump file {taxa_dump}...") |
@@ -924,7 +935,7 @@ def run(args): |
924 | 935 |
|
925 | 936 | ### out ete |
926 | 937 | with open(os.path.join(args.outdir, base+'_annotated.ete'), 'w') as f: |
927 | | - f.write(b64pickle.dumps(annotated_tree, encoder='pickle', pack=False)) |
| 938 | + f.write(ete_format.dumps(annotated_tree, encoder='pickle', pack=False)) |
928 | 939 |
|
929 | 940 | ### out tsv |
930 | 941 | prop_keys = list(prop2type.keys()) |
@@ -1780,7 +1791,7 @@ def merge_dictionaries(dict_ranks, dict_names): |
1780 | 1791 | return merged_dict |
1781 | 1792 |
|
1782 | 1793 |
|
1783 | | - if db == "GTDB": |
| 1794 | + if db == "GTDB" or "MOTUS": |
1784 | 1795 | gtdb = GTDBTaxa() |
1785 | 1796 | tree.set_species_naming_function(return_spcode_gtdb) |
1786 | 1797 | gtdb.annotate_tree(tree, taxid_attr="species", ignore_unclassified=ignore_unclassified) |
@@ -1840,28 +1851,27 @@ def merge_dictionaries(dict_ranks, dict_names): |
1840 | 1851 |
|
1841 | 1852 | return tree, rank2values |
1842 | 1853 |
|
1843 | | -# def annotate_evol_events(tree, sp_delimiter='.', sp_field=0): |
1844 | | -# def return_spcode(leaf): |
1845 | | -# try: |
1846 | | -# return leaf.name.split(sp_delimiter)[sp_field] |
1847 | | -# except (IndexError, ValueError): |
1848 | | -# return leaf.name |
1849 | | - |
1850 | | -# tree.set_species_naming_function(return_spcode) |
1851 | | - |
1852 | | -# node2species = tree.get_cached_content('species') |
1853 | | -# for n in tree.traverse(): |
1854 | | -# n.props['species'] = node2species[n] |
1855 | | -# if len(n.children) == 2: |
1856 | | -# dup_sp = node2species[n.children[0]] & node2species[n.children[1]] |
1857 | | -# if dup_sp: |
1858 | | -# n.props['evoltype'] = 'D' |
1859 | | -# n.props['dup_sp'] = ','.join(dup_sp) |
1860 | | -# n.props['dup_percent'] = round(len(dup_sp)/len(node2species[n]), 3) * 100 |
1861 | | -# else: |
1862 | | -# n.props['evoltype'] = 'S' |
1863 | | -# n.del_prop('_speciesFunction') |
1864 | | -# return tree |
| 1854 | +def download_motus_dump(): |
| 1855 | + from hashlib import md5 |
| 1856 | + import requests |
| 1857 | + |
| 1858 | + url = "https://github.com/dengzq1234/ete-data/raw/refs/heads/main/motus_taxonomy/motus_latest_dump.tar.gz" |
| 1859 | + fname = './motus_latest_dump.tar.gz' |
| 1860 | + if not os.path.exists(fname): |
| 1861 | + print(f'Downloading {fname} from {url} ...') |
| 1862 | + with open(fname, 'wb') as f: |
| 1863 | + f.write(requests.get(url).content) |
| 1864 | + else: |
| 1865 | + md5_local = md5(open(fname, 'rb').read()).hexdigest() |
| 1866 | + md5_remote = requests.get(url + '.md5').text.split()[0] |
| 1867 | + |
| 1868 | + if md5_local != md5_remote: |
| 1869 | + print(f'Updating {fname} from {url} ...') |
| 1870 | + with open(fname, 'wb') as f: |
| 1871 | + f.write(requests.get(url).content) |
| 1872 | + else: |
| 1873 | + print(f'File {fname} is already up-to-date with {url} .') |
| 1874 | + return fname |
1865 | 1875 |
|
1866 | 1876 | def annotate_evol_events(tree, sos_thr=0.0, sp_delimiter='.', sp_field=0): |
1867 | 1877 | def return_spcode(leaf): |
|
0 commit comments