diff --git a/scripts/query_taxon_clades/main_DBs_con_dict_hierarchical_taxonomy.tsv.gz b/scripts/query_taxon_clades/main_DBs_con_dict_hierarchical_taxonomy.tsv.gz new file mode 100644 index 0000000..c6a79ba Binary files /dev/null and b/scripts/query_taxon_clades/main_DBs_con_dict_hierarchical_taxonomy.tsv.gz differ diff --git a/scripts/query_taxon_clades/make_hierarchy_table.py b/scripts/query_taxon_clades/make_hierarchy_table.py new file mode 100644 index 0000000..1b65da1 --- /dev/null +++ b/scripts/query_taxon_clades/make_hierarchy_table.py @@ -0,0 +1,80 @@ +#! usr/bin/python3 +### load packages +import pickle, copy, datetime, shutil, argparse, configparser +import os, logging, sys +import glob +import json +import pandas as pd +import pytaxonkit +import itertools +#from _utils import util_filter_out_main_dbnames + +### function grabbed from _utils.py +def util_filter_out_main_dbnames(db_iterable): + ''' + takes an iterable of db names and returns a list containing a subset that includes only the + latest version of refseq. + ''' + out = [] + latest_refseq_ver = -1 + latest_refseq_name = '' + + for nm in db_iterable: + if nm[:6].lower()=='refseq': + v = int(nm[8:]) + if v > latest_refseq_ver: + latest_refseq_ver = v + latest_refseq_name = nm + else: + out.append(nm) + out.append(latest_refseq_name) + return out + +### set directories and files. replace with variable +pathname = os.path.dirname( __file__ ) +script_path = os.path.abspath(pathname) +up_path = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..')) +#dqt_dir = "/Users/tiszamj/Documents/mike_tisza/database_taxa_query/metscale/scripts" +db_taxids = "containment_dict.json" + +### add if is file +if os.path.isfile(up_path + "/" + db_taxids): + f = open(up_path + "/" + db_taxids) + taxid_dict = json.load(f) +else: + print ("file not found: " + up_path + "/" + db_taxids) + exit() + +main_db_list = util_filter_out_main_dbnames(taxid_dict["taxid_lists"]) +if main_db_list: + print ("DBs for table construction: ") + for i in main_db_list: + print (i) +else: + print ("list of main DBs not found.") + exit() + +### make empty df +large_tax_hi_df = pd.DataFrame(columns=['TaxID','kingdom','phylum','class','order','family','genus','species','DB']) +#for item in itertools.islice(mylist, n): +### Loops through lists of taxids from each of the main DBs +print ("Looping through DBs to get hierarchy") +for i in main_db_list : + print (i) + i_IDs = taxid_dict["taxid_lists"][i] + result = pytaxonkit.lineage(i_IDs, data_dir = "/Users/tiszamj/Documents/mike_tisza/database_taxa_query/taxdmp/", threads=16, formatstr = "{K};{p};{c};{o};{f};{g};{s}") + result_df = pd.DataFrame(result[['TaxID','LineageTaxIDs']]) + result_df[['kingdom','phylum','class','order','family','genus','species']] = result_df['LineageTaxIDs'].str.split(';',expand=True) + result_df = result_df.drop(['LineageTaxIDs'], axis=1) + result_df['DB'] = i + large_tax_hi_df = pd.concat([large_tax_hi_df, result_df]) + +if large_tax_hi_df.empty: + print ("output table not found. ") + exit() +else: + print ("table rows = " + str(len(large_tax_hi_df))) +### write as tsv file +large_tax_hi_df.to_csv(script_path + "/main_DBs_con_dict_hierarchical_taxonomy.tsv", sep = "\t", index_label = False) + + diff --git a/scripts/query_taxon_clades/query_taxon_clades.py b/scripts/query_taxon_clades/query_taxon_clades.py new file mode 100644 index 0000000..36ec61d --- /dev/null +++ b/scripts/query_taxon_clades/query_taxon_clades.py @@ -0,0 +1,117 @@ +#! usr/bin/python3 +### load packages +import pickle, copy, datetime, shutil, argparse, configparser +import os, logging, sys +import glob +import json +import pandas as pd +import pytaxonkit +import itertools +import subprocess + + +### str2bool not currently used, but could help with future options +def str2bool(v): + if isinstance(v, bool): + return v + if v.lower() in ('yes', 'true', 't', 'y', '1'): + return True + elif v.lower() in ('no', 'false', 'f', 'n', '0'): + return False + else: + raise argparse.ArgumentTypeError('Boolean value expected.') + +### set directories and files. replace with variable +pathname = os.path.dirname( __file__ ) +script_path = os.path.abspath(pathname) +up_path = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '..')) + +parser = argparse.ArgumentParser(description='Query DBs for all records at or below taxonomical level for given taxid.' \ + 'For example, if taxid for genus Bacteroides is given, # sequence records within Bacteroides are returned. ') + +required_args = parser.add_argument_group(' REQUIRED ARGUMENTS for query hierarchical taxon') + +help_taxid_list = 'The NCBI taxonomy ID(s) to query against the containment_dict metadatabase. Can be in one of ' \ + 'three forms: a) a single integer, which will be queried and the results output, b) a valid path ' \ + 'to a text file containing one taxon ID per line, all of which will be queried and the results ' \ + 'output as tab-delimited text, or c) \'stdin\' in which case a list of taxon_ids is pulled from ' \ + 'standard input and processed as with a file.' + +required_args.add_argument('-t', '--taxids', dest='taxid_list', type=str, default=None, help=help_taxid_list) + +help_db_table = 'Database hierarchical table. Can be generated from script make_hierarchy_table.py' +required_args.add_argument('-d', '--db_table', dest='db_table', type=str, default= script_path + "/main_DBs_con_dict_hierarchical_taxonomy.tsv", help=help_db_table) + +help_output = 'output file name. TSV format.' +required_args.add_argument('-o', '--output', dest='output_file', type=str, default="output_hierarchical_taxonomy.tsv", help=help_output) + +args = parser.parse_args() + + + +taxids = [] +if args.taxid_list is None: + #options.parser_store.print_help() + sys.exit(1) +elif args.taxid_list == 'stdin': + # while True: + # foo = input() + for foo in sys.stdin: + if len(foo.strip())==0: + break + taxids.append(int(foo.strip())) +elif os.path.isfile(args.taxid_list): + tf = open(args.taxid_list, 'r') + tfl = tf.readlines() + taxids = list(map(lambda x: int(x.strip()), tfl)) +else: + try: + taxids.append(int(args.taxid_list)) + except: + logging.error('The taxonID list argument \'-t\' must be either a single integer string, or \'stdin\', or' + 'a valid file path') + #options.parser_store.print_help() + sys.exit(1) + +#db_table +if os.path.isfile(args.db_table): + large_tax_hi_df = pd.read_csv(args.db_table, sep = "\t", header=0) +elif os.path.isfile(args.db_table + ".gz"): + subprocess.run(['gzip', '-d', args.db_table + ".gz"]) + if os.path.isfile(args.db_table): + large_tax_hi_df = pd.read_csv(args.db_table, sep = "\t", header=0) + else: + print(str(args.db_table) + " not found") + exit() +else: + print(str(args.db_table) + " not found") + exit() + +### have to rename because "class" is a no-no column name +large_tax_hi_df = large_tax_hi_df.rename(columns={"class": "tax_class" }) + +### one column DF +DBs_df = pd.DataFrame({"DB": large_tax_hi_df.DB.unique()}) + +### actual querying +for taxid in taxids: + tmp = large_tax_hi_df.query('(TaxID == @taxid) or (kingdom == @taxid) or (phylum == @taxid) or (order == @taxid) or (family == @taxid) or (genus == @taxid) or (species == @taxid)') \ + .groupby('DB').count().reset_index()[['DB', 'TaxID']].rename(columns={"TaxID": "ID_" + str(taxid)}) + DBs_df = pd.merge(DBs_df, tmp, on="DB", how='left') + +### remove decimals +pd.options.display.float_format = '{:,.0f}'.format + +### current output print to terminal +print(DBs_df.fillna(0)) + +DBs_df.fillna(0).to_csv(args.output_file, sep="\t", index=False) +''' +NOTES +On my macbook pro, it took 3 seconds to query 50 taxids and 42 seconds to query 1000 taxids + +SUGGESTIONS/TO-DO +1. Change output file format and/or print the result in a more compact format ala the other tool +2. Query the taxdump file to get taxon level "species, genus, family, etc" for each query taxid, similar to other script and set the column names to, for example Genus_816 instead of ID_816 (line 96) + -- Mike Tisza, 2022-10-12 +''' \ No newline at end of file