diff --git a/biotiger/index.py b/biotiger/index.py index 6d0de0c..783ed20 100644 --- a/biotiger/index.py +++ b/biotiger/index.py @@ -139,40 +139,40 @@ def gen_prefix(input): return o def die_with_help(): - print """ -**************** -TIGER v2.0 Help: -**************** - -tiger index Options: - - -i|input Specify input file. File must be in FastA format and must be aligned prior. - Datasets with uneven sequence lengths will return an error. - - -s|split Split dataset across multiple files to run simultaneously. Takes int argument. - - -o|output Specify the prefix name of output files. - - -u|unknowns Specify unknown characters in the alignment. Unknown characters are omitted from - site patterns and so are not considered in the analysis. - -u ?,-,*: defines ?, - and * as unknown characters. (*Be sure to put only a comma - between characters, NO SPACE!!) - - Default is ? only - - Examples: - 1. Generate a .tgr file for complete sequence named full_seq.tgr & set unknowns to ? and - : - tiger index -i my_file.aln -o full_seq -u ?,- - - 2. Generate 10 subsets of the data with an output prefix of tiger_split and a reference: - tiger index -i my_file.aln -o tiger_split -s 10 - ** Results in files named tiger_split.1.tgr, tiger_split.2,tgr, and so on, along with - tiger_split.ref.tgr - - """ + print(""" + **************** + TIGER v2.0 Help: + **************** + + tiger index Options: + + -i|input Specify input file. File must be in FastA format and must be aligned prior. + Datasets with uneven sequence lengths will return an error. + + -s|split Split dataset across multiple files to run simultaneously. Takes int argument. + + -o|output Specify the prefix name of output files. + + -u|unknowns Specify unknown characters in the alignment. Unknown characters are omitted from + site patterns and so are not considered in the analysis. + -u ?,-,*: defines ?, - and * as unknown characters. (*Be sure to put only a comma + between characters, NO SPACE!!) + + Default is ? only + + Examples: + 1. Generate a .tgr file for complete sequence named full_seq.tgr & set unknowns to ? and - : + tiger index -i my_file.aln -o full_seq -u ?,- + + 2. Generate 10 subsets of the data with an output prefix of tiger_split and a reference: + tiger index -i my_file.aln -o tiger_split -s 10 + ** Results in files named tiger_split.1.tgr, tiger_split.2,tgr, and so on, along with + tiger_split.ref.tgr + + """) sys.exit(1) def die_with_message(message): - print message + print(message) sys.exit(1) \ No newline at end of file diff --git a/biotiger/output.py b/biotiger/output.py index 21473fb..04109fb 100644 --- a/biotiger/output.py +++ b/biotiger/output.py @@ -295,56 +295,56 @@ def run(opts): except IOError: die_with_message("Cannot open outfile %s%s" % (opts.output, suffix)) else: - print output_str + print(output_str) def die_with_help(): - print """ -**************** -TIGER v2.0 Help: -**************** - -tiger output Options: + print(""" + **************** + TIGER v2.0 Help: + **************** - -i|input Specify input file. Must be in .gr format. - - -c|combine Specify input file. This file should contain a list of .gr files to be combined. - - -fa|fasta Provide original .fa file for sequence data. - - -o|output Specify prefix name for output files (prints to stdout if not provided) - - -f|format Changes formatting options. - NEXUS, with comments: - -f 0: output bin numbers, sites unsorted (default) - -f 1: output bin number, sites sorted on rank - -f 2: displays rank values rather than bin numbers - -f 3: displays rank values and sites sorted on rank - FastA: - -f 4 - - -inc|include_only Give list of bins to include (only) in output - -inc 3,4,5,6 (Note: No spaces, just commas) - - -exc|exclude_only Give list of bins to exclude from output - -exc 1,2,9,10 - - -m|mask Mask -inc/-exc sites. (Default is to remove them) - - -b|bins Set the number of bins to be used. - -b : Sites will be placed into number of bins. is a whole number. - - Default is 10. - Examples: - 1. Write a FastA file, masking site that fall into Bin1, Bin2, Bin9 and Bin10 of 10 bins: - tiger output -i sample.gr -fa my_data.fa -f 4 -exc 1,2,9,10 -b 10 --mask - - 2. Write a NEXUS file combining test.0.gr, test.1.gr, test.2.gr with sites sorted on rank - tiger output -c list_of_gr_files.txt -fa my_data.fa -f 3 - - """ + tiger output Options: + + -i|input Specify input file. Must be in .gr format. + + -c|combine Specify input file. This file should contain a list of .gr files to be combined. + + -fa|fasta Provide original .fa file for sequence data. + + -o|output Specify prefix name for output files (prints to stdout if not provided) + + -f|format Changes formatting options. + NEXUS, with comments: + -f 0: output bin numbers, sites unsorted (default) + -f 1: output bin number, sites sorted on rank + -f 2: displays rank values rather than bin numbers + -f 3: displays rank values and sites sorted on rank + FastA: + -f 4 + + -inc|include_only Give list of bins to include (only) in output + -inc 3,4,5,6 (Note: No spaces, just commas) + + -exc|exclude_only Give list of bins to exclude from output + -exc 1,2,9,10 + + -m|mask Mask -inc/-exc sites. (Default is to remove them) + + -b|bins Set the number of bins to be used. + -b : Sites will be placed into number of bins. is a whole number. + + Default is 10. + Examples: + 1. Write a FastA file, masking site that fall into Bin1, Bin2, Bin9 and Bin10 of 10 bins: + tiger output -i sample.gr -fa my_data.fa -f 4 -exc 1,2,9,10 -b 10 --mask + + 2. Write a NEXUS file combining test.0.gr, test.1.gr, test.2.gr with sites sorted on rank + tiger output -c list_of_gr_files.txt -fa my_data.fa -f 3 + + """) sys.exit(1) def die_with_message(message): - print message + print(message) sys.exit(1) \ No newline at end of file diff --git a/biotiger/rate.py b/biotiger/rate.py index 0ff9f68..a9637f6 100644 --- a/biotiger/rate.py +++ b/biotiger/rate.py @@ -81,7 +81,7 @@ def set_pattern(p): def sort(rates, patterns): if len(rates) != len(patterns): - print "Something's weird here. len(rates) != len(patterns)" + print("Something's weird here. len(rates) != len(patterns)") sys.exit(1) rate_d = {} @@ -160,51 +160,51 @@ def gen_prefix(input): return o def die_with_help(): - print """ -**************** -TIGER v2.0 Help: -**************** - -tiger rate Options: - - -i|input Specify input file. File should be in .ti format. - - -r|reference Specify reference sequence (.ti). -i file is used as default if none is provided. - - -o|output Specify prefix name for output files. + print(""" + **************** + TIGER v2.0 Help: + **************** - -rl|rate_list A list of the rate at each site may be optionally written to a specified - file. - -rl : writes list of the rates at each site to file.txt. - - -ptp Specifies that a PTP test should be run. - * Note * this option has a huge effect on running time! - - -z|randomisations Number of randomisations to be used for the PTP test. - -z : each site will be randomised times. is a whole number. - - Default is 100 - - -p|p_value Specify p-value which denotes significance in PTP test. - -p : site will be denoted as significant if p-value is better than . - is a floating point number. - - Default is 0.05 - - -pl|pval_list Write a list of p-values to a specified file. - -pl : writes list of p-values for each site to file.txt. - - Examples: - 1. Calculate rates for file test.ref.ti against itself, with a list of rates: - tiger rate -i test.ref.ti -rl - 2. Calculate rates for file test.0.ti against test.ref.ti with a PTP test and a list of p values - tiger rate -i test.0.ti -r test.ref.ti -ptp -pl - - """ + tiger rate Options: + + -i|input Specify input file. File should be in .ti format. + + -r|reference Specify reference sequence (.ti). -i file is used as default if none is provided. + + -o|output Specify prefix name for output files. + + -rl|rate_list A list of the rate at each site may be optionally written to a specified + file. + -rl : writes list of the rates at each site to file.txt. + + -ptp Specifies that a PTP test should be run. + * Note * this option has a huge effect on running time! + + -z|randomisations Number of randomisations to be used for the PTP test. + -z : each site will be randomised times. is a whole number. + + Default is 100 + + -p|p_value Specify p-value which denotes significance in PTP test. + -p : site will be denoted as significant if p-value is better than . + is a floating point number. + + Default is 0.05 + + -pl|pval_list Write a list of p-values to a specified file. + -pl : writes list of p-values for each site to file.txt. + + Examples: + 1. Calculate rates for file test.ref.ti against itself, with a list of rates: + tiger rate -i test.ref.ti -rl + 2. Calculate rates for file test.0.ti against test.ref.ti with a PTP test and a list of p values + tiger rate -i test.0.ti -r test.ref.ti -ptp -pl + + """) sys.exit(1) def die_with_message(message): - print message + print(message) sys.exit(1) diff --git a/tiger b/tiger index 46ce39b..d7457c5 100755 --- a/tiger +++ b/tiger @@ -45,121 +45,121 @@ def parse_args(args): return (mode, opts) def die_with_help(): - print """ -**************** -TIGER v2.0 Help: -**************** - -TIGER: Tree-Independent Generation of Evolutionary Rates - -Unlike the initial version, TIGER v2.0 is split into 3 modes, allowing for comparisons to be run -concurrently. -First, the input file needs to be broken down and indexed (index mode). It can be broken into as -many pieces as you wish, all of which can be compared on seperate processors. An index file for -the full dataset will be generated too, so that each piece can be compared back to it (reference -index). -Next, the rate calculation must be performed (rate mode). Each index file should be compared to -the reference index. -Finally, the data can be combined and output in a number of formats with masking options. - -Modes: - index: prepare the data for rate calculation, generate 'tiger index' (.ti) file(s). - rate: preform calculation of tiger rate for each site, create 'generated rates' (.gr) file(s). - output: write sequence files based on .gr files, integrate rates from a split analysis into a - single file, mask and edit alignment based on tiger rates. - -It may be noted that both .ti and .gr files are python cpickle dumps and can be browsed using iPython -cpickle: https://docs.python.org/2/library/pickle.html#module-cPickle -ipython: https://ipython.org/ - -1. tiger index Options: - - -i|input Specify input file. File must be in FastA format and must be aligned prior. - Datasets with uneven sequence lengths will return an error. - - -s|split Split dataset across multiple files to run simultaneously. Takes int argument. - - -o|output Specify the prefix name of output files. - - -u|unknowns Specify unknown characters in the alignment. Unknown characters are omitted from - site patterns and so are not considered in the analysis. - -u ?,-,*: defines ?, - and * as unknown characters. (*Be sure to put only a comma - between characters, NO SPACE!!) - - Default is ? only - - Examples: - 1. Generate a .ti file for complete sequence named full_seq.ti & set unknowns to ? and - : - tiger index -i my_file.aln -o full_seq -u ?,- - - 2. Generate 10 subsets of the data with an output prefix of tiger_split and a reference: - tiger index -i my_file.aln -o tiger_split -s 10 - ** Results in files named tiger_split.0.ti, tiger_split.1.ti, and so on, along with - tiger_split.ref.ti - -2. tiger rate Options: - - -i|input Specify input file. File should be in .ti format. - - -r|reference Specify reference sequence (.ti). -i file is used as default if none is provided. - - -o|output Specify prefix name for output files. + print(""" + **************** + TIGER v2.0 Help: + **************** - -rl|rate_list A list of the rate at each site may be optionally written to a specified - file. - -rl : writes list of the rates at each site to file.txt. - - Default is .rates if no filename is specified - - Examples: - 1. Calculate rates for file test.ref.ti against itself and create a file containing a list of rates: - tiger rate -i test.ref.ti -rl - 2. Calculate rates for file test.0.ti against the reference index (test.ref.ti) - tiger rate -i test.0.ti -r test.ref.ti - -3. tiger output Options: + TIGER: Tree-Independent Generation of Evolutionary Rates - -i|input Specify input file. Must be in .gr format. - - -c|combine Specify input file. This file should contain a list of .gr files to be combined. - - -fa|fasta Provide original .fa file for sequence data. - - -o|output Specify prefix name for output files (prints to stdout if not provided) - - -f|format Changes formatting options. - NEXUS, with comments: - -f 0: output bin numbers, sites unsorted - -f 1: output bin number, sites sorted on rank - -f 2: displays rank values rather than bin numbers - -f 3: displays rank values and sites sorted on rank - FastA (default): - -f 4 - - -inc|include_only Give list of bins to include - -inc 3,4,5,6 (Note: No spaces, just commas) - - -exc|exclude_only Give list of charsets to exclude - -exc 1,2,9,10 - - -m|mask Mask -inc/-exc sites rather than removing them (default) - - -b|bins Set the number of bins to be used. - -b : Sites will be placed into number of bins. is a whole number. - - Default is 10. - Examples: - 1. Write a FastA file, masking site that fall into Bin1, Bin2, Bin9 and Bin10 of 10 bins: - tiger output -i sample.gr -fa my_data.fa -exc 1,2,9,10 -b 10 --mask - - 2. Write a NEXUS file combining test.0.gr, test.1.gr, test.2.gr with sites sorted on rank - tiger output -c list_of_gr_files.txt -fa my_data.fa -f 3 - - """ + Unlike the initial version, TIGER v2.0 is split into 3 modes, allowing for comparisons to be run + concurrently. + First, the input file needs to be broken down and indexed (index mode). It can be broken into as + many pieces as you wish, all of which can be compared on seperate processors. An index file for + the full dataset will be generated too, so that each piece can be compared back to it (reference + index). + Next, the rate calculation must be performed (rate mode). Each index file should be compared to + the reference index. + Finally, the data can be combined and output in a number of formats with masking options. + + Modes: + index: prepare the data for rate calculation, generate 'tiger index' (.ti) file(s). + rate: preform calculation of tiger rate for each site, create 'generated rates' (.gr) file(s). + output: write sequence files based on .gr files, integrate rates from a split analysis into a + single file, mask and edit alignment based on tiger rates. + + It may be noted that both .ti and .gr files are python cpickle dumps and can be browsed using iPython + cpickle: https://docs.python.org/2/library/pickle.html#module-cPickle + ipython: https://ipython.org/ + + 1. tiger index Options: + + -i|input Specify input file. File must be in FastA format and must be aligned prior. + Datasets with uneven sequence lengths will return an error. + + -s|split Split dataset across multiple files to run simultaneously. Takes int argument. + + -o|output Specify the prefix name of output files. + + -u|unknowns Specify unknown characters in the alignment. Unknown characters are omitted from + site patterns and so are not considered in the analysis. + -u ?,-,*: defines ?, - and * as unknown characters. (*Be sure to put only a comma + between characters, NO SPACE!!) + + Default is ? only + + Examples: + 1. Generate a .ti file for complete sequence named full_seq.ti & set unknowns to ? and - : + tiger index -i my_file.aln -o full_seq -u ?,- + + 2. Generate 10 subsets of the data with an output prefix of tiger_split and a reference: + tiger index -i my_file.aln -o tiger_split -s 10 + ** Results in files named tiger_split.0.ti, tiger_split.1.ti, and so on, along with + tiger_split.ref.ti + + 2. tiger rate Options: + + -i|input Specify input file. File should be in .ti format. + + -r|reference Specify reference sequence (.ti). -i file is used as default if none is provided. + + -o|output Specify prefix name for output files. + + -rl|rate_list A list of the rate at each site may be optionally written to a specified + file. + -rl : writes list of the rates at each site to file.txt. + + Default is .rates if no filename is specified + + Examples: + 1. Calculate rates for file test.ref.ti against itself and create a file containing a list of rates: + tiger rate -i test.ref.ti -rl + 2. Calculate rates for file test.0.ti against the reference index (test.ref.ti) + tiger rate -i test.0.ti -r test.ref.ti + + 3. tiger output Options: + + -i|input Specify input file. Must be in .gr format. + + -c|combine Specify input file. This file should contain a list of .gr files to be combined. + + -fa|fasta Provide original .fa file for sequence data. + + -o|output Specify prefix name for output files (prints to stdout if not provided) + + -f|format Changes formatting options. + NEXUS, with comments: + -f 0: output bin numbers, sites unsorted + -f 1: output bin number, sites sorted on rank + -f 2: displays rank values rather than bin numbers + -f 3: displays rank values and sites sorted on rank + FastA (default): + -f 4 + + -inc|include_only Give list of bins to include + -inc 3,4,5,6 (Note: No spaces, just commas) + + -exc|exclude_only Give list of charsets to exclude + -exc 1,2,9,10 + + -m|mask Mask -inc/-exc sites rather than removing them (default) + + -b|bins Set the number of bins to be used. + -b : Sites will be placed into number of bins. is a whole number. + + Default is 10. + Examples: + 1. Write a FastA file, masking site that fall into Bin1, Bin2, Bin9 and Bin10 of 10 bins: + tiger output -i sample.gr -fa my_data.fa -exc 1,2,9,10 -b 10 --mask + + 2. Write a NEXUS file combining test.0.gr, test.1.gr, test.2.gr with sites sorted on rank + tiger output -c list_of_gr_files.txt -fa my_data.fa -f 3 + + """) sys.exit(1) def die_with_message(message): - print message + print(message) sys.exit(1)