diff --git a/.circleci/config.yml b/.circleci/config.yml index ff30bc73..f079542b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -88,7 +88,7 @@ jobs: source activate ./atlasenv atlas init --db-dir $DATABASE_DIR --threads 1 --working-dir test/Getenvs test/reads/empty - run: - name: install environements + name: install environments command: | source activate ./atlasenv atlas run all --working-dir test/Getenvs --conda-create-envs-only --cores all diff --git a/.github/workflows/codespell.yml b/.github/workflows/codespell.yml index 7b8da68b..dd0eb8e5 100644 --- a/.github/workflows/codespell.yml +++ b/.github/workflows/codespell.yml @@ -21,7 +21,3 @@ jobs: uses: actions/checkout@v4 - name: Codespell uses: codespell-project/actions-codespell@v2 - with: - check_filenames: true - skip: ".git,*.pdf,*.svg,versioneer.py,*.css,*.html" - check_hidden: true diff --git a/.github/workflows/python-package-conda.yml b/.github/workflows/python-package-conda.yml index ee3faedb..8a8d10d0 100644 --- a/.github/workflows/python-package-conda.yml +++ b/.github/workflows/python-package-conda.yml @@ -106,7 +106,7 @@ jobs: path: databases key: conda-envs-assembly - # - name: upack conda envs + # - name: unpack conda envs # if: steps.get-envs.outputs.cache-hit != 'true' # run: tar -xzf assembly_conda_envs.tar.gz @@ -198,7 +198,7 @@ jobs: path: wd key: assembly-working-dir - - name: dryrun assembly shold need nothing to be done + - name: dryrun assembly should need nothing to be done run: | ls -l wd ls -l databases/conda_envs @@ -264,7 +264,7 @@ jobs: fail-on-cache-miss: true key: assembly-working-dir - - name: dryrun assembly shold need nothing to be done + - name: dryrun assembly should need nothing to be done run: | ls -l wd ls -l databases diff --git a/CHANGELOG.md b/CHANGELOG.md index 89e1b99e..a6faadce 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,12 +20,12 @@ Fix error with downloading DRAM. Update to DRAM v1.5 - Qc reads, assembly are now written in the sample.tsv from the start. This should fix errors of partial writing to the sample.tsv https://github.com/metagenome-atlas/atlas/issues/695 - It also allows you to add external assemblies. -- singletons reads are no longer used trough the pipeline. +- singleton reads are no longer used through the pipeline. - This changes the default paths for raw reads and assemblies. assembly are now in `Assembly/fasta/{sample}.fasta` reads: `QC/reads/{sample}_{fraction}.fastq.gz` -**Seemless update**: If you update atlas and continue on an old project. Your old files will be copied. +**Seamless update**: If you update atlas and continue on an old project, your old files will be copied. Or the path defined in the sample.tsv will be used. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 5e41cfbf..d940038c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -24,7 +24,7 @@ I hope we can help you... You can ask the maintainers to be added to the repository and work from a *branch* of the main atlas repository or you can work from a fork of the atlas repository. -Follow the [steps](https://github.com/metagenome-atlas/atlas#install-the-development-version-from-github) to set up the developpment version of atlas. This allows you to work with the code you have in the git repository. +Follow the [steps](https://github.com/metagenome-atlas/atlas#install-the-development-version-from-github) to set up the development version of atlas. This allows you to work with the code you have in the git repository. 
## Test the code ### Locally @@ -36,8 +36,8 @@ When you created a new rule and you want to test the output of this rule `my_tar -### Continous integration -When you make a pull request to the master branch. Each change in your code get's checked by continous integration (CI). The tests should make shure that your modification don't break any other use of atlas. However due to the requeirements needed during the execution of atlas, it is not possible to test all functionalities via CI. If you add functionalities to atlas, they should also be tested. Have a look at the scripts in `.test`. +### Continuous integration +When you make a pull request to the master branch, each change in your code gets checked by continuous integration (CI). The tests should make sure that your modifications don't break any other use of atlas. However, due to the requirements needed during the execution of atlas, it is not possible to test all functionalities via CI. If you add functionalities to atlas, they should also be tested. Have a look at the scripts in `.test`. diff --git a/README.md b/README.md index 445f8757..31be0d09 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,7 @@ https://metagenome-atlas.readthedocs.io/ > doi: [10.1186/s12859-020-03585-4](https://doi.org/10.1186/s12859-020-03585-4) -# Developpment/Extensions +# Development/Extensions Here are some ideas I work or want to work on when I have time. If you want to contribute or have some ideas let me know via a feature request issue. diff --git a/atlas/atlas.py b/atlas/atlas.py index 8afc7342..a1046a4d 100644 --- a/atlas/atlas.py +++ b/atlas/atlas.py @@ -32,7 +32,7 @@ def handle_max_mem(max_mem, profile): import psutil from math import floor - # calulate max system meory in GB (float!) + # calculate max system memory in GB (float!) max_system_memory = psutil.virtual_memory().total / (1024**3) if max_mem is None: @@ -146,7 +146,7 @@ def get_snakefile(file="workflow/Snakefile"): def run_workflow( workflow, working_dir, config_file, jobs, max_mem, profile, dryrun, snakemake_args ): - """Runs the ATLAS pipline + """Runs the ATLAS pipeline By default all steps are executed but a sub-workflow can be specified. Needs a config-file and expects to find a sample table in the working-directory. Both can be generated with 'atlas init' diff --git a/atlas/init/create_sample_table.py b/atlas/init/create_sample_table.py index 734c6cab..384ef6ba 100644 --- a/atlas/init/create_sample_table.py +++ b/atlas/init/create_sample_table.py @@ -45,7 +45,7 @@ def add_sample_to_table(sample_dict, sample_id, header, fastq): def infer_split_character(base_name): - "Infer if fastq filename uses '_R1' '_1' to seperate filenames" + "Infer if fastq filename uses '_R1' '_1' to separate filenames" global split_character, is_paired @@ -59,7 +59,7 @@ def infer_split_character(base_name): is_paired = True else: logger.warning( - f"Could't find '_R1'/'_R2' or '_1'/'_2' in your filename {base_name}. Assume you have single-end reads." + f"Couldn't find '_R1'/'_R2' or '_1'/'_2' in your filename {base_name}. Assuming you have single-end reads." 
) split_character = None is_paired = False @@ -145,7 +145,7 @@ def get_samples_from_fastq(path, fraction_split_character=split_character): try: _, subfolders, files = next(os.walk(path)) except StopIteration: - logger.error(f"Folder {path} seems to conain no files or subfolders.") + logger.error(f"Folder {path} seems to contain no files or subfolders.") exit(1) abs_path = os.path.abspath(path) @@ -213,7 +213,7 @@ def simplify_sample_names(sample_df): lambda row: "{0}-{1}".format(*row), axis=1 ) - # cannt find unique sample ids + # cannot find unique sample ids else: logger.warning( "Didn't found a way to simplify sample names. " diff --git a/atlas/init/get_SRA_runinfo.py b/atlas/init/get_SRA_runinfo.py index 4c02c77a..ed5e1495 100644 --- a/atlas/init/get_SRA_runinfo.py +++ b/atlas/init/get_SRA_runinfo.py @@ -276,7 +276,7 @@ def getInfoFromSRAIdentifier(identifier): # SAME, SAMD, and SAMN return SRAUtils.getInfoFromBioSampleAcc(identifier) elif identifier.startswith("PRJ"): - # DDBJ archvie bioproject prefix PRJNA SAMEA2796165 + # DDBJ archive bioproject prefix PRJNA SAMEA2796165 return SRAUtils.getInfoFromBioProjectAcc(identifier) else: raise Exception( @@ -294,7 +294,7 @@ def get_runtable_from_ids(identifiers, output_file="SRA_runtable.tsv", overwrite with open(output_file, "w") as outInfoFile: identifierCount = 0 - # don't show progress bar if only one elelment + # don't show progress bar if only one element if len(identifiers) > 1: identifier_with_progressbar = tqdm(identifiers) else: @@ -317,7 +317,7 @@ def parse_arguments_from_terminal(): - ## Comand line interface + ## Command line interface import argparse parser = argparse.ArgumentParser() diff --git a/atlas/init/parse_sra.py b/atlas/init/parse_sra.py index 0c2c41ff..f3642227 100644 --- a/atlas/init/parse_sra.py +++ b/atlas/init/parse_sra.py @@ -106,7 +106,7 @@ def filter_runinfo(RunTable, ignore_paired=False): Platforms = ", ".join(RunTable.Platform.unique()) logger.warning( - f"Your samples are sequenced on the folowing platform: {Platforms}\n" + f"Your samples are sequenced on the following platform: {Platforms}\n" "I don't know how well Atlas handles non-illumina reads.\n" "If you have long-reads, specify them via a the longreads, column in the sample table." ) @@ -139,14 +139,14 @@ def validate_merging_runinfo(path): if len(problematic_samples) > 0: logger.error( - f"You attemt to merge runs from the same sample. " - f"But for {len(problematic_samples)} samples the runs are sequenced with different platforms and should't be merged.\n" - f"Please resolve the abiguity in the table {path} and rerun the command.\n" + f"You attempt to merge runs from the same sample. " + f"But for {len(problematic_samples)} samples the runs are sequenced with different platforms and shouldn't be merged.\n" + f"Please resolve the ambiguity in the table {path} and rerun the command.\n" ) exit(1) - # Warn if samples are not identical for the follwing columns + # Warn if samples are not identical for the following columns Expected_same_values = ["Experiment", "Model", "LibraryName"] for key in Expected_same_values: problematic_samples = [] @@ -161,7 +161,7 @@ def validate_merging_runinfo(path): problematic_samples_list = " ".join(problematic_samples) logger.warning( - "You attemt to merge runs from the same sample. " + "You attempt to merge runs from the same sample. 
" f"But for {len(problematic_samples)} samples the runs have different {key}: {problematic_samples_list}\n" f"You can modify the table {path} and rerun the command.\n" ) diff --git a/atlas/sample_table.py b/atlas/sample_table.py index be7d5217..f73ed85f 100644 --- a/atlas/sample_table.py +++ b/atlas/sample_table.py @@ -127,7 +127,7 @@ def validate_bingroup_size(sampleTable, config, logger): if config["final_binner"] == "DASTool": binners = config["binner"] - logger.info(f"DASTool uses the folowing binners: {binners}") + logger.info(f"DASTool uses the following binners: {binners}") if ("vamb" in binners) or ("SemiBin" in binners): validate_bingroup_size_cobinning(sampleTable, logger) diff --git a/docs/usage/getting_started.rst b/docs/usage/getting_started.rst index 3c53f42b..20c8192b 100644 --- a/docs/usage/getting_started.rst +++ b/docs/usage/getting_started.rst @@ -15,7 +15,7 @@ Atlas is based on snakemake, which allows to run steps of the workflow in parall If you want to try atlas and have a linux computer (OSX may also work), you can use our `example data`_ for testing. -For real metagenomic data atlas should be run on a _linux_ sytem, with enough memory (min ~50GB but assembly usually requires 250GB). +For real metagenomic data atlas should be run on a _linux_ system, with enough memory (min ~50GB but assembly usually requires 250GB). @@ -213,9 +213,9 @@ Gives the output:: [Atlas] INFO: Downloading runinfo from SRA [Atlas] INFO: Start with 2979 runs from 2979 samples - [Atlas] INFO: Runs have the folowing values for LibrarySource: METAGENOMIC, METATRANSCRIPTOMIC + [Atlas] INFO: Runs have the following values for LibrarySource: METAGENOMIC, METATRANSCRIPTOMIC Select only runs LibrarySource == METAGENOMIC, Filtered out 762 runs - [Atlas] INFO: Runs have the folowing values for LibrarySelection: PCR, RT-PCR, RANDOM + [Atlas] INFO: Runs have the following values for LibrarySelection: PCR, RT-PCR, RANDOM Select only runs LibrarySelection == RANDOM, Filtered out 879 runs [Atlas] INFO: Selected 1338 runs from 1338 samples [Atlas] INFO: Write filtered runinfo to HMP2/RunInfo.tsv @@ -269,7 +269,7 @@ We recommend to use atlas on a :ref:`cluster` system, which can be set up in a v -h, --help Show this message and exit. -Execue Atlas +Execute Atlas ************ diff --git a/docs/usage/output.rst b/docs/usage/output.rst index e0f2237a..76b65a2f 100644 --- a/docs/usage/output.rst +++ b/docs/usage/output.rst @@ -94,7 +94,7 @@ Genomes atlas run genomes -Binning can predict several times the same genome from different samples. To remove this reduncancy we use DeRep to filter and de-replicate the genomes. By default the threshold is set to **97.5%**, which corresponds somewhat to the *sub-species level*. The best quality genome for each cluster is choosen as the representative for each cluster. The represenative MAG are then renamed and used for annotation and quantification. +Binning can predict the same genome several times from different samples. To remove this redundancy we use DeRep to filter and de-replicate the genomes. By default the threshold is set to **97.5%**, which corresponds somewhat to the *sub-species level*. The best quality genome for each cluster is chosen as the representative for each cluster. The representative MAGs are then renamed and used for annotation and quantification. The fasta sequence of the dereplicated and renamed genomes can be found in ``genomes/genomes`` and their quality estimation are in ``genomes/checkm/completeness.tsv``. 
@@ -138,7 +138,7 @@ All trees are properly rooted using the midpoint. The files can be found in ``ge **Functional annotation** -Sicne version 2.8, We use `DRAM `_ to annotate the genomes with Functional annotations, e.g. KEGG and CAZy as well as to **infere pathways**, or more specifically Kegg modules. +Since version 2.8, we use `DRAM `_ to annotate the genomes with Functional annotations, e.g. KEGG and CAZy, as well as to **infer pathways**, or more specifically KEGG modules. The Functional annotations for each genome can be found in ``genomes/annotations/dram/`` @@ -148,7 +148,7 @@ and are contain the following files: - ``annotations.tsv`` Table of all annotations - ``distil/metabolism_summary.xlsx`` Excel of the summary of all annotations - The tool alos produces a nice report in `distil/product.html`_. + The tool also produces a nice report in `distil/product.html`_. .. _distil/product.html: ../_static/dram_product.html @@ -290,7 +290,7 @@ Here is the R code to calculate the gene copies per million (analogous to transc Before version 2.15 the output of the counts were stored in a parquet file. -The parquet file can be opended easily with ``pandas.read_parquet`` or ``arrow::read_parquet```. +The parquet file can be opened easily with ``pandas.read_parquet`` or ``arrow::read_parquet``. However you need to load the full data into memory. .. code-block:: R diff --git a/setup.cfg b/setup.cfg index 0ef878c3..f4136fc0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -4,3 +4,10 @@ style = pep440 versionfile_source = atlas/_version.py versionfile_build = atlas/_version.py tag_prefix = v + + +[codespell] +check-filenames = +skip = .git,*.pdf,*.svg,versioneer.py,*.css,*.html +ignore-words-list = BRITE +check-hidden = \ No newline at end of file diff --git a/test/test_sra.sh b/test/test_sra.sh index 57dde706..f51dfa1e 100755 --- a/test/test_sra.sh +++ b/test/test_sra.sh @@ -67,7 +67,7 @@ atlas init-public SAMEA104416160 -w $WD atlas run None download_sra -w $WD $@ -## smal data +## small data echo "Download reads from small dataset for real test" diff --git a/workflow/Snakefile b/workflow/Snakefile index 2c12c552..e0f53441 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -15,11 +15,11 @@ import utils # add default config -# comand line adds user config +# command line adds user config configfile: os.path.join(workflow_folder, "..", "config", "default_config.yaml") -# add defualt values from python (TODO: replace this) +# add default values from python (TODO: replace this) from atlas.make_config import update_config as atlas_update_config config = atlas_update_config(config) @@ -227,7 +227,7 @@ for r in workflow.rules: # default r.resources["mem_mb"] = config["mem"] * 1000 - # add time if ot present. Simple jobs use simple time + # add time if not present. 
Simple jobs use simple time if "time_min" not in r.resources: r.resources["time_min"] = config["runtime"]["default"] * 60 diff --git a/workflow/report/assembly_report.py b/workflow/report/assembly_report.py index 61ecf740..71c966dd 100644 --- a/workflow/report/assembly_report.py +++ b/workflow/report/assembly_report.py @@ -29,7 +29,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): # Install exception handler sys.excepthook = handle_exception -#### Begining of scripts +#### Beginning of scripts from common_report import * diff --git a/workflow/report/bin_report.py b/workflow/report/bin_report.py index c2918139..693c4297 100644 --- a/workflow/report/bin_report.py +++ b/workflow/report/bin_report.py @@ -29,7 +29,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): # Install exception handler sys.excepthook = handle_exception -#### Begining of scripts +#### Beginning of scripts from common_report import * diff --git a/workflow/report/qc_report.py b/workflow/report/qc_report.py index dbc132cf..b7fb190d 100644 --- a/workflow/report/qc_report.py +++ b/workflow/report/qc_report.py @@ -27,7 +27,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): # Install exception handler sys.excepthook = handle_exception -#### Begining of scripts +#### Beginning of scripts from common_report import * @@ -158,7 +158,7 @@ def make_plots( Quality_QC_pe, Quality_QC_se = get_stats_from_zips(zipfiles_QC, samples) # Quality_raw_pe, Quality_raw_se = get_stats_from_zips(zipfiles_QC,samples) - # detrmine range of quality values and if paired + # determine range of quality values and if paired max_quality = 1 + np.nanmax((Quality_QC_pe.max().max(), Quality_QC_se.max().max())) quality_range = [min_quality, max_quality] diff --git a/workflow/rules/assemble.smk b/workflow/rules/assemble.smk index 7ccd8a93..27454243 100644 --- a/workflow/rules/assemble.smk +++ b/workflow/rules/assemble.smk @@ -90,7 +90,7 @@ else: # make symlink assert len(input) == len( output - ), "Input and ouput files have not same number, can not create symlinks for all." + ), "Input and output files do not have the same number, cannot create symlinks for all." 
for i in range(len(input)): os.symlink(os.path.abspath(input[i]), output[i]) @@ -170,7 +170,7 @@ rule error_correction: params: inputs=lambda wc, input: io_params_for_tadpole(input), outputs=lambda wc, output: io_params_for_tadpole(output, key="out"), - prefilter=2, # Ignore kmers with less than 2 occurance + prefilter=2, # Ignore kmers with fewer than 2 occurrences minprob=config["error_correction_minprob"], tossdepth=config["error_correction_minimum_kmer_depth"], tossjunk="t" if config["error_correction_remove_lowdepth"] else "f", @@ -656,7 +656,7 @@ rule pileup_contigs_sample: benchmark: "logs/benchmarks/assembly/calculate_coverage/pileup/{sample}.txt" log: - "{sample}/logs/assembly/calculate_coverage/pilup_final_contigs.log", # This log file is uesd for report + "{sample}/logs/assembly/calculate_coverage/pilup_final_contigs.log", # This log file is used for report conda: "%s/required_packages.yaml" % CONDAENV threads: config.get("threads", 1) diff --git a/workflow/rules/bin_quality.smk b/workflow/rules/bin_quality.smk index 036ce115..89403ade 100644 --- a/workflow/rules/bin_quality.smk +++ b/workflow/rules/bin_quality.smk @@ -233,7 +233,7 @@ rule get_bin_filenames: def get_list_of_files(dirs, pattern): fasta_files = [] - # searh for fasta files (.f*) in all bin folders + # search for fasta files (.f*) in all bin folders for dir in dirs: dir = Path(dir) fasta_files += list(dir.glob(pattern)) diff --git a/workflow/rules/binning.smk b/workflow/rules/binning.smk index c6d44997..5a3ffd5b 100644 --- a/workflow/rules/binning.smk +++ b/workflow/rules/binning.smk @@ -46,12 +46,12 @@ rule get_contig_coverage_from_bb: output: temp("{sample}/binning/coverage/{sample_reads}_coverage.txt"), run: - with open(input[0]) as fi, open(output[0], "w") as fo: + with open(input[0]) as fi, open(output[0], "w") as fout: # header next(fi) for line in fi: toks = line.strip().split("\t") - print(toks[0], toks[1], sep="\t", file=fo) + print(toks[0], toks[1], sep="\t", file=fout) rule combine_coverages: @@ -92,7 +92,7 @@ rule run_concoct: "{sample}/binning/concoct/intermediate_files/log.txt", conda: "%s/concoct.yaml" % CONDAENV - threads: 10 # concoct uses 10 threads by default, wit for update: https://github.com/BinPro/CONCOCT/issues/177 + threads: 10 # concoct uses 10 threads by default, wait for update: https://github.com/BinPro/CONCOCT/issues/177 resources: mem_mb=config["mem"] * 1000, shell: @@ -137,7 +137,7 @@ rule get_metabat_depth_file: "{sample}/binning/metabat/metabat.log", conda: "../envs/metabat.yaml" - threads: config["threads"] # multithreaded trough OMP_NUM_THREADS + threads: config["threads"] # multithreaded through OMP_NUM_THREADS resources: mem_mb=config["mem"] * 1000, params: diff --git a/workflow/rules/cobinning.smk b/workflow/rules/cobinning.smk index 31fdb753..57861f39 100644 --- a/workflow/rules/cobinning.smk +++ b/workflow/rules/cobinning.smk @@ -43,7 +43,7 @@ def get_filtered_contigs_of_bingroup(wildcards): if len(samples_of_group) < 5: raise ValueError( f"Bin group {wildcards.bingroup} has {len(samples_of_group)} less than 5 samples." - "For cobinning we reccomend at least 5 samples per bin group." + "For cobinning we recommend at least 5 samples per bin group." 
"Adapt the sample.tsv to set BinGroup of size [5- 1000]" ) @@ -68,7 +68,7 @@ rule combine_contigs: log: "logs/cobinning/{bingroup}/combine_contigs.log", params: - seperator=config["cobinning_separator"], + separator=config["cobinning_separator"], samples=get_samples_of_bingroup, threads: 1 run: @@ -80,7 +80,7 @@ rule combine_contigs: for line in fin: # if line is a header add sample name if line[0] == ord(">"): - line = f">{sample}{params.seperator}".encode() + line[1:] + line = f">{sample}{params.separator}".encode() + line[1:] # write each line to the combined file fout.write(line) @@ -176,7 +176,7 @@ rule summarize_bam_contig_depths: "logs/cobinning/{bingroup}/combine_coverage.log", conda: "../envs/metabat.yaml" - threads: config["threads"] # multithreaded trough OMP_NUM_THREADS + threads: config["threads"] # multithreaded through OMP_NUM_THREADS benchmark: "logs/benchmarks/cobinning/{bingroup}/summarize_bam_contig_depths.tsv" resources: diff --git a/workflow/rules/download.smk b/workflow/rules/download.smk index 8842041c..dfe7c1a5 100644 --- a/workflow/rules/download.smk +++ b/workflow/rules/download.smk @@ -2,7 +2,7 @@ import hashlib import os from pathlib import Path -# this values are incuded in the snakefile +# these values are included in the snakefile DBDIR = Path(config["database_dir"]).resolve() GUNCDIR = DBDIR / "gunc_database" diff --git a/workflow/rules/genecatalog.smk b/workflow/rules/genecatalog.smk index a0ee605c..13007049 100644 --- a/workflow/rules/genecatalog.smk +++ b/workflow/rules/genecatalog.smk @@ -376,7 +376,7 @@ rule combine_gene_coverages: # TODO: combine RPKM -# TODO: caluclate mapping rate from pileup mapping files +# TODO: calculate mapping rate from pileup mapping files # logs/Genecatalog/alignment/sample2_pileup.log # Reads: 1207217 # Mapped reads: 1071071 diff --git a/workflow/rules/patch.smk b/workflow/rules/patch.smk index 5698fe7c..ba7992da 100644 --- a/workflow/rules/patch.smk +++ b/workflow/rules/patch.smk @@ -2,7 +2,7 @@ localrules: copy_assembly, -# Rules that are usefull temporarily to update to new version of atlas +# Rules that are useful temporarily to update to a new version of atlas ruleorder: copy_assembly > finalize_contigs diff --git a/workflow/rules/predict_genes_of_genomes.py b/workflow/rules/predict_genes_of_genomes.py index 3e91f0bd..dcdd2fe2 100644 --- a/workflow/rules/predict_genes_of_genomes.py +++ b/workflow/rules/predict_genes_of_genomes.py @@ -31,7 +31,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): # Install exception handler sys.excepthook = handle_exception -#### Begining of scripts +#### Beginning of scripts # python 3.5 without f strings @@ -51,7 +51,7 @@ def predict_genes(genome, fasta, out_dir, log): shell('printf "{genome}:\n" > {log}'.format(genome=genome, log=log)) shell( - "prodigal -i {fasta} -o {gff} -d {fna} -a {faa} -p sinlge -c -m -f gff 2>> {log} ".format( + "prodigal -i {fasta} -o {gff} -d {fna} -a {faa} -p single -c -m -f gff 2>> {log} ".format( fasta=fasta, log=log, gff=gff, fna=fna, faa=faa ) ) diff --git a/workflow/rules/sample_table.smk b/workflow/rules/sample_table.smk index 6c6adb56..1c5f7f93 100644 --- a/workflow/rules/sample_table.smk +++ b/workflow/rules/sample_table.smk @@ -96,7 +96,7 @@ else: if (len(colum_headers_raw) == 0) and (len(colum_headers_QC) == 0): raise IOError( "Either raw reas or QC reads need to be in the sample table. 
" - "I din't find any columnns with 'Reads_raw_' or 'Reads_QC_' " + "I didn't find any columns with 'Reads_raw_' or 'Reads_QC_' " ) diff --git a/workflow/scripts/cluster_species.py b/workflow/scripts/cluster_species.py index e973d4cc..a1a3d2c2 100644 --- a/workflow/scripts/cluster_species.py +++ b/workflow/scripts/cluster_species.py @@ -29,7 +29,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): # Install exception handler sys.excepthook = handle_exception -#### Begining of scripts +#### Beginning of scripts import pandas as pd @@ -93,7 +93,7 @@ def get_float(value): n_pre_clusters = nx.connected.number_connected_components(G) -logging.info(f"Found {n_pre_clusters} pre-clusters, itterate over them.") +logging.info(f"Found {n_pre_clusters} pre-clusters, iterating over them.") logging.debug(f"Cluster with threshold {threshold} and {linkage_method}-linkage method") for i, cc in enumerate(nx.connected_components(G)): logging.info(f"Precluster {i+1}/{n_pre_clusters} with {len(cc)} genomes") @@ -165,7 +165,7 @@ def get_float(value): # drop low quality genomes - logging.info("Drop low quality genomes acording to filtercriteria") + logging.info("Drop low quality genomes according to filter criteria") try: filter_criteria = snakemake.config["genome_filter_criteria"] @@ -229,7 +229,7 @@ def get_float(value): n_species = mag2Species.SpeciesNr.unique().shape[0] logging.info(f"Identified {n_species } species in total") -# create propper species names +# create proper species names n_leading_zeros = len(str(mag2Species.SpeciesNr.max())) format_int = "sp{:0" + str(n_leading_zeros) + "d}" mag2Species["Species"] = mag2Species.SpeciesNr.apply(format_int.format) @@ -239,13 +239,13 @@ def get_float(value): logging.info("Define Quality score defined as Completeness - 5x Contamination") -# recalulate quality score as some completeness might be recalibrated. +# recalculate quality score as some completeness might be recalibrated. 
Q.eval("Quality_score = Completeness - 5* Contamination", inplace=True) quality_score = Q.Quality_score assert ( not quality_score.isnull().any() -), "I have NA quality values for thq quality score, it seems not all of the values defined in the quality_score_formula are presentfor all entries in tables/Genome_quality.tsv " +), "I have NA quality values for the quality score, it seems not all of the values defined in the quality_score_formula are present for all entries in tables/Genome_quality.tsv " # select representative diff --git a/workflow/scripts/combine_busco.py b/workflow/scripts/combine_busco.py index eea83ad2..4e063884 100644 --- a/workflow/scripts/combine_busco.py +++ b/workflow/scripts/combine_busco.py @@ -29,7 +29,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): # Install exception handler sys.excepthook = handle_exception -#### Begining of scripts +#### Beginning of scripts import pandas as pd from utils.parsers import read_busco_output diff --git a/workflow/scripts/combine_checkm.py b/workflow/scripts/combine_checkm.py index dc3d0a16..63ed27df 100644 --- a/workflow/scripts/combine_checkm.py +++ b/workflow/scripts/combine_checkm.py @@ -29,7 +29,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): # Install exception handler sys.excepthook = handle_exception -#### Begining of scripts +#### Beginning of scripts import pandas as pd from utils.parsers import read_checkm_output diff --git a/workflow/scripts/combine_checkm2.py b/workflow/scripts/combine_checkm2.py index 69f85155..71813ad3 100644 --- a/workflow/scripts/combine_checkm2.py +++ b/workflow/scripts/combine_checkm2.py @@ -29,7 +29,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): # Install exception handler sys.excepthook = handle_exception -#### Begining of scripts +#### Beginning of scripts import pandas as pd from utils.parsers import read_checkm2_output diff --git a/workflow/scripts/combine_gene_coverages.py b/workflow/scripts/combine_gene_coverages.py index 1c9bdf14..2705d876 100644 --- a/workflow/scripts/combine_gene_coverages.py +++ b/workflow/scripts/combine_gene_coverages.py @@ -28,7 +28,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): # Install exception handler sys.excepthook = handle_exception -#### Begining of script +#### Beginning of script import numpy as np import pandas as pd import gc, os @@ -94,7 +94,7 @@ def measure_memory(write_log_entry=True): "data", shape=gene_matrix_shape, fillvalue=0, compression="gzip" ) - # add Smaple names attribute + # add Sample names attribute sample_names = np.array(list(snakemake.params.samples)).astype("S") combined_cov.attrs["sample_names"] = sample_names combined_counts.attrs["sample_names"] = sample_names diff --git a/workflow/scripts/combine_taxonomy.py b/workflow/scripts/combine_taxonomy.py index c873205c..7647ec78 100644 --- a/workflow/scripts/combine_taxonomy.py +++ b/workflow/scripts/combine_taxonomy.py @@ -28,7 +28,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): # Install exception handler sys.excepthook = handle_exception -#### Begining of scripts +#### Beginning of scripts import pandas as pd import numpy as np diff --git a/workflow/scripts/filter_genomes.py b/workflow/scripts/filter_genomes.py index 855186b9..f1cb2dc7 100644 --- a/workflow/scripts/filter_genomes.py +++ b/workflow/scripts/filter_genomes.py @@ -74,7 +74,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): if Q.shape[0] == 0: logging.error( - f"No bins passed filtering criteria! Bad luck!. 
You might want to tweek the filtering criteria. Also check the {snakemake.input.quality}" + f"No bins passed filtering criteria! Bad luck! You might want to tweak the filtering criteria. Also check the {snakemake.input.quality}" ) exit(1) diff --git a/workflow/scripts/gene2genome.py b/workflow/scripts/gene2genome.py index e65bb8f7..995f3279 100644 --- a/workflow/scripts/gene2genome.py +++ b/workflow/scripts/gene2genome.py @@ -28,7 +28,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): # Install exception handler sys.excepthook = handle_exception -#### Begining of script +#### Beginning of script import pandas as pd from utils import gene_scripts diff --git a/workflow/scripts/get_fasta_of_bins.py b/workflow/scripts/get_fasta_of_bins.py index ff8c65d7..1d99c478 100644 --- a/workflow/scripts/get_fasta_of_bins.py +++ b/workflow/scripts/get_fasta_of_bins.py @@ -46,7 +46,7 @@ def get_fasta_of_bins(cluster_attribution, contigs_file, out_folder): Creates individual fasta files for each bin using the contigs fasta and the cluster attribution. input: - - cluster attribution file: tab seperated file of "contig_fasta_header bin" + - cluster attribution file: tab separated file of "contig_fasta_header bin" - contigs: fasta file of contigs - out_prefix: output_prefix for bin fastas {out_folder}/{binid}.fasta """ diff --git a/workflow/scripts/get_read_stats.py b/workflow/scripts/get_read_stats.py index 5fc55a64..4678e9e5 100644 --- a/workflow/scripts/get_read_stats.py +++ b/workflow/scripts/get_read_stats.py @@ -30,7 +30,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): sys.excepthook = handle_exception -# begining of script +# beginning of script import datetime import shutil diff --git a/workflow/scripts/parse_vamb.py b/workflow/scripts/parse_vamb.py index 01f7cdb8..759b19fd 100644 --- a/workflow/scripts/parse_vamb.py +++ b/workflow/scripts/parse_vamb.py @@ -117,7 +117,7 @@ def handle_exception(exc_type, exc_value, exc_traceback): clusters.loc[~clusters.Large_enough, "SampleBin"] = "" -logging.info(f"Write reformated table to {output_culsters}") +logging.info(f"Write reformatted table to {output_culsters}") clusters.to_csv(output_culsters, sep="\t", index=False) # filter for following diff --git a/workflow/scripts/utils/genome_dist.py b/workflow/scripts/utils/genome_dist.py index 676872d9..212ba325 100644 --- a/workflow/scripts/utils/genome_dist.py +++ b/workflow/scripts/utils/genome_dist.py @@ -94,7 +94,7 @@ def load_bindash(dist_file, simplify_names=True): ['Genome1','Genome2','Distance','Pvalue','Fraction','Nmapped','Ntotal','ANI'] in header. - Bindash tables are not necessarily simetrical. + Bindash tables are not necessarily symmetrical. """ F = load_ani_table_( dist_file, ["Distance", "Pvalue", "Fraction"], simplify_names=simplify_names ) @@ -147,10 +147,10 @@ def pairewise2matrix(M, fillna=np.nan): """ This functions turns a pairewise genome ANI table [genome1, genome2, column...] In to a matrix [genome1 genome2] of the values of column. - When ANI values are symetrical (with minimal error), + When ANI values are symmetrical (with minimal error), usually only one halve of NxN possibilities values are calculated. 
- During the process missing values are inputed with 0 + During the process missing values are imputed with 0 Diagonal values are set to 1 @@ -195,7 +195,7 @@ def group_species_linkage(M, threshold=0.95, fillna=None, linkage_method="averag M is a series of ANI """ assert type(M) == pd.Series - verify_expected_range(threshold, 0.3, 1, "clustering thrshold") + verify_expected_range(threshold, 0.3, 1, "clustering threshold") verify_expected_range(M.max(), 0.05, 1, "ANI max") verify_expected_range(M.min(), 0.05, 1, "ANI min") diff --git a/workflow/scripts/utils/genome_stats.py b/workflow/scripts/utils/genome_stats.py index 0775df12..38f66157 100644 --- a/workflow/scripts/utils/genome_stats.py +++ b/workflow/scripts/utils/genome_stats.py @@ -60,14 +60,14 @@ def genome_stats(fasta_file, number_of_n_for_split=10): faiter = (x[1] for x in groupby(fasta, lambda line: line[0] == ">")) for record in faiter: - # reccord contains header + # record contains header ## join sequence lines sequence = "".join(s.strip() for s in faiter.__next__()) sequence = sequence.upper() verify_dna(sequence, is_upper=True) - # count ambigous bases + # count ambiguous bases ambigious_bases += sequence.count("N") # get set of scaffold lengths diff --git a/workflow/scripts/utils/io.py b/workflow/scripts/utils/io.py index 57640a99..73f2b852 100644 --- a/workflow/scripts/utils/io.py +++ b/workflow/scripts/utils/io.py @@ -60,7 +60,7 @@ def cat_files(files, outfilename, gzip=False): def convert_percentages(df): - """Convet all columns with strings and % at the end to percentages""" + """Convert all columns with strings and % at the end to percentages""" for col in df.columns: if df.dtypes[col] == "object": if df[col].iloc[0].endswith("%"): diff --git a/workflow/scripts/utils/parsers_bbmap.py b/workflow/scripts/utils/parsers_bbmap.py index 41338ae9..37c1af38 100644 --- a/workflow/scripts/utils/parsers_bbmap.py +++ b/workflow/scripts/utils/parsers_bbmap.py @@ -124,9 +124,9 @@ def read_bbsplit_bincov(bbsplit_bincov_file): # split first index `genome$contig` in two index = pd.Series(binCov.index.levels[0], index=binCov.index.levels[0]) - splitted = index.str.split("$", expand=True) - splitted.columns = ["Genome", "Contig"] - new_index = splitted.loc[binCov.index.get_level_values(0)] + Split = index.str.split("$", expand=True) + Split.columns = ["Genome", "Contig"] + new_index = Split.loc[binCov.index.get_level_values(0)] new_index["Position"] = binCov.index.get_level_values(1).values binCov.index = pd.MultiIndex.from_frame(new_index) diff --git a/workflow/scripts/utils/taxonomy.py b/workflow/scripts/utils/taxonomy.py index 9f661197..bfbd3ba1 100644 --- a/workflow/scripts/utils/taxonomy.py +++ b/workflow/scripts/utils/taxonomy.py @@ -16,7 +16,7 @@ def tax2table(Taxonomy_Series, split_character=";", remove_prefix=False): # drop missing values if Taxonomy_Series.isnull().any(): warnings.warn( - "Some samples have no taxonomy asigned. Samples:\n" + "Some samples have no taxonomy assigned. Samples:\n" + ", ".join(Taxonomy_Series.index[Taxonomy_Series.isnull()].astype(str)) )