2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -88,7 +88,7 @@ jobs:
source activate ./atlasenv
atlas init --db-dir $DATABASE_DIR --threads 1 --working-dir test/Getenvs test/reads/empty
- run:
-name: install environements
+name: install environments
command: |
source activate ./atlasenv
atlas run all --working-dir test/Getenvs --conda-create-envs-only --cores all
4 changes: 0 additions & 4 deletions .github/workflows/codespell.yml
@@ -21,7 +21,3 @@ jobs:
uses: actions/checkout@v4
- name: Codespell
uses: codespell-project/actions-codespell@v2
-with:
-check_filenames: true
-skip: ".git,*.pdf,*.svg,versioneer.py,*.css,*.html"
-check_hidden: true
6 changes: 3 additions & 3 deletions .github/workflows/python-package-conda.yml
@@ -106,7 +106,7 @@ jobs:
path: databases
key: conda-envs-assembly

-# - name: upack conda envs
+# - name: unpack conda envs
# if: steps.get-envs.outputs.cache-hit != 'true'
# run: tar -xzf assembly_conda_envs.tar.gz

@@ -198,7 +198,7 @@ jobs:
path: wd
key: assembly-working-dir

-- name: dryrun assembly shold need nothing to be done
+- name: dryrun assembly should need nothing to be done
run: |
ls -l wd
ls -l databases/conda_envs
@@ -264,7 +264,7 @@ jobs:
fail-on-cache-miss: true
key: assembly-working-dir

-- name: dryrun assembly shold need nothing to be done
+- name: dryrun assembly should need nothing to be done
run: |
ls -l wd
ls -l databases
4 changes: 2 additions & 2 deletions CHANGELOG.md
@@ -20,12 +20,12 @@ Fix error with downloading DRAM. Update to DRAM v1.5

- Qc reads, assembly are now written in the sample.tsv from the start. This should fix errors of partial writing to the sample.tsv https://github.com/metagenome-atlas/atlas/issues/695
- It also allows you to add external assemblies.
-- singletons reads are no longer used trough the pipeline.
+- singleton reads are no longer used through the pipeline.
- This changes the default paths for raw reads and assemblies.
assembly are now in `Assembly/fasta/{sample}.fasta`
reads: `QC/reads/{sample}_{fraction}.fastq.gz`
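  As an illustration of the new layout (a Python sketch; the fraction names `R1`/`R2` are assumptions, not stated in this changelog):

      sample = "S1"
      assembly_path = f"Assembly/fasta/{sample}.fasta"
      read_paths = [f"QC/reads/{sample}_{fraction}.fastq.gz" for fraction in ("R1", "R2")]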

-**Seemless update**: If you update atlas and continue on an old project. Your old files will be copied.
+**Seamless update**: If you update atlas and continue on an old project, your old files will be copied.
Or the path defined in the sample.tsv will be used.


6 changes: 3 additions & 3 deletions CONTRIBUTING.md
@@ -24,7 +24,7 @@ I hope we can help you...

You can ask the maintainers to be added to the repository and work from a *branch* of the main atlas repository or you can work from a fork of the atlas repository.

-Follow the [steps](https://github.com/metagenome-atlas/atlas#install-the-development-version-from-github) to set up the developpment version of atlas. This allows you to work with the code you have in the git repository.
+Follow the [steps](https://github.com/metagenome-atlas/atlas#install-the-development-version-from-github) to set up the development version of atlas. This allows you to work with the code you have in the git repository.

## Test the code
### Locally
@@ -36,8 +36,8 @@ When you created a new rule and you want to test the output of this rule `my_tar



-### Continous integration
-When you make a pull request to the master branch. Each change in your code get's checked by continous integration (CI). The tests should make shure that your modification don't break any other use of atlas. However due to the requeirements needed during the execution of atlas, it is not possible to test all functionalities via CI. If you add functionalities to atlas, they should also be tested. Have a look at the scripts in `.test`.
+### Continuous integration
+When you make a pull request to the master branch, each change in your code gets checked by continuous integration (CI). The tests should make sure that your modifications don't break any other use of atlas. However, due to the requirements needed during the execution of atlas, it is not possible to test all functionalities via CI. If you add functionalities to atlas, they should also be tested. Have a look at the scripts in `.test`.



2 changes: 1 addition & 1 deletion README.md
@@ -38,7 +38,7 @@ https://metagenome-atlas.readthedocs.io/
> doi: [10.1186/s12859-020-03585-4](https://doi.org/10.1186/s12859-020-03585-4)


-# Developpment/Extensions
+# Development/Extensions

Here are some ideas I am working on or want to work on when I have time. If you want to contribute or have some ideas, let me know via a feature request issue.

4 changes: 2 additions & 2 deletions atlas/atlas.py
@@ -32,7 +32,7 @@ def handle_max_mem(max_mem, profile):
import psutil
from math import floor

-# calulate max system meory in GB (float!)
+# calculate max system memory in GB (float!)
max_system_memory = psutil.virtual_memory().total / (1024**3)

if max_mem is None:
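For context, a minimal standalone sketch of the memory calculation in this hunk (psutil reports bytes, hence the division):

    import psutil

    # total system memory in GB as a float, used when max_mem is not given
    max_system_memory = psutil.virtual_memory().total / (1024**3)
    print(f"{max_system_memory:.1f} GB detected")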
@@ -146,7 +146,7 @@ def get_snakefile(file="workflow/Snakefile"):
def run_workflow(
workflow, working_dir, config_file, jobs, max_mem, profile, dryrun, snakemake_args
):
"""Runs the ATLAS pipline
"""Runs the ATLAS pipeline

By default all steps are executed but a sub-workflow can be specified.
Needs a config-file and expects to find a sample table in the working-directory. Both can be generated with 'atlas init'
8 changes: 4 additions & 4 deletions atlas/init/create_sample_table.py
@@ -45,7 +45,7 @@ def add_sample_to_table(sample_dict, sample_id, header, fastq):


def infer_split_character(base_name):
"Infer if fastq filename uses '_R1' '_1' to seperate filenames"
"Infer if fastq filename uses '_R1' '_1' to separate filenames"

global split_character, is_paired

@@ -59,7 +59,7 @@ def infer_split_character(base_name):
is_paired = True
else:
logger.warning(
f"Could't find '_R1'/'_R2' or '_1'/'_2' in your filename {base_name}. Assume you have single-end reads."
f"Couldn't find '_R1'/'_R2' or '_1'/'_2' in your filename {base_name}. Assume you have single-end reads."
)
split_character = None
is_paired = False
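A self-contained sketch of the inference this function performs (the helper name is illustrative; the real function also sets the module-level globals and logs the warning above):

    def infer_fraction_separator(base_name):
        # paired-end if the filename carries an _R1/_R2 or _1/_2 token
        if ("_R1" in base_name) or ("_R2" in base_name):
            return "_R", True
        if ("_1" in base_name) or ("_2" in base_name):
            return "_", True
        return None, False  # fall back to single-end

    assert infer_fraction_separator("S1_R1.fastq.gz") == ("_R", True)
    assert infer_fraction_separator("S1_1.fastq.gz") == ("_", True)
    assert infer_fraction_separator("S1.fastq.gz") == (None, False)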
@@ -145,7 +145,7 @@ def get_samples_from_fastq(path, fraction_split_character=split_character):
try:
_, subfolders, files = next(os.walk(path))
except StopIteration:
logger.error(f"Folder {path} seems to conain no files or subfolders.")
logger.error(f"Folder {path} seems to contain no files or subfolders.")
exit(1)

abs_path = os.path.abspath(path)
@@ -213,7 +213,7 @@ def simplify_sample_names(sample_df):
lambda row: "{0}-{1}".format(*row), axis=1
)

-# cannt find unique sample ids
+# cannot find unique sample ids
else:
logger.warning(
"Didn't found a way to simplify sample names. "
6 changes: 3 additions & 3 deletions atlas/init/get_SRA_runinfo.py
@@ -276,7 +276,7 @@ def getInfoFromSRAIdentifier(identifier):
# SAME, SAMD, and SAMN
return SRAUtils.getInfoFromBioSampleAcc(identifier)
elif identifier.startswith("PRJ"):
-# DDBJ archvie bioproject prefix PRJNA SAMEA2796165
+# DDBJ archive bioproject prefix PRJNA SAMEA2796165
return SRAUtils.getInfoFromBioProjectAcc(identifier)
else:
raise Exception(
@@ -294,7 +294,7 @@ def get_runtable_from_ids(identifiers, output_file="SRA_runtable.tsv", overwrite
with open(output_file, "w") as outInfoFile:
identifierCount = 0

-# don't show progress bar if only one elelment
+# don't show progress bar if only one element
if len(identifiers) > 1:
identifier_with_progressbar = tqdm(identifiers)
else:
@@ -317,7 +317,7 @@


def parse_arguments_from_terminal():
-## Comand line interface
+## Command line interface
import argparse

parser = argparse.ArgumentParser()
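The dispatch above keys on the accession prefix; a minimal standalone sketch (only the prefixes visible in this diff, labels illustrative):

    def classify_sra_accession(identifier: str) -> str:
        if identifier.startswith("SAM"):  # SAME, SAMD, SAMN biosample accessions
            return "biosample"
        if identifier.startswith("PRJ"):  # PRJ* bioproject accessions
            return "bioproject"
        raise ValueError(f"Could not infer accession type of {identifier}")

    assert classify_sra_accession("SAMEA2796165") == "biosample"
    assert classify_sra_accession("PRJEB1787") == "bioproject"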
12 changes: 6 additions & 6 deletions atlas/init/parse_sra.py
@@ -106,7 +106,7 @@ def filter_runinfo(RunTable, ignore_paired=False):
Platforms = ", ".join(RunTable.Platform.unique())

logger.warning(
f"Your samples are sequenced on the folowing platform: {Platforms}\n"
f"Your samples are sequenced on the following platform: {Platforms}\n"
"I don't know how well Atlas handles non-illumina reads.\n"
"If you have long-reads, specify them via a the longreads, column in the sample table."
)
@@ -139,14 +139,14 @@ def validate_merging_runinfo(path):

if len(problematic_samples) > 0:
logger.error(
f"You attemt to merge runs from the same sample. "
f"But for {len(problematic_samples)} samples the runs are sequenced with different platforms and should't be merged.\n"
f"Please resolve the abiguity in the table {path} and rerun the command.\n"
f"You attempt to merge runs from the same sample. "
f"But for {len(problematic_samples)} samples the runs are sequenced with different platforms and shouldn't be merged.\n"
f"Please resolve the ambiguity in the table {path} and rerun the command.\n"
)

exit(1)

-# Warn if samples are not identical for the follwing columns
+# Warn if samples are not identical for the following columns
Expected_same_values = ["Experiment", "Model", "LibraryName"]
for key in Expected_same_values:
problematic_samples = []
@@ -161,7 +161,7 @@ def validate_merging_runinfo(path):
problematic_samples_list = " ".join(problematic_samples)

logger.warning(
"You attemt to merge runs from the same sample. "
"You attempt to merge runs from the same sample. "
f"But for {len(problematic_samples)} samples the runs have different {key}: {problematic_samples_list}\n"
f"You can modify the table {path} and rerun the command.\n"
)
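The check above, sketched standalone with pandas (the grouping column name is an assumption; the strings in this diff only show that runs are grouped per sample):

    import pandas as pd

    def inconsistent_samples(run_table: pd.DataFrame, key: str) -> list:
        # samples whose runs disagree on `key` and therefore should not be merged
        return [
            sample
            for sample, runs in run_table.groupby("BioSample")
            if runs[key].nunique() > 1
        ]

    runs = pd.DataFrame(
        {
            "BioSample": ["S1", "S1", "S2"],
            "Platform": ["ILLUMINA", "OXFORD_NANOPORE", "ILLUMINA"],
        }
    )
    print(inconsistent_samples(runs, "Platform"))  # ['S1']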
2 changes: 1 addition & 1 deletion atlas/sample_table.py
@@ -127,7 +127,7 @@ def validate_bingroup_size(sampleTable, config, logger):
if config["final_binner"] == "DASTool":
binners = config["binner"]

logger.info(f"DASTool uses the folowing binners: {binners}")
logger.info(f"DASTool uses the following binners: {binners}")

if ("vamb" in binners) or ("SemiBin" in binners):
validate_bingroup_size_cobinning(sampleTable, logger)
8 changes: 4 additions & 4 deletions docs/usage/getting_started.rst
@@ -15,7 +15,7 @@ Atlas is based on snakemake, which allows to run steps of the workflow in parall

If you want to try atlas and have a linux computer (OSX may also work), you can use our `example data`_ for testing.

-For real metagenomic data atlas should be run on a _linux_ sytem, with enough memory (min ~50GB but assembly usually requires 250GB).
+For real metagenomic data atlas should be run on a _linux_ system, with enough memory (min ~50GB but assembly usually requires 250GB).



@@ -213,9 +213,9 @@ Gives the output::

[Atlas] INFO: Downloading runinfo from SRA
[Atlas] INFO: Start with 2979 runs from 2979 samples
-[Atlas] INFO: Runs have the folowing values for LibrarySource: METAGENOMIC, METATRANSCRIPTOMIC
+[Atlas] INFO: Runs have the following values for LibrarySource: METAGENOMIC, METATRANSCRIPTOMIC
Select only runs LibrarySource == METAGENOMIC, Filtered out 762 runs
-[Atlas] INFO: Runs have the folowing values for LibrarySelection: PCR, RT-PCR, RANDOM
+[Atlas] INFO: Runs have the following values for LibrarySelection: PCR, RT-PCR, RANDOM
Select only runs LibrarySelection == RANDOM, Filtered out 879 runs
[Atlas] INFO: Selected 1338 runs from 1338 samples
[Atlas] INFO: Write filtered runinfo to HMP2/RunInfo.tsv
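The same filtering can be reproduced on the written run table; a sketch with pandas, assuming the column names shown in the log above:

.. code-block:: python

    import pandas as pd

    run_info = pd.read_csv("HMP2/RunInfo.tsv", sep="\t")
    run_info = run_info[run_info.LibrarySource == "METAGENOMIC"]
    run_info = run_info[run_info.LibrarySelection == "RANDOM"]
    print(f"Selected {len(run_info)} runs")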
@@ -269,7 +269,7 @@ We recommend to use atlas on a :ref:`cluster` system, which can be set up in a v
-h, --help Show this message and exit.


-Execue Atlas
+Execute Atlas
************


8 changes: 4 additions & 4 deletions docs/usage/output.rst
@@ -94,7 +94,7 @@ Genomes
atlas run genomes


-Binning can predict several times the same genome from different samples. To remove this reduncancy we use DeRep to filter and de-replicate the genomes. By default the threshold is set to **97.5%**, which corresponds somewhat to the *sub-species level*. The best quality genome for each cluster is choosen as the representative for each cluster. The represenative MAG are then renamed and used for annotation and quantification.
+Binning can predict the same genome several times from different samples. To remove this redundancy we use DeRep to filter and de-replicate the genomes. By default the threshold is set to **97.5%**, which corresponds somewhat to the *sub-species level*. The best quality genome is chosen as the representative for each cluster. The representative MAGs are then renamed and used for annotation and quantification.

The fasta sequence of the dereplicated and renamed genomes can be found in ``genomes/genomes``
and their quality estimation are in ``genomes/checkm/completeness.tsv``.
@@ -138,7 +138,7 @@ All trees are properly rooted using the midpoint. The files can be found in ``ge

**Functional annotation**

-Sicne version 2.8, We use `DRAM <https://github.com/shafferm/DRAM>`_ to annotate the genomes with Functional annotations, e.g. KEGG and CAZy as well as to **infere pathways**, or more specifically Kegg modules.
+Since version 2.8, we use `DRAM <https://github.com/shafferm/DRAM>`_ to annotate the genomes with functional annotations, e.g. KEGG and CAZy, as well as to **infer pathways**, or more specifically KEGG modules.

The Functional annotations for each genome can be found in ``genomes/annotations/dram/``

@@ -148,7 +148,7 @@ and are contain the following files:
- ``annotations.tsv`` Table of all annotations
- ``distil/metabolism_summary.xlsx`` Excel of the summary of all annotations

-The tool alos produces a nice report in `distil/product.html`_.
+The tool also produces a nice report in `distil/product.html`_.

.. _distil/product.html: ../_static/dram_product.html

@@ -290,7 +290,7 @@ Here is the R code to calculate the gene copies per million (analogous to transc


Before version 2.15 the output of the counts was stored in a parquet file.
-The parquet file can be opended easily with ``pandas.read_parquet`` or ``arrow::read_parquet```.
+The parquet file can be opened easily with ``pandas.read_parquet`` or ``arrow::read_parquet``.
However, you need to load the full data into memory.
However you need to load the full data into memory.

.. code-block:: R
7 changes: 7 additions & 0 deletions setup.cfg
@@ -4,3 +4,10 @@ style = pep440
versionfile_source = atlas/_version.py
versionfile_build = atlas/_version.py
tag_prefix = v


+[codespell]
+check-filenames =
+skip = .git,*.pdf,*.svg,versioneer.py,*.css,*.html
+ignore-words-list = BRITE
+check-hidden =
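These keys mirror codespell's command-line flags; a hedged sketch of an equivalent invocation from Python (illustrative only, not part of this PR):

    import subprocess

    # empty values in setup.cfg act as boolean switches (--check-filenames, --check-hidden)
    subprocess.run(
        [
            "codespell",
            "--check-filenames",
            "--check-hidden",
            "--skip", ".git,*.pdf,*.svg,versioneer.py,*.css,*.html",
            "--ignore-words-list", "BRITE",
        ],
        check=False,  # codespell exits non-zero when it finds misspellings
    )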
2 changes: 1 addition & 1 deletion test/test_sra.sh
@@ -67,7 +67,7 @@ atlas init-public SAMEA104416160 -w $WD

atlas run None download_sra -w $WD $@

-## smal data
+## small data


echo "Download reads from small dataset for real test"
6 changes: 3 additions & 3 deletions workflow/Snakefile
@@ -15,11 +15,11 @@ import utils


# add default config
-# comand line adds user config
+# command line adds user config
configfile: os.path.join(workflow_folder, "..", "config", "default_config.yaml")


-# add defualt values from python (TODO: replace this)
+# add default values from python (TODO: replace this)
from atlas.make_config import update_config as atlas_update_config

config = atlas_update_config(config)
@@ -227,7 +227,7 @@ for r in workflow.rules:
# default
r.resources["mem_mb"] = config["mem"] * 1000

-# add time if ot present. Simple jobs use simple time
+# add time if not present. Simple jobs use simple time

if "time_min" not in r.resources:
r.resources["time_min"] = config["runtime"]["default"] * 60
2 changes: 1 addition & 1 deletion workflow/report/assembly_report.py
@@ -29,7 +29,7 @@ def handle_exception(exc_type, exc_value, exc_traceback):
# Install exception handler
sys.excepthook = handle_exception

-#### Begining of scripts
+#### Beginning of scripts

from common_report import *

2 changes: 1 addition & 1 deletion workflow/report/bin_report.py
@@ -29,7 +29,7 @@ def handle_exception(exc_type, exc_value, exc_traceback):
# Install exception handler
sys.excepthook = handle_exception

-#### Begining of scripts
+#### Beginning of scripts


from common_report import *
4 changes: 2 additions & 2 deletions workflow/report/qc_report.py
@@ -27,7 +27,7 @@ def handle_exception(exc_type, exc_value, exc_traceback):
# Install exception handler
sys.excepthook = handle_exception

-#### Begining of scripts
+#### Beginning of scripts

from common_report import *

@@ -158,7 +158,7 @@ def make_plots(
Quality_QC_pe, Quality_QC_se = get_stats_from_zips(zipfiles_QC, samples)
# Quality_raw_pe, Quality_raw_se = get_stats_from_zips(zipfiles_QC,samples)

-# detrmine range of quality values and if paired
+# determine range of quality values and if paired
max_quality = 1 + np.nanmax((Quality_QC_pe.max().max(), Quality_QC_se.max().max()))
quality_range = [min_quality, max_quality]

6 changes: 3 additions & 3 deletions workflow/rules/assemble.smk
@@ -90,7 +90,7 @@ else:
# make symlink
assert len(input) == len(
output
), "Input and ouput files have not same number, can not create symlinks for all."
), "Input and output files have not same number, can not create symlinks for all."
for i in range(len(input)):
os.symlink(os.path.abspath(input[i]), output[i])

@@ -170,7 +170,7 @@ rule error_correction:
params:
inputs=lambda wc, input: io_params_for_tadpole(input),
outputs=lambda wc, output: io_params_for_tadpole(output, key="out"),
-prefilter=2, # Ignore kmers with less than 2 occurance
+prefilter=2, # Ignore kmers with fewer than 2 occurrences
minprob=config["error_correction_minprob"],
tossdepth=config["error_correction_minimum_kmer_depth"],
tossjunk="t" if config["error_correction_remove_lowdepth"] else "f",
@@ -656,7 +656,7 @@ rule pileup_contigs_sample:
benchmark:
"logs/benchmarks/assembly/calculate_coverage/pileup/{sample}.txt"
log:
"{sample}/logs/assembly/calculate_coverage/pilup_final_contigs.log", # This log file is uesd for report
"{sample}/logs/assembly/calculate_coverage/pilup_final_contigs.log", # This log file is used for report
conda:
"%s/required_packages.yaml" % CONDAENV
threads: config.get("threads", 1)
2 changes: 1 addition & 1 deletion workflow/rules/bin_quality.smk
@@ -233,7 +233,7 @@ rule get_bin_filenames:
def get_list_of_files(dirs, pattern):
fasta_files = []

-# searh for fasta files (.f*) in all bin folders
+# search for fasta files (.f*) in all bin folders
for dir in dirs:
dir = Path(dir)
fasta_files += list(dir.glob(pattern))
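A standalone version of this helper for reference (default pattern as in the comment above; it matches .fa, .fasta, .fna and similar extensions):

    from pathlib import Path

    def get_list_of_files(dirs, pattern="*.f*"):
        # collect fasta files from every bin folder
        fasta_files = []
        for d in dirs:
            fasta_files += sorted(Path(d).glob(pattern))
        return fasta_files

    print(get_list_of_files(["bins/sample1", "bins/sample2"]))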