diff --git a/MANIFEST.json b/MANIFEST.json new file mode 100644 index 0000000..cd89d0e --- /dev/null +++ b/MANIFEST.json @@ -0,0 +1,7 @@ +{ + "mainWorkflowURL": "main.nf", + "inputFileURLs": [ + "inputs.json" + ], + "engineOptions": "-profile agc" +} diff --git a/README.md b/README.md index 7b75361..501691d 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,127 @@ For more information on the parameters run `nextflow run main.nf --help` The path to the singularity images can also be changed in the singularity profile in `nextflow.config`. Default value is `${baseDir}/singularity` + +## Amazon Genomics CLI ## +The workflow can be executed on Amazon Web Services infrastructure using [Amazon Genomics CLI](https://aws.github.io/amazon-genomics-cli/) (AGC). See the [prerequisites](https://aws.github.io/amazon-genomics-cli/docs/getting-started/prerequisites/). + +### Prepare workflow execution through AGC ### + +1. Download and install AGC following the [instructions](https://aws.github.io/amazon-genomics-cli/docs/getting-started/installation) + +2. Activate the AWS account for use with AGC. This will deploy the AGC core infrastructure in your AWS account. + +``` +agc account activate +``` + +3. Define a username by configuring your email address + +``` +agc configure email you@youremail.com +``` + +4. Configure additional S3 buckets (optional) + +AGC creates an S3 bucket to store logs and outputs and for input caching. If you want to use a separate bucket for resources and inputs, this needs to be configured in `agc-project.yaml`: + +``` +data: + - location: s3:// + readOnly: true +``` + +Please note that AGC can write to the bucket provisioned on account activation. Access to any other buckets is read-only. If you are not using additional S3 buckets, delete the `data` section from `agc-project.yaml`. + +5. Provision resources. Run `provision_resources.sh` to upload the KrakenDB files, Bowtie2 index files and TB pipeline resource files to S3, e.g.: + +``` +./provision_resources.sh s3:///project/tbpipeline/resources/ +``` + +Note that the `resources` folder in the project directory will be moved out of the directory to `../tb-pipeline-resources`. This avoids the `resources` folder being packaged up with the project directory and uploaded to AGC every time an AGC run is submitted. + +6. Deploy the AGC context. This will deploy the compute environment to execute workflows in your AWS account. Two contexts are defined in `agc-project.yaml`: `ondemand` for execution on on-demand EC2 instances and `spot` for execution on spot instances. + +To deploy the `ondemand` context: + +``` +agc context deploy --context ondemand +``` + +7. Edit the `inputs.json` file as required. The `inputs.json` file defines the workflow parameters used by Nextflow to run the workflow, e.g.: + +``` +{ + "input_dir": "s3:///input/sequencing/mtuberculosis", + "filetype": "fastq", + "pattern": "*_{1,2}.fastq.gz", + "species": "tuberculosis", + "unmix_myco": "yes", + "resources_dir": "s3:///project/tbpipeline/resources/tbpipeline", + "kraken_db": "s3:///project/tbpipeline/resources/kraken_db/k2_pluspf_16gb_20220607", + "bowtie2_index": "s3:///project/tbpipeline/resources/bowtie2_index/hg19_1kgmaj", + "bowtie_index_name": "hg19_1kgmaj", + "output_dir": "s3:///project/tbpipeline/output", + "vcfmix": "yes", + "gnomon": "yes", + "report_dir": "s3:///project/tbpipeline/reports", + "container_registry": "/tb-pipeline" +} +``` + +The `container_registry` and `report_dir` parameters are optional.
If not provided, the `container_registry` parameter defaults to `quay.io/pathogen-genomics-cymru`. + + +## Execute and track workflows through AGC ## + +1. Submit a workflow run + +``` +agc workflow run tbpipeline -c ondemand +``` + +2. Check workflow status + +``` +agc workflow status -c ondemand -r +``` + +3. Check Nextflow engine logs + +``` +agc logs engine -c ondemand -r +``` + +4. Check workflow logs + +``` +agc logs workflow tbpipeline -r +``` + +5. Stop a workflow run + +``` +agc workflow stop +``` + +See the [AGC command reference](https://aws.github.io/amazon-genomics-cli/docs/reference/) for all `agc` commands. + +### Clean up ### + +1. Destroy the context. This will remove the resources associated with the named context from your account but will keep any S3 outputs and CloudWatch logs. + +``` +agc context destroy ondemand +``` + +2. Deactivate the account. If you want to stop using Amazon Genomics CLI in your AWS account entirely and remove all resources created by AGC, you need to deactivate it. + +``` +agc account deactivate +``` + + ## Stub-run ## To test the stub run: ``` diff --git a/agc-project.yaml b/agc-project.yaml new file mode 100644 index 0000000..c5d4efc --- /dev/null +++ b/agc-project.yaml @@ -0,0 +1,18 @@ +name: tbpipeline +schemaVersion: 1 +workflows: + tbpipeline: + type: + language: nextflow + version: dsl2 + sourceURL: ./ +contexts: + ondemand: + engines: + - type: nextflow + engine: nextflow + spot: + requestSpotInstances: true + engines: + - type: nextflow + engine: nextflow diff --git a/bin/create_final_json.py b/bin/create_final_json.py old mode 100644 new mode 100755 index 6885316..ad1d59b --- a/bin/create_final_json.py +++ b/bin/create_final_json.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import json import os import sys diff --git a/bin/identify_tophit_and_contaminants2.py b/bin/identify_tophit_and_contaminants2.py old mode 100644 new mode 100755 index 4ea2ebb..4c9f790 --- a/bin/identify_tophit_and_contaminants2.py +++ b/bin/identify_tophit_and_contaminants2.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import json import os import sys diff --git a/bin/parse_kraken_report2.py b/bin/parse_kraken_report2.py old mode 100644 new mode 100755 index caf1a5a..6aa583b --- a/bin/parse_kraken_report2.py +++ b/bin/parse_kraken_report2.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + import json import os import sys diff --git a/bin/software-json.py b/bin/software-json.py old mode 100644 new mode 100755 index cf543f9..b68da13 --- a/bin/software-json.py +++ b/bin/software-json.py @@ -17,7 +17,7 @@ def go(path): for filename in glob.glob(os.path.join(path, "Singularity.*")): extension = filename.split('.', 1)[1] version = filename.split('-')[-1] - with open(os.path.join(path, filename), 'r') as infile: + with open(os.path.join(filename), 'r') as infile: copy = False for line in infile: if line.strip() == "%environment": diff --git a/bin/vcfmix.py b/bin/vcfmix.py old mode 100644 new mode 100755 diff --git a/docker/build_and_push.sh b/docker/build_and_push.sh new file mode 100755 index 0000000..dd4d654 --- /dev/null +++ b/docker/build_and_push.sh @@ -0,0 +1,68 @@ +#!/usr/bin/env bash + +# This script builds the Docker image and pushes it to ECR to be ready for use +# by the pipeline. + +# The arguments to this script are the image name and version. These are used to tag the image on the local +# machine and, combined with the account and region, to form the repository name for ECR.
+image=$1 +version=$2 + +if [ "$image" == "" ] +then + echo "Usage: $0 <image-name> <version>" + exit 1 +fi + +if [ "$version" == "" ] +then + echo "Usage: $0 <image-name> <version>" + exit 1 +fi + + +# Get the account number associated with the current IAM credentials +account=$(aws sts get-caller-identity --query Account --output text) + +if [ $? -ne 0 ] +then + exit 255 +fi + + +# Get the region defined in the current configuration (default to us-west-2 if none defined) +region=$(aws configure get region) +region=${region:-us-west-2} + +dockerfile="Dockerfile.${image}-${version}" +ecr_repo="${account}.dkr.ecr.${region}.amazonaws.com" +local_tag="tb-pipeline/${image}:${version}" +ecr_tag="${ecr_repo}/tb-pipeline/${image}:${version}" + + +echo "AWS Region: ${region}" +echo "Dockerfile: ${dockerfile}" +echo "Local tag : ${local_tag}" +echo "ECR tag : ${ecr_tag}" + + +# If the repository doesn't exist in ECR, create it. +aws ecr describe-repositories --repository-names "tb-pipeline/${image}" > /dev/null 2>&1 + +if [ $? -ne 0 ] +then + echo "The repository with name tb-pipeline/${image} does not exist in the registry ${ecr_repo}. Creating repository." + aws ecr create-repository --repository-name "tb-pipeline/${image}" +# > /dev/null +fi + +# Get the login command from ECR and execute it directly +aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${account}".dkr.ecr."${region}".amazonaws.com + +# Build the docker image locally with the image name and then push it to ECR +# with the full name. + +docker build -t ${local_tag} -f ${dockerfile} ./ +docker tag ${local_tag} ${ecr_tag} + +docker push ${ecr_tag} diff --git a/docker/pull_and_push.sh b/docker/pull_and_push.sh new file mode 100755 index 0000000..cc1b655 --- /dev/null +++ b/docker/pull_and_push.sh @@ -0,0 +1,66 @@ +#!/usr/bin/env bash + +# This script pulls a Docker image from quay.io and pushes it to ECR to be ready for use +# by the pipeline. + +# The arguments to this script are the image name and version. These are used to tag the image on the local +# machine and, combined with the account and region, to form the repository name for ECR. +image=$1 +version=$2 + +if [ "$image" == "" ] +then + echo "Usage: $0 <image-name> <version>" + exit 1 +fi + +if [ "$version" == "" ] +then + echo "Usage: $0 <image-name> <version>" + exit 1 +fi + + +# Get the account number associated with the current IAM credentials +account=$(aws sts get-caller-identity --query Account --output text) + +if [ $? -ne 0 ] +then + exit 255 +fi + + +# Get the region defined in the current configuration (default to eu-west-2 if none defined) +region=$(aws configure get region) +region=${region:-eu-west-2} + +ecr_repo="${account}.dkr.ecr.${region}.amazonaws.com" +remote_tag="quay.io/pathogen-genomics-cymru/${image}:${version}" +ecr_tag="${ecr_repo}/tb-pipeline/${image}:${version}" + + +echo "AWS Region : ${region}" +echo "Source image tag: $remote_tag" +echo "Target image tag: $ecr_tag" + + +# If the repository doesn't exist in ECR, create it. +aws ecr describe-repositories --repository-names "tb-pipeline/${image}" > /dev/null 2>&1 + +if [ $? -ne 0 ] +then + echo "The repository with name tb-pipeline/${image} does not exist in the registry ${ecr_repo}. Creating repository."
+ aws ecr create-repository --repository-name "tb-pipeline/${image}" +# > /dev/null +fi + +# Get the login command from ECR and execute it directly +aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${account}".dkr.ecr."${region}".amazonaws.com + +# Pull the docker image from the source registry, retag it for ECR and then push it +# with the full name. + +docker pull ${remote_tag} +docker tag ${remote_tag} ${ecr_tag} + +docker push ${ecr_tag} diff --git a/dryrun-test-agc.py b/dryrun-test-agc.py new file mode 100755 index 0000000..5d8a9e6 --- /dev/null +++ b/dryrun-test-agc.py @@ -0,0 +1,154 @@ +#!/usr/bin/env python3 + +import json +import os +import sys +import subprocess +import argparse + +parser = argparse.ArgumentParser( + prog = 'dryrun-test-agc.py', + description = 'For each test scenario generates inputs.json and MANIFEST.json files and submits a workflow run via Amazon Genomics CLI.' +) +parser.add_argument('-o', '--output_dir', dest='output_dir', help='parent output directory for test runs', required=True) +parser.add_argument('-r', '--resources_dir', dest='resources_dir', help='path to TB pipeline resources', required=True) +parser.add_argument('-k', '--kraken_db', dest='kraken_db', help='path to KrakenDB files', required=True) +parser.add_argument('-b', '--bowtie2_index', dest='bowtie2_index', help='path to Bowtie2 index files', required=True) +parser.add_argument('-c', '--context', dest='agc_context', help='AGC context to run the workflow in', default='ondemand', required=False) + +scenarios = [ + ['fastq', "*_R{1,2}.fastq.gz", 'OK', 'null', 'dryRun', 'dryRun', 'dryRun', 'NOW_DECONTAMINATE_dryRun', 'dryRun', 'NOW_ALIGN_TO_REF_dryRun', 'NOW_VARCALL_dryRun', 'CREATE_ANTIBIOGRAM_dryRun', 'yes', 'yes'], + ['fastq', "*_R{1,2}.fastq.gz", 'null', 'null', 'fail', 'null', 'null', 'null', 'null', 'null', 'null', 'null', 'no', 'no'], + ['fastq', "*_R{1,2}.fastq.gz", 'OK', 'null', 'fail', 'null', 'null', 'null', 'null', 'null', 'null', 'null', 'no', 'no'], + ['fastq', "*_R{1,2}.fastq.gz", 'OK', 'null', 'dryRun', 'fail', 'null', 'null', 'null', 'null', 'null', 'null', 'no', 'no'], + ['fastq', "*_R{1,2}.fastq.gz", 'OK', 'null', 'dryRun', 'dryRun', 'null', 'null', 'null', 'null', 'null', 'null', 'no', 'no'], + ['fastq', "*_R{1,2}.fastq.gz", 'OK', 'null', 'dryRun', 'dryRun', 'dryRun', 'null', 'null', 'null', 'null', 'null', 'no', 'no'], + ['fastq', "*_R{1,2}.fastq.gz", 'OK', 'null', 'dryRun', 'dryRun', 'dryRun', 'NOW_DECONTAMINATE_dryRun', 'dryRun', 'NOW_ALIGN_TO_REF_dryRun', 'NOW_VARCALL_dryRun', 'CREATE_ANTIBIOGRAM_dryRun', 'no', 'no'], + ['fastq', "*_R{1,2}.fastq.gz", 'OK', 'null', 'dryRun', 'dryRun', 'dryRun', 'NOW_DECONTAMINATE_dryRun', 'fail', 'null', 'null', 'null', 'no', 'no'], + ['fastq', "*_R{1,2}.fastq.gz", 'OK', 'null', 'dryRun', 'dryRun', 'dryRun', 'NOW_DECONTAMINATE_dryRun', 'dryRun', 'null', 'null', 'null', 'no', 'no'], + ['fastq', "*_R{1,2}.fastq.gz", 'OK', 'null', 'dryRun', 'dryRun', 'dryRun', 'NOW_DECONTAMINATE_dryRun', 'dryRun', 'NOW_ALIGN_TO_REF_dryRun', 'null', 'null', 'no', 'no'] +] + +file_name_inputs_json = "inputs.json" +file_name_manifest_json = "MANIFEST.json" + +def main(output_dir, resources_dir, kraken_db, bowtie2_index, context): + + print("submitting test workflow runs via Amazon Genomics CLI...") + print("AGC context : {}".format(context)) + print("output directory : {}".format(output_dir)) + print("TB pipeline resources: {}".format(resources_dir)) + print("") + print("backing up {} and 
{}...".format(file_name_inputs_json, file_name_manifest_json)) + + # back up inputs.json and MANIFEST.json + backup_files([file_name_inputs_json, file_name_manifest_json]) + + # for each scenario... + count = 0 + for scenario in scenarios: + + scenario_no = count + 1 + + output_dir_scenario = os.path.join(output_dir, "scenario{}".format(scenario_no)) + report_dir_scenario = os.path.join(output_dir_scenario, "reports") + scenario.append(output_dir_scenario) + scenario.append(resources_dir) + scenario.append(report_dir_scenario) + scenario.append(kraken_db) + scenario.append(bowtie2_index) + + + # write scenario inputs.json + write_inputs_json(scenario, file_name_inputs_json) + + # write scenario MANIFEST.json + write_manifest_json(file_name_inputs_json, file_name_manifest_json) + + # submit workflow run + print("submitting workflow run for scenario {}...".format(scenario_no)) + submit_workflow_run(context) + + count += 1 + break + + # restore inputs.json and MANIFEST.json + print("restoring {} and {}...".format(file_name_inputs_json, file_name_manifest_json)) + restore_files([file_name_inputs_json, file_name_manifest_json]) + + +def backup_files(file_names): + + for file_name in file_names: + output = subprocess.check_output( + "mv -v {} {}.bk".format(file_name, file_name), + stderr=subprocess.STDOUT, + shell=True + ) + print(output.decode("utf-8").strip()) + + +def restore_files(file_names): + + for file_name in file_names: + output = subprocess.check_output( + "mv -v {}.bk {}".format(file_name, file_name), + stderr=subprocess.STDOUT, + shell=True + ) + print(output.decode("utf-8").strip()) + + +def write_inputs_json(scenario, file_name): + + scenario_dict = { + "filetype": scenario[0], + "pattern": scenario[1], + "checkFqValidity_isok": scenario[2], + "checkBamValidity_isok": scenario[3], + "countReads_runfastp": scenario[4], + "fastp_enoughreads": scenario[5], + "kraken2_runmykrobe": scenario[6], + "identifyBacContam_rundecontam": scenario[7], + "downloadContamGenomes_fapass": scenario[8], + "summary_doWeAlign": scenario[9], + "alignToRef_doWeVarCall": scenario[10], + "minos_isSampleTB": scenario[11], + "vcfmix": scenario[12], + "gnomon": scenario[13], + "output_dir": scenario[14], + "resources_dir": scenario[15], + "report_dir": scenario[16] + } + + with open(file_name, "w") as inputs_json: + inputs_json.write(json.dumps(scenario_dict, indent=2)) + + +def write_manifest_json(file_name_inputs_json, file_name): + + manifest_dict = { + "mainWorkflowURL": "main.nf", + "inputFileURLs": [ + file_name_inputs_json + ], + "engineOptions": "-stub -config ./project/testing.config -profile agc" + } + + with open(file_name, "w") as manifest_json: + manifest_json.write(json.dumps(manifest_dict, indent=2)) + + +def submit_workflow_run(context): + output = subprocess.check_output( + "agc workflow run tbpipeline -c {}".format(context), + stderr=subprocess.STDOUT, + shell=True + ) + print(output.decode("utf-8").strip()) + + +if __name__ == '__main__': + + args = parser.parse_args() + main(args.output_dir, args.resources_dir, args.kraken_db, args.bowtie2_index, args.agc_context) diff --git a/inputs.json b/inputs.json new file mode 100644 index 0000000..bfc233b --- /dev/null +++ b/inputs.json @@ -0,0 +1,16 @@ +{ + "input_dir": "s3:///project/tbpipeline/input/sequencing/mtuberculosis", + "filetype": "fastq", + "pattern": "*_{1,2}.fastq.gz", + "species": "tuberculosis", + "unmix_myco": "yes", + "resources_dir": "s3:///project/tbpipeline/resources/tbpipeline", + "kraken_db": 
"s3:///project/tbpipeline/resources/kraken_db/k2_pluspf_16gb_20220607", + "bowtie2_index": "s3:///project/tbpipeline/resources/bowtie2_index/hg19_1kgmaj", + "bowtie_index_name": "hg19_1kgmaj", + "output_dir": "s3:///project/tbpipeline/output/", + "vcfmix": "yes", + "gnomon": "yes", + "report_dir": "s3:///project/tbpipeline/reports/", + "container_registry": "/tb-pipeline" +} diff --git a/main.nf b/main.nf index 0ab4489..4b65ca8 100644 --- a/main.nf +++ b/main.nf @@ -67,12 +67,15 @@ Optional parameters: if you DO use this parameter, pipeline will expect this to be the principal species. It will fail the sample if reads from this species are not actually the majority - +--resources_dir Path to TB pipeline resources directory +--report_dir Output directory for execution reports +--container_registry Container registry to pull TB pipeline containers from Profiles: ------------------------------------------------------------------------ singularity to run with singularity -docker to run with docker +docker to run with docker +agc to run with Amazon Genomics CLI Examples: @@ -129,12 +132,16 @@ Parameters used: --vcfmix ${params.vcfmix} --gnomon ${params.gnomon} --amr_cat ${params.amr_cat} +--resources_dir ${params.resources_dir} +--report_dir ${params.report_dir} +--container_registry ${params.container_registry} Runtime data: ------------------------------------------------------------------------ Running with profile ${ANSI_GREEN}${workflow.profile}${ANSI_RESET} Running as user ${ANSI_GREEN}${workflow.userName}${ANSI_RESET} Launch directory ${ANSI_GREEN}${workflow.launchDir}${ANSI_RESET} +Workflow session ID ${ANSI_GREEN}${workflow.sessionId}${ANSI_RESET} """ .stripIndent() @@ -165,19 +172,24 @@ workflow { .set{ input_files } } + // create channel for singularity directory + sing_dir_ch = Channel.fromPath( "${params.sing_dir}" ) + // create channels for kraken2 database and bowtie2 index krakenDB = Channel.fromPath( "${params.kraken_db}/*.k2d" ) bowtie_dir = Channel.fromPath( "${params.bowtie2_index}/*.bt2" ) + // create channel for resources + resources_dir_ch = Channel.fromPath( "${params.resources_dir}" ) // main workflow main: // GETVERSION SUB-WORKFLOW - getversion() + getversion(sing_dir_ch) // PREPROCESSING SUB-WORKFLOW - preprocessing(input_files, krakenDB, bowtie_dir) + preprocessing(input_files, krakenDB, bowtie_dir, resources_dir_ch) // CLOCKWORK SUB-WORKFLOW @@ -188,7 +200,7 @@ workflow { nomix_seqs_json = preprocessing.out.nocontam_seqs_json - clockwork(clockwork_seqs.join(clockwork_json, by: 0).mix(nomix_seqs_json)) + clockwork(clockwork_seqs.join(clockwork_json, by: 0).mix(nomix_seqs_json), resources_dir_ch) } @@ -204,7 +216,7 @@ workflow { minos_vcf = clockwork.out.minos_vcf // VCFPREDICT SUB-WORKFLOW - vcfpredict(mpileup_vcf, minos_vcf) + vcfpredict(mpileup_vcf, minos_vcf, resources_dir_ch) } diff --git a/modules/clockworkModules.nf b/modules/clockworkModules.nf index c5a021b..60e2362 100644 --- a/modules/clockworkModules.nf +++ b/modules/clockworkModules.nf @@ -15,6 +15,7 @@ process alignToRef { input: tuple val(sample_name), path(fq1), path(fq2), path(json), val(doWeAlign) + path resources_dir when: doWeAlign =~ /NOW\_ALIGN\_TO\_REF\_${sample_name}/ @@ -45,8 +46,8 @@ process alignToRef { samtools index ${bam} ${bai} samtools stats ${bam} > ${stats} - python3 ${baseDir}/bin/parse_samtools_stats.py ${bam} ${stats} > ${stats_json} - python3 ${baseDir}/bin/create_final_json.py ${stats_json} ${json} + parse_samtools_stats.py ${bam} ${stats} > ${stats_json} + create_final_json.py 
${stats_json} ${json} continue=\$(jq -r '.summary_questions.continue_to_clockwork' ${out_json}) if [ \$continue == 'yes' ]; then printf "NOW_VARCALL_${sample_name}" && printf "" >> ${error_log}; elif [ \$continue == 'no' ]; then echo "error: insufficient number and/or quality of read alignments to the reference genome" >> ${error_log}; fi @@ -122,6 +123,7 @@ process callVarsCortex { input: tuple val(sample_name), path(json), path(bam), path(ref), val(doWeVarCall) + path resources_dir when: doWeVarCall =~ /NOW\_VARCALL\_${sample_name}/ diff --git a/modules/getversionModules.nf b/modules/getversionModules.nf index eaaeda3..1b1495c 100644 --- a/modules/getversionModules.nf +++ b/modules/getversionModules.nf @@ -1,4 +1,5 @@ // modules for the getversion workflow +params.output_dir = "${params.output_dir}" process getversion { @@ -6,13 +7,16 @@ process getversion { publishDir "${params.output_dir}", mode: 'copy', pattern: '*.json', overwrite: 'true' + input: + path sing_dir + output: path("version.json", emit: getversion_json) script: """ - python3 ${baseDir}/bin/software-json.py ${params.sing_dir} + software-json.py ${sing_dir} """ stub: diff --git a/modules/preprocessingModules.nf b/modules/preprocessingModules.nf index 496f6bc..d2b7049 100644 --- a/modules/preprocessingModules.nf +++ b/modules/preprocessingModules.nf @@ -277,9 +277,9 @@ process kraken2 { """ kraken2 --threads ${task.cpus} --db . --output ${kraken2_read_classification} --report ${kraken2_report} --paired $fq1 $fq2 - python3 ${baseDir}/bin/parse_kraken_report2.py ${kraken2_report} ${kraken2_json} 0.5 5000 + parse_kraken_report2.py ${kraken2_report} ${kraken2_json} 0.5 5000 - ${baseDir}/bin/extract_kraken_reads.py -k ${kraken2_read_classification} -r ${kraken2_report} -s $fq1 -s2 $fq2 -o ${nonBac_depleted_reads_1} -o2 ${nonBac_depleted_reads_2} --taxid 2 --include-children --fastq-output >/dev/null + extract_kraken_reads.py -k ${kraken2_read_classification} -r ${kraken2_report} -s $fq1 -s2 $fq2 -o ${nonBac_depleted_reads_1} -o2 ${nonBac_depleted_reads_2} --taxid 2 --include-children --fastq-output >/dev/null gzip -f ${nonBac_depleted_reads_1} gzip -f ${nonBac_depleted_reads_2} @@ -410,6 +410,7 @@ process identifyBacterialContaminants { input: tuple val(sample_name), path(fq1), path(fq2), path(mykrobe_json), val(enough_myco_reads), path(kraken_report), path(kraken_json) + path resources_dir when: enough_myco_reads =~ /${sample_name}/ @@ -425,7 +426,7 @@ process identifyBacterialContaminants { error_log = "${sample_name}.err" """ - python3 ${baseDir}/bin/identify_tophit_and_contaminants2.py ${mykrobe_json} ${kraken_json} ${baseDir}/resources/assembly_summary_refseq.txt ${params.species} ${params.unmix_myco} ${baseDir}/resources null + identify_tophit_and_contaminants2.py ${mykrobe_json} ${kraken_json} ${resources_dir}/assembly_summary_refseq.txt ${params.species} ${params.unmix_myco} ${resources_dir} null cp ${sample_name}_species_in_sample.json ${sample_name}_species_in_sample_previous.json @@ -580,7 +581,7 @@ process reKraken { """ kraken2 --threads ${task.cpus} --db . 
--output ${kraken2_read_classification} --report ${kraken2_report} --paired $fq1 $fq2 - python3 ${baseDir}/bin/parse_kraken_report2.py ${kraken2_report} ${kraken2_json} 0.5 5000 + parse_kraken_report2.py ${kraken2_report} ${kraken2_json} 0.5 5000 rm -rf ${sample_name}_read_classifications.txt """ @@ -642,6 +643,7 @@ process summarise { input: tuple val(sample_name), path(mykrobe_json), path(kraken_report), path(kraken_json), path(prev_species_json), val(decontam) + path resources_dir output: tuple val(sample_name), path("${sample_name}_species_in_sample.json"), stdout, emit: summary_json @@ -651,7 +653,7 @@ process summarise { error_log = "${sample_name}.err" """ - python3 ${baseDir}/bin/identify_tophit_and_contaminants2.py ${mykrobe_json} ${kraken_json} ${baseDir}/resources/assembly_summary_refseq.txt ${params.species} ${params.unmix_myco} ${baseDir}/resources ${prev_species_json} + identify_tophit_and_contaminants2.py ${mykrobe_json} ${kraken_json} ${resources_dir}/assembly_summary_refseq.txt ${params.species} ${params.unmix_myco} ${resources_dir} ${prev_species_json} contam_to_remove=\$(jq -r '.summary_questions.are_there_contaminants' ${sample_name}_species_in_sample.json) acceptable_species=\$(jq -r '.summary_questions.is_the_top_species_appropriate' ${sample_name}_species_in_sample.json) diff --git a/modules/vcfpredictModules.nf b/modules/vcfpredictModules.nf index 1d1120d..dd23a7d 100644 --- a/modules/vcfpredictModules.nf +++ b/modules/vcfpredictModules.nf @@ -23,7 +23,7 @@ process vcfmix { error_log = "${sample_name}.err" """ - python3 ${baseDir}/bin/vcfmix.py ${bcftools_vcf} + vcfmix.py ${bcftools_vcf} if [ ${params.gnomon} == "no" ]; then printf "workflow complete without error" >> ${error_log}; fi """ @@ -56,6 +56,7 @@ process gnomon { input: tuple val(sample_name), path(vcf), val(isSampleTB) + path resources_dir when: isSampleTB =~ /CREATE\_ANTIBIOGRAM\_${sample_name}/ @@ -70,7 +71,7 @@ process gnomon { error_log = "${sample_name}.err" """ - gnomonicus --genome_object ${baseDir}/resources/H37rV_v3.gbk --catalogue ${params.amr_cat} --vcf_file ${minos_vcf} --output_dir . --json --fasta fixed + gnomonicus --genome_object ${resources_dir}/H37rV_v3.gbk --catalogue ${params.amr_cat} --vcf_file ${minos_vcf} --output_dir . 
--json --fasta fixed printf "workflow complete without error" >> ${error_log} """ diff --git a/nextflow.config b/nextflow.config index 0c54c1e..3c37ead 100644 --- a/nextflow.config +++ b/nextflow.config @@ -45,6 +45,15 @@ params { // path to singularity recipes (needed for getversion) sing_dir = "${baseDir}/singularity" + + // path to TB pipeline resources directory + resources_dir = "${baseDir}/resources" + + // path to execution report output directory + report_dir = "reports" + + // container registry to pull TB pipeline container images from + container_registry = "quay.io/pathogen-genomics-cymru" } @@ -163,5 +172,60 @@ profiles { } } } -} + agc { + + // define containers for each process + process { + withLabel:normal_CPU { cpus = 8 } + withLabel:low_memory { memory = '5GB' } + withLabel:medium_memory { memory = '10GB' } + withLabel:high_memory { memory = '18GB' } + + withLabel:getversion { + container = "${params.container_registry}/preprocessing:0.9.4" + } + + withLabel:preprocessing { + container = "${params.container_registry}/preprocessing:0.9.4" + } + + withName:downloadContamGenomes { + // disable strict error checking to allow for non-matching lines in linktestlog.txt + shell = ['/bin/bash','-u'] + } + + withLabel:clockwork { + container = "${params.container_registry}/clockwork:0.9.4" + } + + withLabel:vcfpredict { + container = "711700981500.dkr.ecr.eu-west-2.amazonaws.com/tb-pipeline/vcfpredict:0.9.4" + } + } + + report { + enabled = true + file = "${params.report_dir}/report.html" + overwrite = true + } + + trace { + enabled = true + file = "${params.report_dir}/trace.txt" + overwrite = true + } + + timeline { + enabled = true + file = "${params.report_dir}/timeline.html" + overwrite = true + } + + dag { + enabled = true + file = "${params.report_dir}/workflow_dag.html" + overwrite = true + } + } +} diff --git a/provision_resources.sh b/provision_resources.sh new file mode 100755 index 0000000..117f419 --- /dev/null +++ b/provision_resources.sh @@ -0,0 +1,43 @@ +#!/bin/bash +set -x + +S3_URI=$1 + +# remove trailing / +S3_URI=`echo $S3_URI | sed 's:/$::'` + +KRAKEN_DB='k2_pluspf_16gb_20220607.tar.gz' +KRAKEN_DB_URL='https://genome-idx.s3.amazonaws.com/kraken' +KRAKEN_DB_S3_URL="$S3_URI/kraken_db/k2_pluspf_16gb_20220607/" + +BOWTIE2_INDEX='hg19_1kgmaj_snvs_bt2.zip' +BOWTIE2_INDEX_URL='https://genome-idx.s3.amazonaws.com/bt' +BOWTIE2_INDEX_S3_URI="$S3_URI/bowtie2_index/hg19_1kgmaj/" + +TB_PIPELINE_RESOURCES_S3_URI="$S3_URI/tbpipeline/" + +TMPDIR=$(mktemp -d) + +# download and extract KrakenDB files +wget -c $KRAKEN_DB_URL/$KRAKEN_DB +mkdir $TMPDIR/k2db +tar xzf $KRAKEN_DB -C $TMPDIR/k2db + +# upload KrakenDB files to S3 and remove local copy +aws s3 sync $TMPDIR/k2db $KRAKEN_DB_S3_URL +rm -r $TMPDIR/k2db +rm $KRAKEN_DB + +# download and extract Bowtie index files +wget -c $BOWTIE2_INDEX_URL/$BOWTIE2_INDEX +mkdir $TMPDIR/bowtiedb +unzip $BOWTIE2_INDEX -d $TMPDIR/bowtiedb + +# upload Bowtie index files to S3 and remove local copy +aws s3 sync $TMPDIR/bowtiedb $BOWTIE2_INDEX_S3_URI +rm -r $TMPDIR/bowtiedb +rm $BOWTIE2_INDEX + +# upload resources to S3 and move them out of the project directory +aws s3 sync resources $TB_PIPELINE_RESOURCES_S3_URI +mv resources ../tb-pipeline-resources diff --git a/workflows/clockwork.nf b/workflows/clockwork.nf index 9a8fcb0..ede7ff8 100644 --- a/workflows/clockwork.nf +++ b/workflows/clockwork.nf @@ -13,14 +13,15 @@ workflow clockwork { take: input_seqs_json + resources_dir main: - alignToRef(input_seqs_json) + alignToRef(input_seqs_json, resources_dir)
callVarsMpileup(alignToRef.out.alignToRef_bam) - callVarsCortex(alignToRef.out.alignToRef_bam) + callVarsCortex(alignToRef.out.alignToRef_bam, resources_dir) minos(alignToRef.out.alignToRef_bam.join(callVarsCortex.out.cortex_vcf, by: 0).join(callVarsMpileup.out.mpileup_vcf, by: 0)) diff --git a/workflows/preprocessing.nf b/workflows/preprocessing.nf index f413403..d38a814 100644 --- a/workflows/preprocessing.nf +++ b/workflows/preprocessing.nf @@ -25,6 +25,7 @@ workflow preprocessing { input_files krakenDB bowtie_dir + resources_dir main: @@ -54,7 +55,7 @@ workflow preprocessing { bowtie2(kraken2.out.kraken2_fqs, bowtie_dir.toList()) - identifyBacterialContaminants(bowtie2.out.bowtie2_fqs.join(mykrobe.out.mykrobe_report, by: 0).join(kraken2.out.kraken2_report, by: 0)) + identifyBacterialContaminants(bowtie2.out.bowtie2_fqs.join(mykrobe.out.mykrobe_report, by: 0).join(kraken2.out.kraken2_report, by: 0), resources_dir) downloadContamGenomes(identifyBacterialContaminants.out.contam_list) @@ -64,7 +65,7 @@ workflow preprocessing { reMykrobe(mapToContamFa.out.reClassification_fqs) - summarise(reMykrobe.out.reMykrobe_report.join(reKraken.out.reKraken_report, by: 0).join(identifyBacterialContaminants.out.prev_sample_json, by: 0)) + summarise(reMykrobe.out.reMykrobe_report.join(reKraken.out.reKraken_report, by: 0).join(identifyBacterialContaminants.out.prev_sample_json, by: 0), resources_dir) emit: diff --git a/workflows/vcfpredict.nf b/workflows/vcfpredict.nf index 914954f..b7266fc 100644 --- a/workflows/vcfpredict.nf +++ b/workflows/vcfpredict.nf @@ -12,6 +12,7 @@ workflow vcfpredict { clockwork_bcftools clockwork_minos + resources_dir main: @@ -23,7 +24,7 @@ workflow vcfpredict { if ( params.gnomon == "yes" ) { - gnomon(clockwork_minos) + gnomon(clockwork_minos, resources_dir) }