diff --git a/.github/include.yaml b/.github/include.yaml new file mode 100644 index 00000000..a3629f4c --- /dev/null +++ b/.github/include.yaml @@ -0,0 +1,10 @@ +".": + - ./.github/workflows/** + - ./nf-test.config + - ./nextflow.config +tests: + - ./assets/* + - ./bin/* + - ./conf/* + - ./main.nf + - ./nextflow_schema.json diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml deleted file mode 100644 index e84027f9..00000000 --- a/.github/workflows/awsfulltest.yml +++ /dev/null @@ -1,56 +0,0 @@ -name: nf-core AWS full size tests -# This workflow is triggered on PRs opened against the master branch. -# It can be additionally triggered manually with GitHub actions workflow dispatch button. -# It runs the -profile 'test_full' on AWS batch - -on: - pull_request: - branches: - - master - workflow_dispatch: - pull_request_review: - types: [submitted] - -jobs: - run-platform: - name: Run AWS full tests - # run only if the PR is approved by at least 2 reviewers and against the master branch or manually triggered - if: github.repository == 'nf-core/fetchngs' && github.event.review.state == 'approved' && github.event.pull_request.base.ref == 'master' || github.event_name == 'workflow_dispatch' - runs-on: ubuntu-latest - steps: - - uses: octokit/request-action@v2.x - id: check_approvals - with: - route: GET /repos/${{ github.repository }}/pulls/${{ github.event.pull_request.number }}/reviews - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - id: test_variables - if: github.event_name != 'workflow_dispatch' - run: | - JSON_RESPONSE='${{ steps.check_approvals.outputs.data }}' - CURRENT_APPROVALS_COUNT=$(echo $JSON_RESPONSE | jq -c '[.[] | select(.state | contains("APPROVED")) ] | length') - test $CURRENT_APPROVALS_COUNT -ge 2 || exit 1 # At least 2 approvals are required - - name: Launch workflow via Seqera Platform - uses: seqeralabs/action-tower-launch@v2 - # TODO nf-core: You can customise AWS full pipeline tests as required - # Add full 
size test data (but still relatively small datasets for few samples) - # on the `test_full.config` test runs with only one set of parameters - with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/fetchngs/work-${{ github.sha }} - parameters: | - { - "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/fetchngs/results-${{ github.sha }}" - } - profiles: test_full - - - uses: actions/upload-artifact@v4 - with: - name: Seqera Platform debug log file - path: | - seqera_platform_action_*.log - seqera_platform_action_*.json diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml deleted file mode 100644 index 456c91de..00000000 --- a/.github/workflows/awstest.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: nf-core AWS test -# This workflow can be triggered manually with the GitHub actions workflow dispatch button. 
-# It runs the -profile 'test' on AWS batch - -on: - workflow_dispatch: -jobs: - run-platform: - name: Run AWS tests - if: github.repository == 'nf-core/fetchngs' - runs-on: ubuntu-latest - steps: - # Launch workflow using Seqera Platform CLI tool action - - name: Launch workflow via Seqera Platform - uses: seqeralabs/action-tower-launch@v2 - with: - workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} - access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} - compute_env: ${{ secrets.TOWER_COMPUTE_ENV }} - revision: ${{ github.sha }} - workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/fetchngs/work-${{ github.sha }} - parameters: | - { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/fetchngs/results-test-${{ github.sha }}" - } - profiles: test - - - uses: actions/upload-artifact@v4 - with: - name: Seqera Platform debug log file - path: | - seqera_platform_action_*.log - seqera_platform_action_*.json diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 29a4648a..9d1557f6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,15 +1,22 @@ -name: nf-core CI # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +name: nf-core CI on: - push: - branches: - - dev pull_request: release: types: [published] + merge_group: + types: + - checks_requested + branches: + - master + - dev workflow_dispatch: env: + NFT_DIFF: "pdiff" + NFT_DIFF_ARGS: "--line-numbers --expand-tabs=2" + NFT_VER: "0.9.0" + NFT_WORKDIR: "~" NXF_ANSI_LOG: false NXF_SINGULARITY_CACHEDIR: ${{ github.workspace }}/.singularity NXF_SINGULARITY_LIBRARYDIR: ${{ github.workspace }}/.singularity @@ -19,12 +26,36 @@ concurrency: cancel-in-progress: true jobs: + changes: + name: Check for changes + runs-on: ubuntu-latest + outputs: + nf_test_files: ${{ steps.list.outputs.components }} + steps: + - uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: List nf-test files + id: list + uses: 
adamrtalbot/detect-nf-test-changes@v0.0.3 + with: + head: ${{ github.sha }} + base: origin/${{ github.base_ref }} + include: .github/include.yaml + + - name: print list of nf-test files + run: | + echo ${{ steps.list.outputs.components }} + test: - name: "Run pipeline with test data (${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }})" + name: "Test on (${{ matrix.NXF_VER }} | ${{ matrix.nf_test_files }} | ${{ matrix.profile }})" # Only run on push if this is the nf-core dev branch (merged PRs) - if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/fetchngs') }}" + if: "${{ needs.changes.outputs.nf_test_files != '[]' && (github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/fetchngs')) }}" + needs: [changes] runs-on: ubuntu-latest strategy: + fail-fast: false matrix: NXF_VER: - "24.04.2" @@ -33,8 +64,7 @@ jobs: - "conda" - "docker" - "singularity" - test_name: - - "test" + nf_test_files: ["${{ fromJson(needs.changes.outputs.nf_test_files) }}"] isMaster: - ${{ github.base_ref == 'master' }} # Exclude conda and singularity on dev @@ -52,6 +82,20 @@ jobs: with: version: "${{ matrix.NXF_VER }}" + - uses: nf-core/setup-nf-test@v1 + with: + version: "${{ env.NFT_VER }}" + + - uses: actions/setup-python@v4 + with: + python-version: "3.11" + architecture: "x64" + + - name: Install pdiff to see diff between nf-test snapshots + run: | + python -m pip install --upgrade pip + pip install pdiff + - name: Set up Apptainer if: matrix.profile == 'singularity' uses: eWaterCycle/setup-apptainer@main @@ -80,6 +124,42 @@ jobs: - name: Clean up Disk space uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - name: "Run pipeline with test data ${{ matrix.NXF_VER }} | ${{ matrix.test_name }} | ${{ matrix.profile }}" + - name: "Run nf-test ${{ matrix.NXF_VER }} | ${{ matrix.nf_test_files }} | ${{ matrix.profile }}" + run: | + nf-test test --verbose ${{ 
matrix.nf_test_files }} --profile "+${{ matrix.profile }}" --junitxml=test.xml --tap=test.tap + + - uses: pcolby/tap-summary@v1 + with: + path: >- + test.tap + + - name: Output log on failure + if: failure() + run: | + sudo apt install bat > /dev/null + batcat --decorations=always --color=always ${{ github.workspace }}/.nf-test/tests/*/meta/nextflow.log + + - name: Publish Test Report + uses: mikepenz/action-junit-report@v3 + if: always() # always run even if the previous step fails + with: + report_paths: test.xml + confirm-pass: + runs-on: ubuntu-latest + needs: + - changes + - test + if: always() + steps: + - name: All tests ok + if: ${{ !contains(needs.*.result, 'failure') }} + run: exit 0 + - name: One or more tests failed + if: ${{ contains(needs.*.result, 'failure') }} + run: exit 1 + + - name: debug-print + if: always() run: | - nextflow run ${GITHUB_WORKSPACE} -profile ${{ matrix.test_name }},${{ matrix.profile }} --outdir ./results + echo "toJSON(needs) = ${{ toJSON(needs) }}" + echo "toJSON(needs.*.result) = ${{ toJSON(needs.*.result) }}" diff --git a/.github/workflows/cloud_tests_full.yml b/.github/workflows/cloud_tests_full.yml new file mode 100644 index 00000000..b1100564 --- /dev/null +++ b/.github/workflows/cloud_tests_full.yml @@ -0,0 +1,146 @@ +name: nf-core cloud full size tests +# This workflow is triggered on PRs opened against the master branch. +# It can be additionally triggered manually with GitHub actions workflow dispatch button. 
+# It runs the -profile 'test_full' on Cloud +run-name: Submitting workflow to all cloud providers using full sized data +on: + pull_request: + branches: + - master + pull_request_review: + types: [submitted] + workflow_dispatch: + inputs: + platform: + description: "Platform to run test" + required: true + default: "all" + type: choice + options: + - all + - aws + - azure + - gcp +jobs: + run-full-tests-on-aws: + name: Run AWS full tests + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'aws' || !github.event.inputs }} + runs-on: ubuntu-latest + strategy: + matrix: + download_method: ["aspera", "ftp", "sratools"] + steps: + - uses: octokit/request-action@v2.x + id: check_approvals + with: + route: GET /repos/${{ github.repository }}/pulls/${{ github.event.pull_request.number }}/reviews + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - id: test_variables + if: github.event_name != 'workflow_dispatch' + run: | + JSON_RESPONSE='${{ steps.check_approvals.outputs.data }}' + CURRENT_APPROVALS_COUNT=$(echo $JSON_RESPONSE | jq -c '[.[] | select(.state | contains("APPROVED")) ] | length') + test $CURRENT_APPROVALS_COUNT -ge 2 || exit 1 # At least 2 approvals are required + - name: Launch workflow via Seqera Platform + uses: seqeralabs/action-tower-launch@v2 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AWS_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/fetchngs/work-${{ github.sha }}" + run_name: "aws_fetchngs_full_${{ matrix.download_method }}" + revision: ${{ github.sha }} + profiles: test_full + parameters: | + { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", + "download_method": "${{ matrix.download_method }}", + "outdir": "${{ secrets.TOWER_BUCKET_AWS }}/fetchngs/results-${{ github.sha }}/download_method_${{ matrix.download_method }}/" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log 
file + path: tower_action_*.log + + run-full-tests-on-azure: + name: Run Azure full tests + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'azure' || !github.event.inputs }} + runs-on: ubuntu-latest + strategy: + matrix: + download_method: ["aspera", "ftp", "sratools"] + steps: + - uses: octokit/request-action@v2.x + id: check_approvals + with: + route: GET /repos/${{ github.repository }}/pulls/${{ github.event.pull_request.number }}/reviews + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - id: test_variables + if: github.event_name != 'workflow_dispatch' + run: | + JSON_RESPONSE='${{ steps.check_approvals.outputs.data }}' + CURRENT_APPROVALS_COUNT=$(echo $JSON_RESPONSE | jq -c '[.[] | select(.state | contains("APPROVED")) ] | length') + test $CURRENT_APPROVALS_COUNT -ge 2 || exit 1 # At least 2 approvals are required + - name: Launch workflow via Seqera Platform + uses: seqeralabs/action-tower-launch@v2 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AZURE_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AZURE }}/work/fetchngs/work-${{ github.sha }}" + run_name: "azure_fetchngs_full_${{ matrix.download_method }}" + revision: ${{ github.sha }} + profiles: test_full + parameters: | + { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", + "download_method": "${{ matrix.download_method }}", + "outdir": "${{ secrets.TOWER_BUCKET_AZURE }}/fetchngs/results-${{ github.sha }}/download_method_${{ matrix.download_method }}/" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log + + run-full-tests-on-gcp: + name: Run GCP full tests + if: ${{ github.event.inputs.platform == 'gcp' || !github.event.inputs }} + runs-on: ubuntu-latest + strategy: + matrix: + download_method: ["aspera", "ftp", "sratools"] + steps: + - uses: octokit/request-action@v2.x + id: check_approvals + with: + route: 
GET /repos/${{ github.repository }}/pulls/${{ github.event.pull_request.number }}/reviews + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - id: test_variables + if: github.event_name != 'workflow_dispatch' + run: | + JSON_RESPONSE='${{ steps.check_approvals.outputs.data }}' + CURRENT_APPROVALS_COUNT=$(echo $JSON_RESPONSE | jq -c '[.[] | select(.state | contains("APPROVED")) ] | length') + test $CURRENT_APPROVALS_COUNT -ge 2 || exit 1 # At least 2 approvals are required + - name: Launch workflow via Seqera Platform + uses: seqeralabs/action-tower-launch@v2 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_GCP_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_GCP }}/work/fetchngs/work-${{ github.sha }}" + run_name: "gcp_fetchngs_full_${{ matrix.download_method }}" + revision: ${{ github.sha }} + profiles: test_full + parameters: | + { + "hook_url": "${{ secrets.MEGATESTS_ALERTS_SLACK_HOOK_URL }}", + "download_method": "${{ matrix.download_method }}", + "outdir": "${{ secrets.TOWER_BUCKET_GCP }}/fetchngs/results-${{ github.sha }}/download_method_${{ matrix.download_method }}/" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log diff --git a/.github/workflows/cloud_tests_small.yml b/.github/workflows/cloud_tests_small.yml new file mode 100644 index 00000000..9b593f11 --- /dev/null +++ b/.github/workflows/cloud_tests_small.yml @@ -0,0 +1,83 @@ +name: nf-core cloud tests +# This workflow can be triggered manually with the GitHub actions workflow dispatch button. 
+# It runs the -profile 'test' on cloud +run-name: Submitting workflow to all cloud providers using small sized data +on: + workflow_dispatch: + inputs: + platform: + description: "Platform to run test" + required: true + default: "all" + type: choice + options: + - all + - aws + - azure + - gcp +jobs: + run-small-tests-on-aws: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'aws' }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v2 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AWS_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AWS }}/work/fetchngs/work-${{ github.sha }}" + run_name: "aws_fetchngs_small" + revision: ${{ github.sha }} + profiles: test + parameters: | + { + "outdir": "${{ secrets.TOWER_BUCKET_AWS }}/fetchngs/results-test-${{ github.sha }}/" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log + + run-small-tests-on-azure: + if: ${{ github.event.inputs.platform == 'all' || github.event.inputs.platform == 'azure' }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v2 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_AZURE_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_AZURE }}/work/fetchngs/work-${{ github.sha }}" + run_name: "azure_fetchngs_small" + revision: ${{ github.sha }} + profiles: test + parameters: | + { + "outdir": "${{ secrets.TOWER_BUCKET_AZURE }}/fetchngs/results-test-${{ github.sha }}/" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log + + run-small-tests-on-gcp: + if: ${{ github.event.inputs.platform == 'gcp' }} + runs-on: ubuntu-latest + steps: + - uses: seqeralabs/action-tower-launch@v2 + with: + workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} + access_token: 
${{ secrets.TOWER_ACCESS_TOKEN }} + compute_env: ${{ secrets.TOWER_CE_GCP_CPU }} + workdir: "${{ secrets.TOWER_BUCKET_GCP }}/work/fetchngs/work-${{ github.sha }}" + run_name: "gcp_fetchngs_small" + revision: ${{ github.sha }} + profiles: test + parameters: | + { + "outdir": "${{ secrets.TOWER_BUCKET_GCP }}/fetchngs/results-test-${{ github.sha }}/" + } + - uses: actions/upload-artifact@v3 + with: + name: Tower debug log file + path: tower_action_*.log diff --git a/.gitignore b/.gitignore index a42ce016..72b55d5b 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,7 @@ testing/ testing* *.pyc null/ +.nf-test.log +nf-test +.nf-test* +test.xml diff --git a/.nf-core.yml b/.nf-core.yml index d9331fe1..aa2cf5e4 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -16,6 +16,9 @@ template: name: fetchngs org: nf-core outdir: . - skip_features: null + skip_features: + - fastqc + - igenomes + - multiqc version: 1.13.0dev update: null diff --git a/CHANGELOG.md b/CHANGELOG.md index 61d72915..fed14b50 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,334 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## v1.13.0dev - [date] +### Enhancements & fixes + +- [PR #299](https://github.com/nf-core/fetchngs/pull/299) - Template update for nf-core/tools v2.13.1 +- [PR #300](https://github.com/nf-core/fetchngs/pull/300) - Use file paths instead of tags for testing matrix, should make matrices more efficient +- [PR #303](https://github.com/nf-core/fetchngs/pull/303) - Update wget container for SRA_FASTQ_FTP from 1.20.1 to 1.21.4 +- [PR #305](https://github.com/nf-core/fetchngs/pull/305) - Update module sratools/prefetch for reliable download integrity check +- [PR #316](https://github.com/nf-core/fetchngs/pull/316) - Use nf-core/setup-nf-test to install nf-test from cache during CI/CD +- [PR #323](https://github.com/nf-core/fetchngs/pull/323) - Template update for nf-core/tools v3.0.2 + +### Software dependencies + +| Dependency | Old version | New 
version | +| ---------- | ----------- | ----------- | +| `wget` | 1.20.1 | 1.21.4 | +| `sratools` | 3.0.8 | 3.1.0 | + +## [[1.12.0](https://github.com/nf-core/fetchngs/releases/tag/1.12.0)] - 2024-02-29 + +### :warning: Major enhancements + +- The Aspera CLI was recently added to [Bioconda](https://anaconda.org/bioconda/aspera-cli) and we have added it as another way of downloading FastQ files in addition to the existing FTP and sra-tools support. In our limited benchmarks on all public Clouds we found ~50% speed-up in download times compared to FTP! FTP downloads will still be the default download method (i.e. `--download_method ftp`) but you can choose to use sra-tools or Aspera using `--download_method sratools` or `--download_method aspera`, respectively. We would love to have your feedback! +- The `--force_sratools_download` parameter has been deprecated in favour of using `--download_method ` to explicitly specify the download method; available options are `ftp`, `sratools` or `aspera`. +- Support for Synapse ids has been dropped in this release. We haven't had any feedback from users whether it is being used or not. Users can run earlier versions of the pipeline if required. +- We have significantly refactored and standardised the way we are using nf-test within this pipeline. This pipeline is now the current, best-practice implementation for nf-test usage on nf-core. We required a number of features to be added to nf-test and a huge shoutout to [Lukas Forer](https://github.com/lukfor) for entertaining our requests and implementing them within upstream :heart:! 
+ +### Credits + +Special thanks to the following for their contributions to the release: + +- [Adam Talbot](https://github.com/adamrtalbot) +- [Alexandru Mizeranschi](https://github.com/nicolae06) +- [Alexander Blaessle](https://github.com/alexblaessle) +- [Lukas Forer](https://github.com/lukfor) +- [Matt Niederhuber](https://github.com/mniederhuber) +- [Maxime Garcia](https://github.com/maxulysse) +- [Sateesh Peri](https://github.com/sateeshperi) +- [Sebastian Uhrig](https://github.com/suhrig) + +Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form. + +### Enhancements & fixes + +- [PR #238](https://github.com/nf-core/fetchngs/pull/238) - Resolved bug when prefetching large studies ([#236](https://github.com/nf-core/fetchngs/issues/236)) +- [PR #241](https://github.com/nf-core/fetchngs/pull/241) - Use wget instead of curl to download files from FTP ([#169](https://github.com/nf-core/fetchngs/issues/169), [#194](https://github.com/nf-core/fetchngs/issues/194)) +- [PR #242](https://github.com/nf-core/fetchngs/pull/242) - Template update for nf-core/tools v2.11 +- [PR #243](https://github.com/nf-core/fetchngs/pull/243) - Fixes for [PR #238](https://github.com/nf-core/fetchngs/pull/238) +- [PR #245](https://github.com/nf-core/fetchngs/pull/245) - Refactor nf-test CI and test and other pre-release fixes ([#233](https://github.com/nf-core/fetchngs/issues/233)) +- [PR #246](https://github.com/nf-core/fetchngs/pull/246) - Handle dark/light mode for logo in GitHub README properly +- [PR #248](https://github.com/nf-core/fetchngs/pull/248) - Update pipeline level test data path to use mirror on s3 +- [PR #249](https://github.com/nf-core/fetchngs/pull/249) - Update modules which includes absolute paths for test data, making module level test compatible within the pipeline. 
+- [PR #253](https://github.com/nf-core/fetchngs/pull/253) - Add implicit tags in nf-test files for simpler testing strategy +- [PR #257](https://github.com/nf-core/fetchngs/pull/257) - Template update for nf-core/tools v2.12 +- [PR #258](https://github.com/nf-core/fetchngs/pull/258) - Fixes for [PR #253](https://github.com/nf-core/fetchngs/pull/253) +- [PR #259](https://github.com/nf-core/fetchngs/pull/259) - Add Aspera CLI download support to pipeline ([#68](https://github.com/nf-core/fetchngs/issues/68)) +- [PR #261](https://github.com/nf-core/fetchngs/pull/261) - Revert sratools fasterqdump version ([#221](https://github.com/nf-core/fetchngs/issues/221)) +- [PR #262](https://github.com/nf-core/fetchngs/pull/262) - Use nf-test version v0.8.4 and remove implicit tags +- [PR #263](https://github.com/nf-core/fetchngs/pull/263) - Refine tags used for workflows +- [PR #264](https://github.com/nf-core/fetchngs/pull/264) - Remove synapse workflow from pipeline +- [PR #265](https://github.com/nf-core/fetchngs/pull/265) - Use "+" syntax for profiles to accumulate profiles in nf-test +- [PR #266](https://github.com/nf-core/fetchngs/pull/266) - Make .gitignore match template +- [PR #268](https://github.com/nf-core/fetchngs/pull/268) - Add mermaid diagram +- [PR #273](https://github.com/nf-core/fetchngs/pull/273) - Update utility subworkflows +- [PR #283](https://github.com/nf-core/fetchngs/pull/283) - Template update for nf-core/tools v2.13 +- [PR #288](https://github.com/nf-core/fetchngs/pull/288) - Update Github Action to run full-sized test for all 3 download methods +- [PR #290](https://github.com/nf-core/fetchngs/pull/290) - Remove mentions of deprecated Synapse functionality in pipeline +- [PR #294](https://github.com/nf-core/fetchngs/pull/294) - Replace mermaid diagram with subway map +- [PR #295](https://github.com/nf-core/fetchngs/pull/295) - Be less stringent with test expectations for CI +- [PR #296](https://github.com/nf-core/fetchngs/pull/296) - Remove 
params.outdir from tests where required and update snapshots +- [PR #298](https://github.com/nf-core/fetchngs/pull/298) - `export CONDA_PREFIX` into container when using Singularity and Apptainer + +### Software dependencies + +| Dependency | Old version | New version | +| ---------- | ----------- | ----------- | +| `wget` | | 1.20.1 | + +> **NB:** Dependency has been **updated** if both old and new version information is present. +> +> **NB:** Dependency has been **added** if just the new version information is present. +> +> **NB:** Dependency has been **removed** if new version information isn't present. + +### Parameters + +| Old parameter | New parameter | +| --------------------------- | ------------------- | +| | `--download_method` | +| `--input_type` | | +| `--force_sratools_download` | | +| `--synapse_config` | | + +> **NB:** Parameter has been **updated** if both old and new parameter information is present. +> **NB:** Parameter has been **added** if just the new parameter information is present. +> **NB:** Parameter has been **removed** if new parameter information isn't present. + +## [[1.11.0](https://github.com/nf-core/fetchngs/releases/tag/1.11.0)] - 2023-10-18 + +### Credits + +Special thanks to the following for their contributions to the release: + +- [Adam Talbot](https://github.com/adamrtalbot) +- [Edmund Miller](https://github.com/edmundmiller) +- [Esha Joshi](https://github.com/ejseqera) +- [Harshil Patel](https://github.com/drpatelh) +- [Lukas Forer](https://github.com/lukfor) +- [James Fellows Yates](https://github.com/jfy133) +- [Maxime Garcia](https://github.com/maxulysse) +- [Rob Syme](https://github.com/robsyme) +- [Sateesh Peri](https://github.com/sateeshperi) +- [Sebastian Schönherr](https://github.com/seppinho) + +Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form. 
+ +### Enhancements & fixes + +- [PR #188](https://github.com/nf-core/fetchngs/pull/188) - Use nf-test for all pipeline testing + +## [[1.10.1](https://github.com/nf-core/fetchngs/releases/tag/1.10.1)] - 2023-10-08 + +### Credits + +Special thanks to the following for their contributions to the release: + +- [Adam Talbot](https://github.com/adamrtalbot) +- [Davide Carlson](https://github.com/davidecarlson) +- [Harshil Patel](https://github.com/drpatelh) +- [Maxime Garcia](https://github.com/maxulysse) +- [MCMandR](https://github.com/MCMandR) +- [Rob Syme](https://github.com/robsyme) + +Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form. + +### Enhancements & fixes + +- [#173](https://github.com/nf-core/fetchngs/issues/173) - Add compatibility for sralite files +- [PR #205](https://github.com/nf-core/fetchngs/pull/205) - Rename all local modules, workflows and remove `public_aws_ecr` profile +- [PR #206](https://github.com/nf-core/fetchngs/pull/206) - CI improvements and code cleanup +- [PR #208](https://github.com/nf-core/fetchngs/pull/208) - Template update with nf-core/tools 2.10 + +### Software dependencies + +| Dependency | Old version | New version | +| ----------- | ----------- | ----------- | +| `sra-tools` | 2.11.0 | 3.0.8 | + +> **NB:** Dependency has been **updated** if both old and new version information is present. +> +> **NB:** Dependency has been **added** if just the new version information is present. +> +> **NB:** Dependency has been **removed** if new version information isn't present. + +## [[1.10.0](https://github.com/nf-core/fetchngs/releases/tag/1.10.0)] - 2023-05-16 + +### Credits + +Special thanks to the following for their contributions to the release: + +- [Adam Talbot](https://github.com/adamrtalbot) +- [Esha Joshi](https://github.com/ejseqera) +- [Maxime Garcia](https://github.com/maxulysse) +- [Moritz E. 
Beber](https://github.com/Midnighter) +- [Rob Syme](https://github.com/robsyme) +- [sirclockalot](https://github.com/sirclockalot) + +Thank you to everyone else that has contributed by reporting bugs, enhancements or in any other way, shape or form. + +### Enhancements & fixes + +- [#85](https://github.com/nf-core/fetchngs/issues/85) - Not able to fetch metadata for ERR ids associated with ArrayExpress +- [#104](https://github.com/nf-core/fetchngs/issues/104) - Add support back in for [GEO IDs](https://www.ncbi.nlm.nih.gov/geo) (removed in v1.7) +- [#129](https://github.com/nf-core/fetchngs/issues/129) - Pipeline is working with SRA run ids but failing with corresponding Biosample ids +- [#138](https://github.com/nf-core/fetchngs/issues/138) - Add support for downloading protected dbGAP data using a JWT file +- [#144](https://github.com/nf-core/fetchngs/issues/144) - Add support to download 10X Genomics data +- [PR #140](https://github.com/nf-core/fetchngs/pull/140) - Bumped modules version to allow for sratools download of sralite format files +- [PR #147](https://github.com/nf-core/fetchngs/pull/147) - Updated pipeline template to [nf-core/tools 2.8](https://github.com/nf-core/tools/releases/tag/2.8) +- [PR #148](https://github.com/nf-core/fetchngs/pull/148) - Fix default metadata fields for ENA API v2.0 +- [PR #150](https://github.com/nf-core/fetchngs/pull/150) - Add infrastructure and CI for multi-cloud full-sized tests run via Nextflow Tower +- [PR #157](https://github.com/nf-core/fetchngs/pull/157) - Add `public_aws_ecr.config` to source mulled containers when using `public.ecr.aws` Docker Biocontainer registry + +### Software dependencies + +| Dependency | Old version | New version | +| --------------- | ----------- | ----------- | +| `synapseclient` | 2.6.0 | 2.7.1 | + +> **NB:** Dependency has been **updated** if both old and new version information is present. +> +> **NB:** Dependency has been **added** if just the new version information is present. 
+> +> **NB:** Dependency has been **removed** if new version information isn't present. + +## [[1.9](https://github.com/nf-core/fetchngs/releases/tag/1.9)] - 2022-12-21 + +### Enhancements & fixes + +- Bumped minimum Nextflow version from `21.10.3` -> `22.10.1` +- Updated pipeline template to [nf-core/tools 2.7.2](https://github.com/nf-core/tools/releases/tag/2.7.2) +- Added support for generating nf-core/atacseq compatible samplesheets +- Added `--nf_core_rnaseq_strandedness` parameter to specify value for `strandedness` entry added to samplesheet created when using `--nf_core_pipeline rnaseq`. The default is `auto` which can be used with nf-core/rnaseq v3.10 onwards to auto-detect strandedness during the pipeline execution. + +## [[1.8](https://github.com/nf-core/fetchngs/releases/tag/1.8)] - 2022-11-08 + +### Enhancements & fixes + +- [#111](https://github.com/nf-core/fetchngs/issues/111) - Change input mimetype to csv +- [#114](https://github.com/nf-core/fetchngs/issues/114) - Final samplesheet is not created when `--skip_fastq_download` is provided +- [#118](https://github.com/nf-core/fetchngs/issues/118) - Allow input pattern validation for csv/tsv/txt +- [#119](https://github.com/nf-core/fetchngs/issues/119) - `--force_sratools_download` results in different fastq names compared to FTP download +- [#121](https://github.com/nf-core/fetchngs/issues/121) - Add `tower.yml` to render samplesheet as Report in Tower +- Fetch `SRR` and `DRR` metadata from ENA API instead of NCBI API to bypass frequent breaking changes +- Updated pipeline template to [nf-core/tools 2.6](https://github.com/nf-core/tools/releases/tag/2.6) + +## [[1.7](https://github.com/nf-core/fetchngs/releases/tag/1.7)] - 2022-07-01 + +### :warning: Major enhancements + +Support for GEO ids has been dropped in this release due to breaking changes introduced in the NCBI API. For more detailed information please see [this PR](https://github.com/nf-core/fetchngs/pull/102). 
+ +As a workaround, if you have a GEO accession you can directly download a text file containing the appropriate SRA ids to pass to the pipeline: + +- Search for your GEO accession on [GEO](https://www.ncbi.nlm.nih.gov/geo) +- Click `SRA Run Selector` at the bottom of the GEO accession page +- Select the desired samples in the `SRA Run Selector` and then download the `Accession List` + +This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline e.g. `--input SRR_Acc_List.txt`. + +### Enhancements & fixes + +- [#97](https://github.com/nf-core/fetchngs/pull/97) - Add support for generating nf-core/taxprofiler compatible samplesheets. +- [#99](https://github.com/nf-core/fetchngs/issues/99) - SRA_IDS_TO_RUNINFO fails due to bad request +- Add `enum` field for `--nf_core_pipeline` to parameter schema so that only supported pipelines are accepted + +## [[1.6](https://github.com/nf-core/fetchngs/releases/tag/1.6)] - 2022-05-17 + +- [#57](https://github.com/nf-core/fetchngs/pull/57) - fetchngs fails if FTP is blocked +- [#89](https://github.com/nf-core/fetchngs/pull/89) - Improve detection and usage of the NCBI user settings by using the standardized sra-tools modules from nf-core. +- [#93](https://github.com/nf-core/fetchngs/pull/93) - Adjust modules configuration to respect the `publish_dir_mode` parameter. 
+- [[nf-core/rnaseq#764](https://github.com/nf-core/rnaseq/issues/764)] - Test fails when using GCP due to missing tools in the basic biocontainer +- Updated pipeline template to [nf-core/tools 2.4.1](https://github.com/nf-core/tools/releases/tag/2.4.1) + +### Software dependencies + +| Dependency | Old version | New version | +| --------------- | ----------- | ----------- | +| `synapseclient` | 2.4.0 | 2.6.0 | + +## [[1.5](https://github.com/nf-core/fetchngs/releases/tag/1.5)] - 2021-12-01 + +- Finish porting the pipeline to the updated Nextflow DSL2 syntax adopted on nf-core/modules + - Bump minimum Nextflow version from `21.04.0` -> `21.10.3` + - Removed `--publish_dir_mode` as it is no longer required for the new syntax + +### Enhancements & fixes + +## [[1.4](https://github.com/nf-core/fetchngs/releases/tag/1.4)] - 2021-11-09 + +### Enhancements & fixes + +- Convert pipeline to updated Nextflow DSL2 syntax for future adoption across nf-core +- Added a workflow to download FastQ files and to create samplesheets for ids from the [Synapse platform](https://www.synapse.org/) hosted by [Sage Bionetworks](https://sagebionetworks.org/). +- SRA identifiers not available for direct download via the ENA FTP will now be downloaded via [`sra-tools`](https://github.com/ncbi/sra-tools). +- Added `--force_sratools_download` parameter to preferentially download all FastQ files via `sra-tools` instead of ENA FTP. +- Correctly handle errors from SRA identifiers that do **not** return metadata, for example, due to being private. +- Retry an error in prefetch via bash script in order to allow it to resume interrupted downloads. +- Name output FastQ files by `{EXP_ACC}_{RUN_ACC}*fastq.gz` instead of `{EXP_ACC}_{T*}*fastq.gz` for run id provenance +- [[#46](https://github.com/nf-core/fetchngs/issues/46)] - Bug in sra_ids_to_runinfo.py +- Added support for [DDBJ ids](https://www.ddbj.nig.ac.jp/index-e.html). 
See examples below: + +| `DDBJ` | +| ------------ | +| PRJDB4176 | +| SAMD00114846 | +| DRA008156 | +| DRP004793 | +| DRR171822 | +| DRS090921 | +| DRX162434 | + +## [[1.3](https://github.com/nf-core/fetchngs/releases/tag/1.3)] - 2021-09-15 + +### Enhancements & fixes + +- Replaced Python `requests` with `urllib` to fetch ENA metadata + +### Software dependencies + +| Dependency | Old version | New version | +| ---------- | ----------- | ----------- | +| `python` | 3.8.3 | 3.9.5 | + +## [[1.2](https://github.com/nf-core/fetchngs/releases/tag/1.2)] - 2021-07-28 + +### Enhancements & fixes + +- Updated pipeline template to [nf-core/tools 2.1](https://github.com/nf-core/tools/releases/tag/2.1) +- [[#26](https://github.com/nf-core/fetchngs/pull/26)] - Update broken EBI API URL + +## [[1.1](https://github.com/nf-core/fetchngs/releases/tag/1.1)] - 2021-06-22 + +### Enhancements & fixes + +- [[#12](https://github.com/nf-core/fetchngs/issues/12)] - Error when using singularity - /etc/resolv.conf doesn't exist in container +- Added `--sample_mapping_fields` parameter to create a separate `id_mappings.csv` and `multiqc_config.yml` with selected fields that can be used to rename samples in general and in [MultiQC](https://multiqc.info/docs/#bulk-sample-renaming) + +## [[1.0](https://github.com/nf-core/fetchngs/releases/tag/1.0)] - 2021-06-08 + Initial release of nf-core/fetchngs, created with the [nf-core](https://nf-co.re/) template. -### `Added` +## Pipeline summary + +Via a single file of ids, provided one-per-line the pipeline performs the following steps: + +1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) +2. Fetch extensive id metadata including direct download links to FastQ files via ENA API +3. Download FastQ files in parallel via `curl` and perform `md5sum` check +4. 
Collate id metadata and paths to FastQ files in a single samplesheet -### `Fixed` +## Supported database ids -### `Dependencies` +Currently, the following types of example identifiers are supported: -### `Deprecated` +| `SRA` | `ENA` | `GEO` | +| ------------ | ------------ | ---------- | +| SRR11605097 | ERR4007730 | GSM4432381 | +| SRX8171613 | ERX4009132 | GSE147507 | +| SRS6531847 | ERS4399630 | | +| SAMN14689442 | SAMEA6638373 | | +| SRP256957 | ERP120836 | | +| SRA1068758 | ERA2420837 | | +| PRJNA625551 | PRJEB37513 | | diff --git a/CITATIONS.md b/CITATIONS.md index 677129dc..62235e72 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,15 +10,33 @@ ## Pipeline tools -- [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/) +- [Aspera CLI](https://github.com/IBM/aspera-cli) -> Andrews, S. (2010). FastQC: A Quality Control Tool for High Throughput Sequence Data [Online]. +- [Python](http://www.python.org) -- [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) +- [Requests](https://docs.python-requests.org/) -> Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +- [sra-tools](https://github.com/ncbi/sra-tools) -## Software packaging/containerisation tools +## Pipeline resources + +- [ENA](https://pubmed.ncbi.nlm.nih.gov/33175160/) + + > Harrison PW, Ahamed A, Aslam R, Alako BTF, Burgin J, Buso N, Courtot M, Fan J, Gupta D, Haseeb M, Holt S, Ibrahim T, Ivanov E, Jayathilaka S, Kadhirvelu VB, Kumar M, Lopez R, Kay S, Leinonen R, Liu X, O'Cathail C, Pakseresht A, Park Y, Pesant S, Rahman N, Rajan J, Sokolov A, Vijayaraja S, Waheed Z, Zyoud A, Burdett T, Cochrane G. The European Nucleotide Archive in 2020. Nucleic Acids Res. 2021 Jan 8;49(D1):D82-D85. doi: 10.1093/nar/gkaa1028. 
PubMed PMID: 33175160; PubMed Central PMCID: PMC7778925. + +- [SRA](https://pubmed.ncbi.nlm.nih.gov/21062823/) + + > Leinonen R, Sugawara H, Shumway M, International Nucleotide Sequence Database Collaboration. The sequence read archive. Nucleic Acids Res. 2011 Jan;39 (Database issue):D19-21. doi: 10.1093/nar/gkq1019. Epub 2010 Nov 9. PubMed PMID: 21062823; PubMed Central PMCID: PMC3013647. + +- [DDBJ](https://pubmed.ncbi.nlm.nih.gov/33156332/) + + > Fukuda A, Kodama Y, Mashima J, Fujisawa T, Ogasawara O. DDBJ update: streamlining submission and access of human data. Nucleic Acids Res. 2021 Jan 8;49(D1):D71-D75. doi: 10.1093/nar/gkaa982. PubMed PMID: 33156332; PubMed Central PMCID: PMC7779041. + +- [GEO](https://pubmed.ncbi.nlm.nih.gov/23193258/) + + > Barrett T, Wilhite SE, Ledoux P, Evangelista C, Kim IF, Tomashevsky M, Marshall KA, Phillippy KH, Sherman PM, Holko M, Yefanov A, Lee H, Zhang N, Robertson CL, Serova N, Davis S, Soboleva A. NCBI GEO: archive for functional genomics data sets--update. Nucleic Acids Res. 2013 Jan;41(Database issue):D991-5. doi: 10.1093/nar/gks1193. Epub 2012 Nov 27. PubMed PMID: 23193258; PubMed Central PMCID: PMC3531084. + +## Software packaging/containerisation/testing tools - [Anaconda](https://anaconda.com) @@ -36,6 +54,8 @@ > Merkel, D. (2014). Docker: lightweight linux containers for consistent development and deployment. Linux Journal, 2014(239), 2. doi: 10.5555/2600239.2600241. +- [nf-test](https://code.askimed.com/nf-test) + - [Singularity](https://pubmed.ncbi.nlm.nih.gov/28494014/) > Kurtzer GM, Sochat V, Bauer MW. Singularity: Scientific containers for mobility of compute. PLoS One. 2017 May 11;12(5):e0177459. doi: 10.1371/journal.pone.0177459. eCollection 2017. PubMed PMID: 28494014; PubMed Central PMCID: PMC5426675. 
diff --git a/README.md b/README.md index c9da5de7..4410c6e2 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,7 @@ [![GitHub Actions CI Status](https://github.com/nf-core/fetchngs/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/fetchngs/actions/workflows/ci.yml) -[![GitHub Actions Linting Status](https://github.com/nf-core/fetchngs/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/fetchngs/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/fetchngs/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) -[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) +[![GitHub Actions Linting Status](https://github.com/nf-core/fetchngs/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/fetchngs/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/fetchngs/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.5070524-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.5070524)[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A524.04.2-23aa62.svg)](https://www.nextflow.io/) [![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/) @@ -19,50 +18,36 @@ ## Introduction -**nf-core/fetchngs** is a bioinformatics pipeline that ... +**nf-core/fetchngs** is a bioinformatics pipeline to fetch metadata and raw FastQ files from public databases. At present, the pipeline supports SRA / ENA / DDBJ / GEO ids (see [usage docs](https://nf-co.re/fetchngs/usage#introduction)). 
- - - - - -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +![nf-core/fetchngs metro map](docs/images/nf-core-fetchngs_metro_map_grey.png) ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - +Each line represents a database id. Please see next section for supported ids. Now, you can run the pipeline using: - - ```bash nextflow run nf-core/fetchngs \ -profile \ - --input samplesheet.csv \ + --input ids.csv \ --outdir ``` @@ -71,19 +56,37 @@ nextflow run nf-core/fetchngs \ For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/fetchngs/usage) and the [parameter documentation](https://nf-co.re/fetchngs/parameters). +## Supported ids + +Via a single file of ids, provided one-per-line (see [example input file](https://raw.githubusercontent.com/nf-core/test-datasets/fetchngs/sra_ids_test.csv)) the pipeline performs the following steps: + +### SRA / ENA / DDBJ / GEO ids + +1. Resolve database ids back to appropriate experiment-level ids and to be compatible with the [ENA API](https://ena-docs.readthedocs.io/en/latest/retrieval/programmatic-access.html) +2. Fetch extensive id metadata via ENA API +3. Download FastQ files: + - If direct download links are available from the ENA API: + - Fetch in parallel via `wget` and perform `md5sum` check (`--download_method ftp`; default). + - Fetch in parallel via `aspera-cli` and perform `md5sum` check. Use `--download_method aspera` to force this behaviour. + - Otherwise use [`sra-tools`](https://github.com/ncbi/sra-tools) to download `.sra` files and convert them to FastQ. 
Use `--download_method sratools` to force this behaviour. +4. Collate id metadata and paths to FastQ files in a single samplesheet + ## Pipeline output -To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/fetchngs/results) tab on the nf-core website pipeline page. +The columns in the output samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines (see [usage docs](https://nf-co.re/fetchngs/usage#samplesheet-format)), these currently include: + +- [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) +- [nf-core/atacseq](https://nf-co.re/atacseq/usage#samplesheet-input) +- Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format) +- [nf-core/taxprofiler](https://nf-co.re/nf-core/taxprofiler) + +To see the results of a test run with a full size dataset refer to the [results](https://nf-co.re/fetchngs/results) tab on the nf-core website pipeline page. For more details about the output files and reports, please refer to the [output documentation](https://nf-co.re/fetchngs/output). ## Credits -nf-core/fetchngs was originally written by Harshil Patel, Moritz E. Beber and Jose Espinosa-Carrasco. - -We thank the following people for their extensive assistance in the development of this pipeline: - - +nf-core/fetchngs was originally written by Harshil Patel ([@drpatelh](https://github.com/drpatelh)) from [Seqera Labs, Spain](https://seqera.io/) and Jose Espinosa-Carrasco ([@JoseEspinosa](https://github.com/JoseEspinosa)) from [The Comparative Bioinformatics Group](https://www.crg.eu/en/cedric_notredame) at [The Centre for Genomic Regulation, Spain](https://www.crg.eu/). Support for download of sequencing reads without FTP links via sra-tools was added by Moritz E. Beber ([@Midnighter](https://github.com/Midnighter)) from [Unseen Bio ApS, Denmark](https://unseenbio.com). 
## Contributions and Support @@ -93,10 +96,7 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations - - - - +If you use nf-core/fetchngs for your analysis, please cite it using the following doi: [10.5281/zenodo.5070524](https://doi.org/10.5281/zenodo.5070524) An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index 49a0a9fd..469f4a46 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,8 +3,6 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/fetchngs Methods Description" section_href: "https://github.com/nf-core/fetchngs" plot_type: "html" -## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline -## You inject any metadata in the Nextflow '${workflow}' object data: |

Methods

Data was processed using nf-core/fetchngs v${workflow.manifest.version} ${doi_text} of the nf-core collection of workflows (Ewels et al., 2020), utilising reproducible software environments from the Bioconda (Grüning et al., 2018) and Biocontainers (da Veiga Leprevost et al., 2017) projects.

diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml deleted file mode 100644 index 94115f0c..00000000 --- a/assets/multiqc_config.yml +++ /dev/null @@ -1,15 +0,0 @@ -report_comment: > - This report has been generated by the nf-core/fetchngs - analysis pipeline. For information about how to interpret these results, please see the - documentation. -report_section_order: - "nf-core-fetchngs-methods-description": - order: -1000 - software_versions: - order: -1001 - "nf-core-fetchngs-summary": - order: -1002 - -export_plots: true - -disable_version_detection: true diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv deleted file mode 100644 index 5f653ab7..00000000 --- a/assets/samplesheet.csv +++ /dev/null @@ -1,3 +0,0 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, diff --git a/assets/schema_input.json b/assets/schema_input.json index 51b251dd..4f17c242 100644 --- a/assets/schema_input.json +++ b/assets/schema_input.json @@ -7,27 +7,11 @@ "items": { "type": "object", "properties": { - "sample": { + "": { "type": "string", - "pattern": "^\\S+$", - "errorMessage": "Sample name must be provided and cannot contain spaces", - "meta": ["id"] - }, - "fastq_1": { - "type": "string", - "format": "file-path", - "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 1 must be provided, cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" - }, - "fastq_2": { - "type": "string", - "format": "file-path", - "exists": true, - "pattern": "^\\S+\\.f(ast)?q\\.gz$", - "errorMessage": "FastQ file for reads 2 cannot contain spaces and must have extension '.fq.gz' or '.fastq.gz'" + "pattern": "^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM]))(\\d+)$", + "errorMessage": "Please provide a valid SRA, ENA, DDBJ or GEO 
identifier" } - }, - "required": ["sample", "fastq_1"] + } } } diff --git a/assets/sendmail_template.txt b/assets/sendmail_template.txt index 93001cae..bbb56b38 100644 --- a/assets/sendmail_template.txt +++ b/assets/sendmail_template.txt @@ -25,29 +25,4 @@ Content-Disposition: inline; filename="nf-core-fetchngs_logo_light.png" flatten(). join( '\n' ) %> -<% -if (mqcFile){ -def mqcFileObj = new File("$mqcFile") -if (mqcFileObj.length() < mqcMaxSize){ -out << """ ---nfcoremimeboundary -Content-Type: text/html; name=\"multiqc_report\" -Content-Transfer-Encoding: base64 -Content-ID: -Content-Disposition: attachment; filename=\"${mqcFileObj.getName()}\" - -${mqcFileObj. - bytes. - encodeBase64(). - toString(). - tokenize( '\n' )*. - toList()*. - collate( 76 )*. - collect { it.join() }. - flatten(). - join( '\n' )} -""" -}} -%> - --nfcoremimeboundary-- diff --git a/bin/multiqc_mappings_config.py b/bin/multiqc_mappings_config.py new file mode 100755 index 00000000..3ffe35ec --- /dev/null +++ b/bin/multiqc_mappings_config.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python + +import sys + +with open(sys.argv[1], "r") as fin, open(sys.argv[2], "w") as fout: + header = fin.readline().split(",") + config = "sample_names_rename_buttons:\n" + config += "\n".join([" - " + x.strip('"') for x in header]) + config += "sample_names_rename:\n" + rename = [] + for line in fin: + rename.append(f" - [{', '.join(line.strip().split(','))}]") + fout.write(config + "\n".join(sorted(rename)) + "\n") diff --git a/bin/sra_ids_to_runinfo.py b/bin/sra_ids_to_runinfo.py new file mode 100755 index 00000000..0ffafba8 --- /dev/null +++ b/bin/sra_ids_to_runinfo.py @@ -0,0 +1,498 @@ +#!/usr/bin/env python + + +import argparse +import cgi +import csv +import gzip +import logging +import os +import re +import sys +import zlib +from pathlib import Path +from urllib.error import HTTPError, URLError +from urllib.parse import urlencode +from urllib.request import urlopen +import json +import time + +logger = 
logging.getLogger() + + +# Example ids supported by this script +SRA_IDS = ( + "PRJNA63463", + "SAMN00765663", + "SRA023522", + "SRP003255", + "SRR390278", + "SRS282569", + "SRX111814", +) +ENA_IDS = ( + "PRJEB7743", + "SAMEA3121481", + "ERA2421642", + "ERP120836", + "ERR674736", + "ERS4399631", + "ERX629702", +) +DDBJ_IDS = ( + "PRJDB4176", + "SAMD00114846", + "DRA008156", + "DRP004793", + "DRR171822", + "DRS090921", + "DRX162434", +) +GEO_IDS = ("GSE18729", "GSM465244") +ID_REGEX = re.compile(r"^([A-Z]+)([0-9]+)$") +PREFIX_LIST = sorted({ID_REGEX.match(id).group(1) for id in SRA_IDS + ENA_IDS + DDBJ_IDS + GEO_IDS}) + + +# List of metadata fields fetched from the ENA API - can be overriden by options +# `-ef` or `--ena_metadata_fields`. +# Full list of accepted fields can be obtained here: +# https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run +ENA_METADATA_FIELDS = ( + "run_accession", + "experiment_accession", + "sample_accession", + "secondary_sample_accession", + "study_accession", + "secondary_study_accession", + "submission_accession", + "run_alias", + "experiment_alias", + "sample_alias", + "study_alias", + "library_layout", + "library_selection", + "library_source", + "library_strategy", + "library_name", + "instrument_model", + "instrument_platform", + "base_count", + "read_count", + "tax_id", + "scientific_name", + "sample_title", + "experiment_title", + "study_title", + "sample_description", + "fastq_md5", + "fastq_bytes", + "fastq_ftp", + "fastq_galaxy", + "fastq_aspera", +) + + +class Response: + """ + Define an HTTP response class. + + This class should not have to be instantiated directly. + + Attributes: + status (int): The numeric HTTP status code of the response. + reason (str): The response's reason phrase. + body (bytes): The response's decompressed body content as bytes. + + Methods: + text: The response's body as a decoded string. 
+ + """ + + def __init__(self, *, response, **kwargs): + """ + Initialize an HTTP response object. + + Args: + response (http.client.HTTPResponse): A standard library response object + that is wrapped by this class. + **kwargs: Passed to parent classes. + + """ + super().__init__(**kwargs) + self._response = response + # Immediately read the body while the response context is still available. + self._raw = self._response.read() + self._content = None + + def _decompress(self): + """Decompress the response body if necessary.""" + method = self._response.getheader("Content-Encoding", "") + if not method: + self._content = self._raw + return + if method == "gzip": + self._content = gzip.decompress(self._raw) + elif method == "deflate": + self._content = zlib.decompress(self._raw) + else: + raise ValueError(f"Unsupported compression: {method}") + + @property + def status(self): + """Get the response's HTTP status code.""" + return self._response.status + + @property + def reason(self): + """Get the response's reason phrase.""" + return self._response.reason + + @property + def body(self): + """Get the response's decompressed body content as bytes.""" + if self._content is None: + self._decompress() + return self._content + + def text(self, encoding=None): + """Return the response's body as a decoded string.""" + if encoding is None: + _, params = cgi.parse_header(self._response.getheader("Content-Type", "")) + encoding = params.get("charset", "utf-8") + return self.body.decode(encoding) + + +class DatabaseIdentifierChecker: + """Define a service class for validating database identifiers.""" + + _VALID_PREFIXES = frozenset(PREFIX_LIST) + + @classmethod + def is_valid(cls, identifier): + """ + Check the validity of the given database identifier. + + Args: + identifier (str): A short identifier presumably belonging to one of the + supported databases. + + Returns: + bool: Whether or not the identifier is valid. 
+ + """ + match = ID_REGEX.match(identifier) + if match is None: + return False + return match.group(1) in cls._VALID_PREFIXES + + +class DatabaseResolver: + """Define a service class for resolving various identifiers to experiments.""" + + _GEO_GSM_PREFIXES = {"GSM"} + _GEO_GSE_PREFIXES = {"GDS", "GSE"} + _SRA_PREFIXES = { + "DRA", + "DRP", + "DRS", + "DRX", + "PRJDB", + "SAMD", + } + _ENA_PREFIXES = {"ERR", "SRR", "SAMN", "DRR"} + + @classmethod + def expand_identifier(cls, identifier): + """ + Expand the given identifier to potentially multiple experiment identifiers. + + Args: + identifier (str): A short identifier presumably belonging to one of the + supported databases. + + Returns: + list: A list of one or more SRA/ENA experiment identifiers. + + """ + prefix = ID_REGEX.match(identifier).group(1) + if prefix in cls._GEO_GSM_PREFIXES: + return cls._gsm_to_srx(identifier) + elif prefix in cls._GEO_GSE_PREFIXES: + return cls._gse_to_srx(identifier) + elif prefix in cls._SRA_PREFIXES: + return cls._id_to_srx(identifier) + elif prefix in cls._ENA_PREFIXES: + return cls._id_to_erx(identifier) + else: + return [identifier] + + @classmethod + def _content_check(cls, response, identifier): + """Check that the response has content or terminate.""" + if response.status == 204: + logger.error(f"There is no content for id {identifier}. 
Maybe you lack the right " f"permissions?") + sys.exit(1) + + @classmethod + def _id_to_srx(cls, identifier): + """Resolve the identifier to SRA experiments.""" + params = {"id": identifier, "db": "sra", "rettype": "runinfo", "retmode": "text"} + response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?{urlencode(params)}") + cls._content_check(response, identifier) + return [row["Experiment"] for row in open_table(response, delimiter=",")] + + @classmethod + def _gsm_to_srx(cls, identifier): + """Resolve the GEO identifier to SRA experiments.""" + ids = [] + params = {"term": identifier, "db": "sra", "retmode": "json"} + response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{urlencode(params)}") + cls._content_check(response, identifier) + r_json = json.loads(response.text()) + gsm_ids = r_json["esearchresult"]["idlist"] + for gsm_id in gsm_ids: + ids += cls._id_to_srx(gsm_id) + return ids + + @classmethod + def _gds_to_gsm(cls, identifier): + """Resolve the GEO UIDs to GSM IDs to then resolve to SRA IDs.""" + ids = [] + params = {"id": identifier, "db": "gds", "retmode": "json", "retmax": 10} + response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?{urlencode(params)}") + cls._content_check(response, identifier) + r_json = json.loads(response.text()) + + for each in r_json["result"][identifier]["samples"][0:]: + ids += cls._gsm_to_srx(each["accession"]) + return ids + + @classmethod + def _gse_to_srx(cls, identifier): + """Resolve the GSE identifier to GEO UIDs.""" + ids = [] + params = {"term": identifier, "db": "gds", "retmode": "json"} + response = fetch_url(f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?{urlencode(params)}") + cls._content_check(response, identifier) + r_json = json.loads(response.text()) + gds_uids = r_json["esearchresult"]["idlist"] + for gds_uid in gds_uids: + ids += cls._gds_to_gsm(gds_uid) + return ids + + @classmethod + def _id_to_erx(cls, 
identifier): + """Resolve the identifier to ENA experiments.""" + fields = ["run_accession", "experiment_accession"] + params = { + "accession": identifier, + "result": "read_run", + "fields": ",".join(fields), + } + response = fetch_url(f"https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}") + cls._content_check(response, identifier) + return [row["experiment_accession"] for row in open_table(response, delimiter="\t")] + + +class ENAMetadataFetcher: + """Define a service class for fetching metadata from ENA.""" + + def __init__(self, ena_metadata_fields, **kwargs): + """ + Initialize the service with the desired metadata fields. + + Args: + ena_metadata_fields (iterable): An iterable of the desired fields. + **kwargs: Passed to parent constructor. + """ + super().__init__(**kwargs) + self._params = {"result": "read_run", "fields": ",".join(ena_metadata_fields)} + + def open_experiment_table(self, accession): + """ + Open the metadata table belonging to the given experiment accession. + + Args: + accession (str): An ENA experiment accession. + + Returns: + csv.DictReader: A CSV reader instance of the metadata. + + """ + params = {**self._params, "accession": accession} + response = fetch_url(f"https://www.ebi.ac.uk/ena/portal/api/filereport?{urlencode(params)}") + self._content_check(response, accession) + return open_table(response, delimiter="\t") + + @classmethod + def _content_check(cls, response, identifier): + """Check that the response has content or terminate.""" + if response.status == 204: + logger.error(f"There is no content for id {identifier}. Maybe you lack the right " f"permissions?") + sys.exit(1) + + +def open_table(response, delimiter=","): + """ + Return a CSV reader instance from the given response. + + Args: + response (Response): An instance of the local HTTP response class. + delimiter (str): The delimiter separating the table fields. + + Returns: + csv.DictReader: A CSV reader instance of the response body. 
+ + """ + return csv.DictReader(response.text().splitlines(), delimiter=delimiter) + + +def parse_args(args=None): + parser = argparse.ArgumentParser( + description="Download and create a run information metadata file from SRA / " "ENA / DDBJ / GEO identifiers.", + epilog="Example usage: python fetch_sra_runinfo.py ", + ) + parser.add_argument( + "file_in", + metavar="FILE_IN", + type=Path, + help="File containing database identifiers, one per line.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Output file in tab-delimited format.", + ) + parser.add_argument( + "-ef", + "--ena_metadata_fields", + type=str, + default=",".join(ENA_METADATA_FIELDS), + help=f"Comma-separated list of ENA metadata fields to fetch " f"(default: {','.join(ENA_METADATA_FIELDS)}).", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(args) + + +def validate_fields_parameter(param, valid_vals, param_desc): + if not param: + return [] + user_vals = param.split(",") + if len(set(user_vals) & set(valid_vals)) == len(user_vals): + return user_vals + else: + invalid_vals = [x for x in user_vals if x not in valid_vals] + logger.error( + f"Please provide a valid value for {param_desc}!\n" + f"Provided values = {param}\n" + f"Accepted values = {','.join(valid_vals)}\n" + f"The following values are invalid: {','.join(invalid_vals)}\n" + ) + sys.exit(1) + + +def fetch_url(url): + """Return a response object for the given URL and handle errors appropriately.""" + sleep_time = 5 # Hardcode sleep duration in seconds + max_num_attempts = 3 # Hardcode max number of request attempts + attempt = 0 + + try: + with urlopen(url) as response: + return Response(response=response) + + except HTTPError as e: + if e.status == 429: + # If the response is 429, sleep and retry + if "Retry-After" in e.headers: + 
retry_after = int(e.headers["Retry-After"]) + logging.warning(f"Received 429 response from server. Retrying after {retry_after} seconds...") + time.sleep(retry_after) + else: + logging.warning(f"Received 429 response from server. Retrying in {sleep_time} seconds...") + time.sleep(sleep_time) + sleep_time *= 2 # Increment sleep time + attempt += 1 + return fetch_url(url) # Recursive call to retry request + + elif e.status == 500: + # If the response is 500, sleep and retry max 3 times + if attempt <= max_num_attempts: + logging.warning(f"Received 500 response from server. Retrying in {sleep_time} seconds...") + time.sleep(sleep_time) + sleep_time *= 2 + attempt += 1 + return fetch_url(url) + else: + logging.error("Exceeded max request attempts. Exiting.") + sys.exit(1) + + except URLError as e: + logger.error("We failed to reach a server.") + logger.error(f"Reason: {e.reason}") + sys.exit(1) + + +def get_ena_fields(): + params = {"dataPortal": "ena", "format": "tsv", "result": "read_run"} + return [ + row["columnId"] + for row in open_table( + fetch_url(f"https://www.ebi.ac.uk/ena/portal/api/returnFields?{urlencode(params)}"), + delimiter="\t", + ) + ] + + +def fetch_sra_runinfo(file_in, file_out, ena_metadata_fields): + seen_ids = set() + run_ids = set() + ena_fetcher = ENAMetadataFetcher(ena_metadata_fields) + with open(file_in, "r") as fin, open(file_out, "w") as fout: + writer = csv.DictWriter(fout, fieldnames=ena_metadata_fields, delimiter="\t") + writer.writeheader() + for line in fin: + db_id = line.strip() + if db_id in seen_ids: + continue + seen_ids.add(db_id) + if not DatabaseIdentifierChecker.is_valid(db_id): + id_str = ", ".join([x + "*" for x in PREFIX_LIST]) + logger.error(f"Please provide a valid database id starting with {id_str}!\n" f"Line: '{line.strip()}'") + sys.exit(1) + ids = DatabaseResolver.expand_identifier(db_id) + if not ids: + logger.error(f"No matches found for database id {db_id}!\nLine: '{line.strip()}'") + sys.exit(1) + for accession 
in ids: + for row in ena_fetcher.open_experiment_table(accession): + run_accession = row["run_accession"] + if run_accession not in run_ids: + writer.writerow(row) + run_ids.add(run_accession) + + +def main(args=None): + args = parse_args(args) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + if not args.file_in.is_file(): + logger.error(f"The given input file {args.file_in} was not found!") + sys.exit(1) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + ena_metadata_fields = validate_fields_parameter( + args.ena_metadata_fields, + valid_vals=get_ena_fields(), + param_desc="--ena_metadata_fields", + ) + fetch_sra_runinfo(args.file_in, args.file_out, ena_metadata_fields) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/bin/sra_runinfo_to_ftp.py b/bin/sra_runinfo_to_ftp.py new file mode 100755 index 00000000..ef80ec80 --- /dev/null +++ b/bin/sra_runinfo_to_ftp.py @@ -0,0 +1,162 @@ +#!/usr/bin/env python + + +import argparse +import csv +import logging +import sys +from itertools import chain +from pathlib import Path + + +logger = logging.getLogger() + + +def parse_args(args=None): + Description = "Create samplesheet with FTP download links and md5ums from sample information obtained via 'sra_ids_to_runinfo.py' script." 
+ Epilog = "Example usage: python sra_runinfo_to_ftp.py " + + parser = argparse.ArgumentParser(description=Description, epilog=Epilog) + parser.add_argument( + "files_in", + metavar="FILES_IN", + help="Comma-separated list of metadata file created from 'sra_ids_to_runinfo.py' script.", + ) + parser.add_argument( + "file_out", + metavar="FILE_OUT", + type=Path, + help="Output file containing paths to download FastQ files along with their associated md5sums.", + ) + parser.add_argument( + "-l", + "--log-level", + help="The desired log level (default WARNING).", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG"), + default="WARNING", + ) + return parser.parse_args(args) + + +def valid_fastq_extension(fastq): + return fastq.endswith("fastq.gz") + + +def parse_sra_runinfo(file_in): + runinfo = {} + columns = [ + "run_accession", + "experiment_accession", + "library_layout", + "fastq_ftp", + "fastq_md5", + ] + extensions = [ + "fastq_1", + "fastq_2", + "md5_1", + "md5_2", + "single_end", + ] + with open(file_in, "r", newline="") as fin: + reader = csv.DictReader(fin, delimiter="\t", skipinitialspace=True) + header = list(reader.fieldnames) + if missing := frozenset(columns).difference(frozenset(header)): + logger.critical(f"The following expected columns are missing from {file_in}: " f"{', '.join(missing)}.") + sys.exit(1) + for row in reader: + db_id = row["experiment_accession"] + if row["fastq_ftp"]: + fq_files = row["fastq_ftp"].split(";")[-2:] + fq_md5 = row["fastq_md5"].split(";")[-2:] + if len(fq_files) == 1: + assert fq_files[0].endswith(".fastq.gz"), f"Unexpected FastQ file format {file_in.name}." 
+ if row["library_layout"] != "SINGLE": + logger.warning(f"The library layout '{row['library_layout']}' should be " f"'SINGLE'.") + sample = { + "fastq_1": fq_files[0], + "fastq_2": None, + "md5_1": fq_md5[0], + "md5_2": None, + "single_end": "true", + } + elif len(fq_files) == 2: + assert fq_files[0].endswith("_1.fastq.gz"), f"Unexpected FastQ file format {file_in.name}." + assert fq_files[1].endswith("_2.fastq.gz"), f"Unexpected FastQ file format {file_in.name}." + if row["library_layout"] != "PAIRED": + logger.warning(f"The library layout '{row['library_layout']}' should be " f"'PAIRED'.") + sample = { + "fastq_1": fq_files[0], + "fastq_2": fq_files[1], + "md5_1": fq_md5[0], + "md5_2": fq_md5[1], + "single_end": "false", + } + else: + raise RuntimeError(f"Unexpected number of FastQ files: {fq_files}.") + else: + # In some instances, FTP links don't exist for FastQ files. + # These have to be downloaded with the run accession using sra-tools. + sample = dict.fromkeys(extensions, None) + if row["library_layout"] == "SINGLE": + sample["single_end"] = "true" + elif row["library_layout"] == "PAIRED": + sample["single_end"] = "false" + + sample.update(row) + if db_id not in runinfo: + runinfo[db_id] = [sample] + else: + if sample in runinfo[db_id]: + logger.error( + f"Input run info file contains duplicate rows!\n" f"{', '.join([row[col] for col in header])}" + ) + else: + runinfo[db_id].append(sample) + + return runinfo, header + extensions + + +def sra_runinfo_to_ftp(files_in, file_out): + samplesheet = {} + header = [] + for file_in in files_in: + runinfo, sample_header = parse_sra_runinfo(file_in) + header.append(sample_header) + for db_id, rows in runinfo.items(): + if db_id not in samplesheet: + samplesheet[db_id] = rows + else: + logger.warning(f"Duplicate sample identifier found!\nID: '{db_id}'") + + # Create a combined header from all input files. 
+ combined_header = header[0] + list(set().union(chain.from_iterable(header)).difference(header[0])) + combined_header.insert(0, "id") + + # Write samplesheet with paths to FastQ files and md5 sums. + if samplesheet: + with file_out.open("w", newline="") as fout: + writer = csv.DictWriter(fout, fieldnames=combined_header, delimiter="\t") + writer.writeheader() + for db_id in sorted(samplesheet): + for idx, row in enumerate(samplesheet[db_id], start=1): + row["id"] = f"{db_id}" + if "run_accession" in row: + row["id"] = f"{db_id}_{row['run_accession']}" + writer.writerow(row) + + +def main(args=None): + args = parse_args(args) + logging.basicConfig(level=args.log_level, format="[%(levelname)s] %(message)s") + files = [Path(x.strip()) for x in args.files_in.split(",")] + for path in files: + if not path.is_file(): + logger.critical(f"The given input file {path} was not found!") + sys.exit(1) + args.file_out.parent.mkdir(parents=True, exist_ok=True) + sra_runinfo_to_ftp(files, args.file_out) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/conf/base.config b/conf/base.config index 78c474dd..13b3c3e7 100644 --- a/conf/base.config +++ b/conf/base.config @@ -15,6 +15,12 @@ process { memory = { 6.GB * task.attempt } time = { 4.h * task.attempt } + publishDir = [ + path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } maxRetries = 1 maxErrors = '-1' @@ -24,7 +30,6 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. 
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { 1 } diff --git a/conf/igenomes.config b/conf/igenomes.config deleted file mode 100644 index 3f114377..00000000 --- a/conf/igenomes.config +++ /dev/null @@ -1,440 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for iGenomes paths -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines reference genomes using iGenome paths. - Can be used by any config that customises the base path using: - $params.igenomes_base / --igenomes_base ----------------------------------------------------------------------------------------- -*/ - -params { - // illumina iGenomes reference file paths - genomes { - 'GRCh37' { - fasta = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/Ensembl/GRCh37/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/GRCh37-blacklist.bed" - } - 'GRCh38' { - fasta = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/Bowtie2Index/" - star = 
"${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/NCBI/GRCh38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" - } - 'CHM13' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAIndex/" - bwamem2 = "${params.igenomes_base}/Homo_sapiens/UCSC/CHM13/Sequence/BWAmem2Index/" - gtf = "${params.igenomes_base}/Homo_sapiens/NCBI/CHM13/Annotation/Genes/genes.gtf" - gff = "ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/009/914/755/GCF_009914755.1_T2T-CHM13v2.0/GCF_009914755.1_T2T-CHM13v2.0_genomic.gff.gz" - mito_name = "chrM" - } - 'GRCm38' { - fasta = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/Ensembl/GRCm38/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/GRCm38-blacklist.bed" - } - 'TAIR10' { - fasta = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/WholeGenomeFasta/genome.fa" - bwa = 
"${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Arabidopsis_thaliana/Ensembl/TAIR10/Annotation/README.txt" - mito_name = "Mt" - } - 'EB2' { - fasta = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bacillus_subtilis_168/Ensembl/EB2/Annotation/README.txt" - } - 'UMD3.1' { - fasta = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Sequence/BismarkIndex/" - gtf = 
"${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Bos_taurus/Ensembl/UMD3.1/Annotation/README.txt" - mito_name = "MT" - } - 'WBcel235' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/Ensembl/WBcel235/Annotation/Genes/genes.bed" - mito_name = "MtDNA" - macs_gsize = "9e7" - } - 'CanFam3.1' { - fasta = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/Ensembl/CanFam3.1/Annotation/README.txt" - mito_name = "MT" - } - 'GRCz10' { - fasta = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/WholeGenomeFasta/genome.fa" - bwa = 
"${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/Ensembl/GRCz10/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'BDGP6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/Ensembl/BDGP6/Annotation/Genes/genes.bed" - mito_name = "M" - macs_gsize = "1.2e8" - } - 'EquCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/Genes/genes.bed" - readme = 
"${params.igenomes_base}/Equus_caballus/Ensembl/EquCab2/Annotation/README.txt" - mito_name = "MT" - } - 'EB1' { - fasta = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Escherichia_coli_K_12_DH10B/Ensembl/EB1/Annotation/README.txt" - } - 'Galgal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/Ensembl/Galgal4/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'Gm01' { - fasta = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/STARIndex/" - bismark = 
"${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Glycine_max/Ensembl/Gm01/Annotation/README.txt" - } - 'Mmul_1' { - fasta = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Macaca_mulatta/Ensembl/Mmul_1/Annotation/README.txt" - mito_name = "MT" - } - 'IRGSP-1.0' { - fasta = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Oryza_sativa_japonica/Ensembl/IRGSP-1.0/Annotation/Genes/genes.bed" - mito_name = "Mt" - } - 'CHIMP2.1.4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/WholeGenomeFasta/genome.fa" - bwa = 
"${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/Ensembl/CHIMP2.1.4/Annotation/README.txt" - mito_name = "MT" - } - 'Rnor_5.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_5.0/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'Rnor_6.0' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Sequence/BismarkIndex/" - gtf = 
"${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/Ensembl/Rnor_6.0/Annotation/Genes/genes.bed" - mito_name = "MT" - } - 'R64-1-1' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Saccharomyces_cerevisiae/Ensembl/R64-1-1/Annotation/Genes/genes.bed" - mito_name = "MT" - macs_gsize = "1.2e7" - } - 'EF2' { - fasta = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Schizosaccharomyces_pombe/Ensembl/EF2/Annotation/README.txt" - mito_name = "MT" - macs_gsize = "1.21e7" - } - 'Sbi1' { - fasta = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/WholeGenomeFasta/genome.fa" - bwa = 
"${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sorghum_bicolor/Ensembl/Sbi1/Annotation/README.txt" - } - 'Sscrofa10.2' { - fasta = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/Ensembl/Sscrofa10.2/Annotation/README.txt" - mito_name = "MT" - } - 'AGPv3' { - fasta = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.gtf" - bed12 = 
"${params.igenomes_base}/Zea_mays/Ensembl/AGPv3/Annotation/Genes/genes.bed" - mito_name = "Mt" - } - 'hg38' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg38/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg38-blacklist.bed" - } - 'hg19' { - fasta = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Homo_sapiens/UCSC/hg19/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "2.7e9" - blacklist = "${projectDir}/assets/blacklists/hg19-blacklist.bed" - } - 'mm10' { - fasta = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/STARIndex/" - bismark 
= "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Mus_musculus/UCSC/mm10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.87e9" - blacklist = "${projectDir}/assets/blacklists/mm10-blacklist.bed" - } - 'bosTau8' { - fasta = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Bos_taurus/UCSC/bosTau8/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'ce10' { - fasta = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Caenorhabditis_elegans/UCSC/ce10/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "9e7" - } - 'canFam3' { - fasta = 
"${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Canis_familiaris/UCSC/canFam3/Annotation/README.txt" - mito_name = "chrM" - } - 'danRer10' { - fasta = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Danio_rerio/UCSC/danRer10/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.37e9" - } - 'dm6' { - fasta = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Sequence/BismarkIndex/" - gtf = 
"${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Drosophila_melanogaster/UCSC/dm6/Annotation/Genes/genes.bed" - mito_name = "chrM" - macs_gsize = "1.2e8" - } - 'equCab2' { - fasta = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Equus_caballus/UCSC/equCab2/Annotation/README.txt" - mito_name = "chrM" - } - 'galGal4' { - fasta = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Gallus_gallus/UCSC/galGal4/Annotation/README.txt" - mito_name = "chrM" - } - 'panTro4' { - fasta = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BWAIndex/version0.6.0/" - bowtie2 = 
"${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Pan_troglodytes/UCSC/panTro4/Annotation/README.txt" - mito_name = "chrM" - } - 'rn6' { - fasta = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Rattus_norvegicus/UCSC/rn6/Annotation/Genes/genes.bed" - mito_name = "chrM" - } - 'sacCer3' { - fasta = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/WholeGenomeFasta/genome.fa" - bwa = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Sequence/BismarkIndex/" - readme = "${params.igenomes_base}/Saccharomyces_cerevisiae/UCSC/sacCer3/Annotation/README.txt" - mito_name = "chrM" - macs_gsize = "1.2e7" - } - 'susScr3' { - fasta = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/WholeGenomeFasta/genome.fa" - bwa = 
"${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BWAIndex/version0.6.0/" - bowtie2 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/Bowtie2Index/" - star = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/STARIndex/" - bismark = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Sequence/BismarkIndex/" - gtf = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.gtf" - bed12 = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/Genes/genes.bed" - readme = "${params.igenomes_base}/Sus_scrofa/UCSC/susScr3/Annotation/README.txt" - mito_name = "chrM" - } - } -} diff --git a/conf/igenomes_ignored.config b/conf/igenomes_ignored.config deleted file mode 100644 index b4034d82..00000000 --- a/conf/igenomes_ignored.config +++ /dev/null @@ -1,9 +0,0 @@ -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Nextflow config file for iGenomes paths -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Empty genomes dictionary to use when igenomes is ignored. 
----------------------------------------------------------------------------------------- -*/ - -params.genomes = [:] diff --git a/conf/test.config b/conf/test.config index c5e61d08..a4cd23aa 100644 --- a/conf/test.config +++ b/conf/test.config @@ -23,10 +23,5 @@ params { config_profile_description = 'Minimal test dataset to check pipeline function' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' - - // Genome references - genome = 'R64-1-1' + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/2732b911c57e607fa7aea5ba0c3d91b25bafb662/testdata/v1.12.0/sra_ids_test.csv' } diff --git a/conf/test_full.config b/conf/test_full.config index a59e88c1..84595dfa 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -14,11 +14,6 @@ params { config_profile_name = 'Full test profile' config_profile_description = 'Full test dataset to check pipeline function' - // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. 
SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = params.pipelines_testdata_base_path + 'viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' - - // Genome references - genome = 'R64-1-1' + // File containing SRA ids from nf-core/rnaseq -profile test_full for full-sized test + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/100736c99d87667fb7c247c267bc8acfac647bed/testdata/v1.12.0/sra_ids_rnaseq_test_full.csv' } diff --git a/docs/images/nf-core-fetchngs_metro_map_grey.png b/docs/images/nf-core-fetchngs_metro_map_grey.png new file mode 100644 index 00000000..eb4b49bc Binary files /dev/null and b/docs/images/nf-core-fetchngs_metro_map_grey.png differ diff --git a/docs/images/nf-core-fetchngs_metro_map_grey.svg b/docs/images/nf-core-fetchngs_metro_map_grey.svg new file mode 100644 index 00000000..9857bc12 --- /dev/null +++ b/docs/images/nf-core-fetchngs_metro_map_grey.svg @@ -0,0 +1,12733 @@ + + + +21License:3nf-core/fetchngsENASRADDBJGEODATABASE IDsSTAGE2. Download FASTQ files1. Download metadata3. Downstream pipelinesAsperaFTPsra-toolsFASTQCSVFetchmetadataGetdownloadlinksnf-core/rnaseqnf-core/atacseqnf-core/viralreconnf-core/taxprofilerCSVCSVCSVCSVCSV diff --git a/docs/output.md b/docs/output.md index cc7c7bf8..5a27bfca 100644 --- a/docs/output.md +++ b/docs/output.md @@ -2,48 +2,37 @@ ## Introduction -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. - -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - - +This document describes the output produced by the pipeline. The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
## Pipeline overview -The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: +The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data depending on the type of ids provided: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline +- Download FastQ files and create samplesheet from [SRA / ENA / DDBJ / GEO ids](#sra--ena--ddbj--geo-ids) - [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution -### FastQC - -
-Output files - -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. - -
+Please see the [usage documentation](https://nf-co.re/fetchngs/usage#introduction) for a list of supported public repository identifiers and how to provide them to the pipeline. -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). - -### MultiQC +### SRA / ENA / DDBJ / GEO ids
Output files -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. +- `fastq/` + - `*.fastq.gz`: Paired-end/single-end reads downloaded from the SRA / ENA / DDBJ / GEO. +- `fastq/md5/` + - `*.md5`: Files containing `md5` sum for FastQ files downloaded from the ENA. +- `samplesheet/` + - `samplesheet.csv`: Auto-created samplesheet with collated metadata and paths to downloaded FastQ files. + - `id_mappings.csv`: File with selected fields that can be used to rename samples to more informative names; see [`--sample_mapping_fields`](https://nf-co.re/fetchngs/parameters#sample_mapping_fields) parameter to customise this behaviour. + - `multiqc_config.yml`: [MultiQC](https://multiqc.info/docs/#bulk-sample-renaming) config file that can be passed to most nf-core pipelines via the `--multiqc_config` parameter for bulk renaming of sample names from database ids; see [`--sample_mapping_fields`](https://nf-co.re/fetchngs/parameters#sample_mapping_fields) parameter to customise this behaviour. +- `metadata/` + - `*.runinfo_ftp.tsv`: Re-formatted metadata file downloaded from the ENA. + - `*.runinfo.tsv`: Original metadata file downloaded from the ENA.
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory. - -Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see . +The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. ### Pipeline information @@ -53,7 +42,6 @@ Results generated by MultiQC collate pipeline QC from supported tools e.g. FastQ - `pipeline_info/` - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`. - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameter's are used when running the pipeline. - - Reformatted samplesheet files used as input to the pipeline: `samplesheet.valid.csv`. - Parameters used by the pipeline run: `params.json`. diff --git a/docs/usage.md b/docs/usage.md index 83f78529..b0e06bb9 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -6,58 +6,93 @@ ## Introduction - +The pipeline has been set-up to automatically download and process the raw FastQ files from public repositories. Identifiers can be provided in a file, one-per-line via the `--input` parameter. 
Currently, the following types of example identifiers are supported: -## Samplesheet input +| `SRA` | `ENA` | `DDBJ` | `GEO` | +| ------------ | ------------ | ------------ | ---------- | +| SRR11605097 | ERR4007730 | DRR171822 | GSM4432381 | +| SRX8171613 | ERX4009132 | DRX162434 | GSE147507 | +| SRS6531847 | ERS4399630 | DRS090921 | | +| SAMN14689442 | SAMEA6638373 | SAMD00114846 | | +| SRP256957 | ERP120836 | DRP004793 | | +| SRA1068758 | ERA2420837 | DRA008156 | | +| PRJNA625551 | PRJEB37513 | PRJDB4176 | | -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +### SRR / ERR / DRR ids + +If `SRR`/`ERR`/`DRR` run ids are provided then these will be resolved back to their appropriate `SRX`/`ERX`/`DRX` ids to be able to merge multiple runs from the same experiment. This is conceptually the same as merging multiple libraries sequenced from the same sample. + +The final sample information for all identifiers is obtained from the ENA which provides direct download links for FastQ files as well as their associated md5 sums. If download links exist, the files will be downloaded in parallel by FTP. Otherwise they are downloaded using sra-tools. + +All of the sample metadata obtained from the ENA will be appended as additional columns to help you manually curate the generated samplesheet before you run the pipeline. You can customise the metadata fields that are appended to the samplesheet via the `--ena_metadata_fields` parameter. The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. However, this pipeline requires a minimal set of fields to download FastQ files i.e. 
`'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. A comprehensive list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run). + +If you have a GEO accession (found in the data availability section of published papers) you can directly download a text file containing the appropriate SRA ids to pass to the pipeline: + +- Search for your GEO accession on [GEO](https://www.ncbi.nlm.nih.gov/geo) +- Click `SRA Run Selector` at the bottom of the GEO accession page +- Select the desired samples in the `SRA Run Selector` and then download the `Accession List` + +This downloads a text file called `SRR_Acc_List.txt` that can be directly provided to the pipeline once renamed with a .csv extension e.g. `--input SRR_Acc_List.csv`. + +### Samplesheet format + +As a bonus, the columns in the auto-created samplesheet can be tailored to be accepted out-of-the-box by selected nf-core pipelines, these currently include: + +- [nf-core/rnaseq](https://nf-co.re/rnaseq/usage#samplesheet-input) +- [nf-core/atacseq](https://nf-co.re/atacseq/usage#samplesheet-input) +- Illumina processing mode of [nf-core/viralrecon](https://nf-co.re/viralrecon/usage#illumina-samplesheet-format) +- [nf-core/taxprofiler](https://nf-co.re/nf-core/taxprofiler) + +You can use the `--nf_core_pipeline` parameter to customise this behaviour e.g. `--nf_core_pipeline rnaseq`. More pipelines will be supported in due course as we adopt and standardise samplesheet input across nf-core. It is highly recommended that you double-check that all of the identifiers required by the downstream nf-core pipeline are accurately represented in the samplesheet.
For example, the nf-core/atacseq pipeline requires a `replicate` column to be provided in its input samplesheet, however, public databases don't reliably hold information regarding replicates so you may need to amend these entries if your samplesheet was created by providing `--nf_core_pipeline atacseq`. + +From v1.9 of this pipeline the default `strandedness` in the output samplesheet will be set to `auto` when using `--nf_core_pipeline rnaseq`. This will only work with v3.10 onwards of nf-core/rnaseq which permits the auto-detection of strandedness during the pipeline execution. You can change this behaviour with the `--nf_core_rnaseq_strandedness` parameter which is set to `auto` by default. + +### Accessions with more than 2 FastQ files + +Using `SRR9320616` as an example, if we run the pipeline with default options to download via Aspera/FTP the ENA API indicates that this sample is associated with a single FastQ file: -```bash ---input '[path to samplesheet file]' +``` +run_accession experiment_accession sample_accession secondary_sample_accession study_accession secondary_study_accession submission_accession run_alias experiment_alias sample_alias study_alias library_layout library_selection library_source library_strategy library_name instrument_model instrument_platform base_count read_count tax_id scientific_name sample_title experiment_title study_title sample_description fastq_md5 fastq_bytes fastq_ftp fastq_galaxy fastq_aspera +SRR9320616 SRX6088086 SAMN12086751 SRS4989433 PRJNA549480 SRP201778 SRA900583 GSM3895942_r1 GSM3895942 GSM3895942 GSE132901 PAIRED cDNA TRANSCRIPTOMIC RNA-Seq Illumina HiSeq 2500 ILLUMINA 11857688850 120996825 10090 Mus musculus Old 3 Kidney Illumina HiSeq 2500 sequencing: GSM3895942: Old 3 Kidney Mus musculus RNA-Seq A murine aging cell atlas reveals cell identity and tissue-specific trajectories of aging Old 3 Kidney 98c939bbae1a1fcf9624905516485b67 7763114613
ftp.sra.ebi.ac.uk/vol1/fastq/SRR932/006/SRR9320616/SRR9320616.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/SRR932/006/SRR9320616/SRR9320616.fastq.gz fasp.sra.ebi.ac.uk:/vol1/fastq/SRR932/006/SRR9320616/SRR9320616.fastq.gz ``` -### Multiple runs of the same sample +However, this sample actually has 2 additional FastQ files that are flagged as technical and can only be obtained by running sra-tools. This is particularly important for certain preps like 10x and others using UMI barcodes. -The `sample` identifiers have to be the same when you have re-sequenced the same sample more than once e.g. to increase sequencing depth. The pipeline will concatenate the raw reads before performing any downstream analysis. Below is an example for the same sample sequenced across 3 lanes: +``` +$ fasterq-dump --threads 6 --split-files --include-technical SRR9320616 --outfile SRR9320616.fastq --progress -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L003_R1_001.fastq.gz,AEG588A1_S1_L003_R2_001.fastq.gz -CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz +SRR9320616_1.fastq +SRR9320616_2.fastq +SRR9320616_3.fastq ``` -### Full samplesheet +This highlights that there is a discrepancy between the read data hosted on the ENA API and what can actually be fetched from sra-tools, where the latter seems to be the source of truth. If you anticipate that you may have more than 2 FastQ files per sample, it is recommended to use this pipeline with the `--download_method sratools` parameter. -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +See [issue #260](https://github.com/nf-core/fetchngs/issues/260) for more details. 
-A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +### Primary options for downloading data -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, -``` +If the appropriate download links are available, the pipeline uses FTP by default to download FastQ files by setting the `--download_method ftp` parameter. If you are having issues and prefer to use sra-tools or Aspera instead, you can set the [`--download_method`](https://nf-co.re/fetchngs/parameters#download_method) parameter to `--download_method sratools` or `--download_method aspera`, respectively. + +### Downloading dbGAP data with JWT + +As of v1.10.0, the SRA Toolkit used in this pipeline can be configured to access protected data from dbGAP using a [JWT cart file](https://www.ncbi.nlm.nih.gov/sra/docs/sra-dbGAP-cloud-download/) on a supported cloud computing environment (Amazon Web Services or Google Cloud Platform). The JWT cart file can be specified with `--dbgap_key /path/to/cart.jwt`. + +Note that due to the way the pipeline resolves SRA IDs down to the experiment to be able to merge multiple runs, your JWT cart file must be generated for _all_ runs in an experiment. Otherwise, upon running `prefetch` and `fasterq-dump`, the pipeline will return a `403 Error` when trying to download data for other runs under an experiment that are not authenticated for with the provided JWT cart file. 
+ +Users can log into the [SRA Run Selector](https://www.ncbi.nlm.nih.gov/Traces/study/), search for the dbGAP study they have been granted access to using the phs identifier, and select all available runs to activate the `JWT Cart` button to download the file. -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +To test this functionality in your cloud computing environment, you can use the protected dbGAP cloud testing study with experiment accession `SRX512039`: -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. 
+- On the [SRA Run Selector page for `SRX512039`](https://www.ncbi.nlm.nih.gov/Traces/study/?acc=SRX512039&o=acc_s%3Aa), select the two available runs (`SRR1219865` and `SRR1219902`) and click on `JWT Cart` to download a key file called `cart.jwt` that can be directly provided to the pipeline with `--dbgap_key cart.jwt` +- Click on `Accession List` to download a text file called `SRR_Acc_List.txt` with the SRR IDs that can be directly provided to the pipeline with `--input SRR_Acc_List.txt` ## Running the pipeline The typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/fetchngs --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run nf-core/fetchngs --input ./ids.csv --outdir ./results -profile docker ``` This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. @@ -90,7 +125,6 @@ with: ```yaml title="params.yaml" input: './samplesheet.csv' outdir: './results/' -genome: 'GRCh37' <...> ``` @@ -181,7 +215,7 @@ To change the resource requests, please see the [max resources](https://nf-co.re ### Custom Containers -In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version maybe out of date. +In some cases you may wish to change which container or conda environment a step of the pipeline uses for a particular tool. By default nf-core pipelines use containers and software from the [biocontainers](https://biocontainers.pro/) or [bioconda](https://bioconda.github.io/) projects. However in some cases the pipeline specified version may be out of date. 
To use a different container from the default container or conda environment specified in a pipeline, please see the [updating tool versions](https://nf-co.re/docs/usage/configuration#updating-tool-versions) section of the nf-core website. diff --git a/main.nf b/main.nf index ac69a56e..6280a384 100644 --- a/main.nf +++ b/main.nf @@ -15,21 +15,9 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -include { FETCHNGS } from './workflows/fetchngs' +include { SRA } from './workflows/sra' include { PIPELINE_INITIALISATION } from './subworkflows/local/utils_nfcore_fetchngs_pipeline' include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_fetchngs_pipeline' -include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_fetchngs_pipeline' - -/* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - GENOME PARAMETER VALUES -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -*/ - -// TODO nf-core: Remove this line if you don't need a FASTA file -// This is an example of how to use getGenomeAttribute() to fetch parameters -// from igenomes.config using `--genome` -params.fasta = getGenomeAttribute('fasta') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -38,24 +26,22 @@ params.fasta = getGenomeAttribute('fasta') */ // -// WORKFLOW: Run main analysis pipeline depending on type of input +// WORKFLOW: Run main nf-core/fetchngs analysis pipeline depending on type of identifier provided // workflow NFCORE_FETCHNGS { take: - samplesheet // channel: samplesheet read in from --input + ids // channel: database ids read in from --input main: // - // WORKFLOW: Run pipeline + // WORKFLOW: Download FastQ files for SRA / ENA / GEO / DDBJ ids // - FETCHNGS ( - samplesheet - ) - emit: - multiqc_report = FETCHNGS.out.multiqc_report // channel: /path/to/multiqc_report.html + SRA ( ids ) + } + /* 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RUN MAIN WORKFLOW @@ -74,14 +60,15 @@ workflow { params.monochrome_logs, args, params.outdir, - params.input + params.input, + params.ena_metadata_fields ) // - // WORKFLOW: Run main workflow + // WORKFLOW: Run primary workflows for the pipeline // NFCORE_FETCHNGS ( - PIPELINE_INITIALISATION.out.samplesheet + PIPELINE_INITIALISATION.out.ids ) // // SUBWORKFLOW: Run completion tasks @@ -92,8 +79,7 @@ workflow { params.plaintext_email, params.outdir, params.monochrome_logs, - params.hook_url, - NFCORE_FETCHNGS.out.multiqc_report + params.hook_url ) } diff --git a/modules.json b/modules.json index c66da77f..d4c23bdf 100644 --- a/modules.json +++ b/modules.json @@ -5,20 +5,36 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { - "fastqc": { + "custom/sratoolsncbisettings": { "branch": "master", "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, - "multiqc": { + "sratools/fasterqdump": { "branch": "master", - "git_sha": "cf17ca47590cc578dfb47db1c2a44ef86f89976d", + "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", + "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"], + "patch": "modules/nf-core/sratools/fasterqdump/sratools-fasterqdump.diff" + }, + "sratools/prefetch": { + "branch": "master", + "git_sha": "1fc29f92e439d5631fdf34b8ac4687297d70f5ec", + "installed_by": ["fastq_download_prefetch_fasterqdump_sratools"] + }, + "untar": { + "branch": "master", + "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", "installed_by": ["modules"] } } }, "subworkflows": { "nf-core": { + "fastq_download_prefetch_fasterqdump_sratools": { + "branch": "master", + "git_sha": "1fc29f92e439d5631fdf34b8ac4687297d70f5ec", + "installed_by": ["subworkflows"] + }, "utils_nextflow_pipeline": { "branch": "master", "git_sha": "3aa0aec1d52d492fe241919f0c6100ebf0074082", diff --git a/modules/nf-core/fastqc/environment.yml 
b/modules/local/aspera_cli/environment.yml similarity index 62% rename from modules/nf-core/fastqc/environment.yml rename to modules/local/aspera_cli/environment.yml index 691d4c76..b47dd86a 100644 --- a/modules/nf-core/fastqc/environment.yml +++ b/modules/local/aspera_cli/environment.yml @@ -2,4 +2,4 @@ channels: - conda-forge - bioconda dependencies: - - bioconda::fastqc=0.12.1 + - bioconda::aspera-cli=4.14.0 diff --git a/modules/local/aspera_cli/main.nf b/modules/local/aspera_cli/main.nf new file mode 100644 index 00000000..b38d17c0 --- /dev/null +++ b/modules/local/aspera_cli/main.nf @@ -0,0 +1,68 @@ +process ASPERA_CLI { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/aspera-cli:4.14.0--hdfd78af_1' : + 'biocontainers/aspera-cli:4.14.0--hdfd78af_1' }" + + input: + tuple val(meta), val(fastq) + val user + + output: + tuple val(meta), path("*fastq.gz"), emit: fastq + tuple val(meta), path("*md5") , emit: md5 + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + def conda_prefix = ['singularity', 'apptainer'].contains(workflow.containerEngine) ? 
"export CONDA_PREFIX=/usr/local" : "" + if (meta.single_end) { + """ + $conda_prefix + + ascp \\ + $args \\ + -i \$CONDA_PREFIX/etc/aspera/aspera_bypass_dsa.pem \\ + ${user}@${fastq[0]} \\ + ${meta.id}.fastq.gz + + echo "${meta.md5_1} ${meta.id}.fastq.gz" > ${meta.id}.fastq.gz.md5 + md5sum -c ${meta.id}.fastq.gz.md5 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aspera_cli: \$(ascli --version) + END_VERSIONS + """ + } else { + """ + $conda_prefix + + ascp \\ + $args \\ + -i \$CONDA_PREFIX/etc/aspera/aspera_bypass_dsa.pem \\ + ${user}@${fastq[0]} \\ + ${meta.id}_1.fastq.gz + + echo "${meta.md5_1} ${meta.id}_1.fastq.gz" > ${meta.id}_1.fastq.gz.md5 + md5sum -c ${meta.id}_1.fastq.gz.md5 + + ascp \\ + $args \\ + -i \$CONDA_PREFIX/etc/aspera/aspera_bypass_dsa.pem \\ + ${user}@${fastq[1]} \\ + ${meta.id}_2.fastq.gz + + echo "${meta.md5_2} ${meta.id}_2.fastq.gz" > ${meta.id}_2.fastq.gz.md5 + md5sum -c ${meta.id}_2.fastq.gz.md5 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + aspera_cli: \$(ascli --version) + END_VERSIONS + """ + } +} diff --git a/modules/local/aspera_cli/nextflow.config b/modules/local/aspera_cli/nextflow.config new file mode 100644 index 00000000..fa2dbd90 --- /dev/null +++ b/modules/local/aspera_cli/nextflow.config @@ -0,0 +1,17 @@ +process { + withName: 'ASPERA_CLI' { + ext.args = '-QT -l 300m -P33001' + publishDir = [ + [ + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz" + ], + [ + path: { "${params.outdir}/fastq/md5" }, + mode: params.publish_dir_mode, + pattern: "*.md5" + ] + ] + } +} diff --git a/modules/local/aspera_cli/tests/main.nf.test b/modules/local/aspera_cli/tests/main.nf.test new file mode 100644 index 00000000..63347000 --- /dev/null +++ b/modules/local/aspera_cli/tests/main.nf.test @@ -0,0 +1,31 @@ +nextflow_process { + + name "Test process: ASPERA_CLI" + script "../main.nf" + process "ASPERA_CLI" + + test("Should run without failures") { + + when { + process { + """ 
+ input[0] = [ + [ id:'SRX9626017_SRR13191702', single_end:false, md5_1: '89c5be920021a035084d8aeb74f32df7', md5_2: '56271be38a80db78ef3bdfc5d9909b98' ], // meta map + [ + 'fasp.sra.ebi.ac.uk:/vol1/fastq/SRR131/002/SRR13191702/SRR13191702_1.fastq.gz', + 'fasp.sra.ebi.ac.uk:/vol1/fastq/SRR131/002/SRR13191702/SRR13191702_2.fastq.gz' + ] + ] + input[1] = 'era-fasp' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/local/aspera_cli/tests/main.nf.test.snap b/modules/local/aspera_cli/tests/main.nf.test.snap new file mode 100644 index 00000000..0ba6a643 --- /dev/null +++ b/modules/local/aspera_cli/tests/main.nf.test.snap @@ -0,0 +1,75 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": "89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + "SRX9626017_SRR13191702_1.fastq.gz:md5,baaaea61cba4294ec696fdfea1610848", + "SRX9626017_SRR13191702_2.fastq.gz:md5,8e43ad99049fabb6526a4b846da01c32" + ] + ] + ], + "1": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": "89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + "SRX9626017_SRR13191702_1.fastq.gz.md5:md5,055a6916ec9ee478e453d50651f87997", + "SRX9626017_SRR13191702_2.fastq.gz.md5:md5,c30ac785f8d80ec563fabf604d8bf945" + ] + ] + ], + "2": [ + "versions.yml:md5,a51a1dfc6308d71058ddc12c46101dd3" + ], + "fastq": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": "89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + "SRX9626017_SRR13191702_1.fastq.gz:md5,baaaea61cba4294ec696fdfea1610848", + "SRX9626017_SRR13191702_2.fastq.gz:md5,8e43ad99049fabb6526a4b846da01c32" + ] + ] + ], + "md5": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": 
"89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + "SRX9626017_SRR13191702_1.fastq.gz.md5:md5,055a6916ec9ee478e453d50651f87997", + "SRX9626017_SRR13191702_2.fastq.gz.md5:md5,c30ac785f8d80ec563fabf604d8bf945" + ] + ] + ], + "versions": [ + "versions.yml:md5,a51a1dfc6308d71058ddc12c46101dd3" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:52:00.601018" + } +} \ No newline at end of file diff --git a/modules/local/multiqc_mappings_config/main.nf b/modules/local/multiqc_mappings_config/main.nf new file mode 100644 index 00000000..8efe1caa --- /dev/null +++ b/modules/local/multiqc_mappings_config/main.nf @@ -0,0 +1,27 @@ + +process MULTIQC_MAPPINGS_CONFIG { + + conda "conda-forge::python=3.9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + path csv + + output: + path "*yml" , emit: yml + path "versions.yml", emit: versions + + script: + """ + multiqc_mappings_config.py \\ + $csv \\ + multiqc_config.yml + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/multiqc_mappings_config/nextflow.config b/modules/local/multiqc_mappings_config/nextflow.config new file mode 100644 index 00000000..11c58341 --- /dev/null +++ b/modules/local/multiqc_mappings_config/nextflow.config @@ -0,0 +1,9 @@ +process { + withName: 'MULTIQC_MAPPINGS_CONFIG' { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } +} diff --git a/modules/local/multiqc_mappings_config/tests/main.nf.test b/modules/local/multiqc_mappings_config/tests/main.nf.test new file mode 100644 index 00000000..dbb4d74f --- /dev/null +++ b/modules/local/multiqc_mappings_config/tests/main.nf.test @@ -0,0 +1,24 @@ +nextflow_process { + + name "Test process: MULTIQC_MAPPINGS_CONFIG" + script "../main.nf" + process "MULTIQC_MAPPINGS_CONFIG" + + test("Should run without failures") { + + when { + process { + """ + input[0] = file(params.pipelines_testdata_base_path + 'csv/SRX9626017_SRR13191702.mappings.csv', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/local/multiqc_mappings_config/tests/main.nf.test.snap b/modules/local/multiqc_mappings_config/tests/main.nf.test.snap new file mode 100644 index 00000000..43e46f61 --- /dev/null +++ b/modules/local/multiqc_mappings_config/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + "multiqc_config.yml:md5,7f3cb10fff83ba9eb3e8fa6862d1290a", + "versions.yml:md5,dd4c66f0551d15510b36bb2e2b2fdd73" + ] + ], + "1": [ + "versions.yml:md5,dd4c66f0551d15510b36bb2e2b2fdd73" + ], + "versions": [ + "versions.yml:md5,dd4c66f0551d15510b36bb2e2b2fdd73" + ], + "yml": [ + [ + "multiqc_config.yml:md5,7f3cb10fff83ba9eb3e8fa6862d1290a", + "versions.yml:md5,dd4c66f0551d15510b36bb2e2b2fdd73" + ] + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:52:12.65888" + } +} \ No newline at end of file diff --git a/modules/local/sra_fastq_ftp/main.nf b/modules/local/sra_fastq_ftp/main.nf new file mode 100644 index 00000000..017a08c9 --- /dev/null +++ b/modules/local/sra_fastq_ftp/main.nf @@ -0,0 +1,61 @@ + +process SRA_FASTQ_FTP { + tag "$meta.id" + label 'process_low' + label 'error_retry' + + conda "conda-forge::wget=1.21.4" + container "${ 
workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/wget:1.21.4' : + 'biocontainers/wget:1.21.4' }" + + input: + tuple val(meta), val(fastq) + + output: + tuple val(meta), path("*fastq.gz"), emit: fastq + tuple val(meta), path("*md5") , emit: md5 + path "versions.yml" , emit: versions + + script: + def args = task.ext.args ?: '' + if (meta.single_end) { + """ + wget \\ + $args \\ + -O ${meta.id}.fastq.gz \\ + ${fastq[0]} + + echo "${meta.md5_1} ${meta.id}.fastq.gz" > ${meta.id}.fastq.gz.md5 + md5sum -c ${meta.id}.fastq.gz.md5 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(echo \$(wget --version | head -n 1 | sed 's/^GNU Wget //; s/ .*\$//')) + END_VERSIONS + """ + } else { + """ + wget \\ + $args \\ + -O ${meta.id}_1.fastq.gz \\ + ${fastq[0]} + + echo "${meta.md5_1} ${meta.id}_1.fastq.gz" > ${meta.id}_1.fastq.gz.md5 + md5sum -c ${meta.id}_1.fastq.gz.md5 + + wget \\ + $args \\ + -O ${meta.id}_2.fastq.gz \\ + ${fastq[1]} + + echo "${meta.md5_2} ${meta.id}_2.fastq.gz" > ${meta.id}_2.fastq.gz.md5 + md5sum -c ${meta.id}_2.fastq.gz.md5 + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + wget: \$(echo \$(wget --version | head -n 1 | sed 's/^GNU Wget //; s/ .*\$//')) + END_VERSIONS + """ + } +} diff --git a/modules/local/sra_fastq_ftp/nextflow.config b/modules/local/sra_fastq_ftp/nextflow.config new file mode 100644 index 00000000..56e43959 --- /dev/null +++ b/modules/local/sra_fastq_ftp/nextflow.config @@ -0,0 +1,17 @@ +process { + withName: 'SRA_FASTQ_FTP' { + ext.args = '-t 5 -nv -c -T 60' + publishDir = [ + [ + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz" + ], + [ + path: { "${params.outdir}/fastq/md5" }, + mode: params.publish_dir_mode, + pattern: "*.md5" + ] + ] + } +} diff --git a/modules/local/sra_fastq_ftp/tests/main.nf.test b/modules/local/sra_fastq_ftp/tests/main.nf.test new file mode 100644 index 
00000000..bf005290 --- /dev/null +++ b/modules/local/sra_fastq_ftp/tests/main.nf.test @@ -0,0 +1,30 @@ +nextflow_process { + + name "Test process: SRA_FASTQ_FTP" + script "../main.nf" + process "SRA_FASTQ_FTP" + + test("Should run without failures") { + + when { + process { + """ + input[0] = [ + [ id:'SRX9626017_SRR13191702', single_end:false, md5_1: '89c5be920021a035084d8aeb74f32df7', md5_2: '56271be38a80db78ef3bdfc5d9909b98' ], // meta map + [ + 'ftp.sra.ebi.ac.uk/vol1/fastq/SRR131/002/SRR13191702/SRR13191702_1.fastq.gz', + 'ftp.sra.ebi.ac.uk/vol1/fastq/SRR131/002/SRR13191702/SRR13191702_2.fastq.gz' + ] + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/local/sra_fastq_ftp/tests/main.nf.test.snap b/modules/local/sra_fastq_ftp/tests/main.nf.test.snap new file mode 100644 index 00000000..229ad4fe --- /dev/null +++ b/modules/local/sra_fastq_ftp/tests/main.nf.test.snap @@ -0,0 +1,75 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": "89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + "SRX9626017_SRR13191702_1.fastq.gz:md5,baaaea61cba4294ec696fdfea1610848", + "SRX9626017_SRR13191702_2.fastq.gz:md5,8e43ad99049fabb6526a4b846da01c32" + ] + ] + ], + "1": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": "89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + "SRX9626017_SRR13191702_1.fastq.gz.md5:md5,055a6916ec9ee478e453d50651f87997", + "SRX9626017_SRR13191702_2.fastq.gz.md5:md5,c30ac785f8d80ec563fabf604d8bf945" + ] + ] + ], + "2": [ + "versions.yml:md5,7bfc86ca1f3e3236dbb91eb85c1d7af0" + ], + "fastq": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": "89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + 
"SRX9626017_SRR13191702_1.fastq.gz:md5,baaaea61cba4294ec696fdfea1610848", + "SRX9626017_SRR13191702_2.fastq.gz:md5,8e43ad99049fabb6526a4b846da01c32" + ] + ] + ], + "md5": [ + [ + { + "id": "SRX9626017_SRR13191702", + "single_end": false, + "md5_1": "89c5be920021a035084d8aeb74f32df7", + "md5_2": "56271be38a80db78ef3bdfc5d9909b98" + }, + [ + "SRX9626017_SRR13191702_1.fastq.gz.md5:md5,055a6916ec9ee478e453d50651f87997", + "SRX9626017_SRR13191702_2.fastq.gz.md5:md5,c30ac785f8d80ec563fabf604d8bf945" + ] + ] + ], + "versions": [ + "versions.yml:md5,7bfc86ca1f3e3236dbb91eb85c1d7af0" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:51:51.301654" + } +} diff --git a/modules/local/sra_ids_to_runinfo/main.nf b/modules/local/sra_ids_to_runinfo/main.nf new file mode 100644 index 00000000..7d47f5e3 --- /dev/null +++ b/modules/local/sra_ids_to_runinfo/main.nf @@ -0,0 +1,33 @@ + +process SRA_IDS_TO_RUNINFO { + tag "$id" + label 'error_retry' + + conda "conda-forge::python=3.9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + val id + val fields + + output: + path "*.tsv" , emit: tsv + path "versions.yml", emit: versions + + script: + def metadata_fields = fields ? 
"--ena_metadata_fields ${fields}" : '' + """ + echo $id > id.txt + sra_ids_to_runinfo.py \\ + id.txt \\ + ${id}.runinfo.tsv \\ + $metadata_fields + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/sra_ids_to_runinfo/nextflow.config b/modules/local/sra_ids_to_runinfo/nextflow.config new file mode 100644 index 00000000..9b9d0b16 --- /dev/null +++ b/modules/local/sra_ids_to_runinfo/nextflow.config @@ -0,0 +1,8 @@ +process { + withName: 'SRA_IDS_TO_RUNINFO' { + publishDir = [ + path: { "${params.outdir}/metadata" }, + enabled: false + ] + } +} diff --git a/modules/local/sra_ids_to_runinfo/tests/main.nf.test b/modules/local/sra_ids_to_runinfo/tests/main.nf.test new file mode 100644 index 00000000..48797a1a --- /dev/null +++ b/modules/local/sra_ids_to_runinfo/tests/main.nf.test @@ -0,0 +1,25 @@ +nextflow_process { + + name "Test process: SRA_IDS_TO_RUNINFO" + script "../main.nf" + process "SRA_IDS_TO_RUNINFO" + + test("Should run without failures") { + + when { + process { + """ + input[0] = 'SRR13191702' + input[1] = '' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/local/sra_ids_to_runinfo/tests/main.nf.test.snap b/modules/local/sra_ids_to_runinfo/tests/main.nf.test.snap new file mode 100644 index 00000000..f7b6cee5 --- /dev/null +++ b/modules/local/sra_ids_to_runinfo/tests/main.nf.test.snap @@ -0,0 +1,25 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + "SRR13191702.runinfo.tsv:md5,3a1be35781ca6e8a28d8fd4d2f3bbe85" + ], + "1": [ + "versions.yml:md5,1c14442e9b494b586eafe41e77300fae" + ], + "tsv": [ + "SRR13191702.runinfo.tsv:md5,3a1be35781ca6e8a28d8fd4d2f3bbe85" + ], + "versions": [ + "versions.yml:md5,1c14442e9b494b586eafe41e77300fae" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": 
"2024-02-28T11:52:05.345153" + } +} \ No newline at end of file diff --git a/modules/local/sra_runinfo_to_ftp/main.nf b/modules/local/sra_runinfo_to_ftp/main.nf new file mode 100644 index 00000000..9c83cf53 --- /dev/null +++ b/modules/local/sra_runinfo_to_ftp/main.nf @@ -0,0 +1,27 @@ + +process SRA_RUNINFO_TO_FTP { + + conda "conda-forge::python=3.9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + path runinfo + + output: + path "*.tsv" , emit: tsv + path "versions.yml", emit: versions + + script: + """ + sra_runinfo_to_ftp.py \\ + ${runinfo.join(',')} \\ + ${runinfo.toString().tokenize(".")[0]}.runinfo_ftp.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/sra_runinfo_to_ftp/nextflow.config b/modules/local/sra_runinfo_to_ftp/nextflow.config new file mode 100644 index 00000000..43263648 --- /dev/null +++ b/modules/local/sra_runinfo_to_ftp/nextflow.config @@ -0,0 +1,9 @@ +process { + withName: 'SRA_RUNINFO_TO_FTP' { + publishDir = [ + path: { "${params.outdir}/metadata" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? 
null : filename } + ] + } +} diff --git a/modules/local/sra_runinfo_to_ftp/tests/main.nf.test b/modules/local/sra_runinfo_to_ftp/tests/main.nf.test new file mode 100644 index 00000000..39db814a --- /dev/null +++ b/modules/local/sra_runinfo_to_ftp/tests/main.nf.test @@ -0,0 +1,24 @@ +nextflow_process { + + name "Test process: SRA_RUNINFO_TO_FTP" + script "../main.nf" + process "SRA_RUNINFO_TO_FTP" + + test("Should run without failures") { + + when { + process { + """ + input[0] = file(params.pipelines_testdata_base_path + 'tsv/SRR13191702.runinfo.tsv', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/local/sra_runinfo_to_ftp/tests/main.nf.test.snap b/modules/local/sra_runinfo_to_ftp/tests/main.nf.test.snap new file mode 100644 index 00000000..be190f5a --- /dev/null +++ b/modules/local/sra_runinfo_to_ftp/tests/main.nf.test.snap @@ -0,0 +1,25 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + "SRR13191702.runinfo_ftp.tsv:md5,94378c448c044b3e20e5c54e442ab62e" + ], + "1": [ + "versions.yml:md5,e95f8185f665127a73622a19d321bcca" + ], + "tsv": [ + "SRR13191702.runinfo_ftp.tsv:md5,94378c448c044b3e20e5c54e442ab62e" + ], + "versions": [ + "versions.yml:md5,e95f8185f665127a73622a19d321bcca" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:51:45.748227" + } +} \ No newline at end of file diff --git a/modules/local/sra_to_samplesheet/main.nf b/modules/local/sra_to_samplesheet/main.nf new file mode 100644 index 00000000..92edf5df --- /dev/null +++ b/modules/local/sra_to_samplesheet/main.nf @@ -0,0 +1,75 @@ + +process SRA_TO_SAMPLESHEET { + tag "$meta.id" + + executor 'local' + memory 100.MB + + input: + val meta + val pipeline + val strandedness + val mapping_fields + + output: + tuple val(meta), path("*samplesheet.csv"), emit: samplesheet + tuple val(meta), path("*mappings.csv") , emit: 
mappings + + exec: + // + // Create samplesheet containing metadata + // + + // Remove custom keys needed to download the data + def meta_clone = meta.clone() + meta_clone.remove("id") + meta_clone.remove("fastq_1") + meta_clone.remove("fastq_2") + meta_clone.remove("md5_1") + meta_clone.remove("md5_2") + meta_clone.remove("single_end") + + // Add relevant fields to the beginning of the map + pipeline_map = [ + sample : "${meta.id.split('_')[0..-2].join('_')}", + fastq_1 : meta.fastq_1, + fastq_2 : meta.fastq_2 + ] + + // Add nf-core pipeline specific entries + if (pipeline) { + if (pipeline == 'rnaseq') { + pipeline_map << [ strandedness: strandedness ] + } else if (pipeline == 'atacseq') { + pipeline_map << [ replicate: 1 ] + } else if (pipeline == 'taxprofiler') { + pipeline_map << [ fasta: '' ] + } + } + pipeline_map << meta_clone + + // Create a samplesheet + samplesheet = pipeline_map.keySet().collect{ '"' + it + '"'}.join(",") + '\n' + samplesheet += pipeline_map.values().collect{ '"' + it + '"'}.join(",") + + // Write samplesheet to file + def samplesheet_file = task.workDir.resolve("${meta.id}.samplesheet.csv") + samplesheet_file.text = samplesheet + + // + // Create sample id mappings file + // + mappings_map = pipeline_map.clone() + def fields = mapping_fields ? 
['sample'] + mapping_fields.split(',').collect{ it.trim().toLowerCase() } : [] + if ((mappings_map.keySet() + fields).unique().size() != mappings_map.keySet().size()) { + error("Invalid option for '--sample_mapping_fields': ${mapping_fields}.\nValid options: ${mappings_map.keySet().join(', ')}") + } + + // Create mappings + mappings = fields.collect{ '"' + it + '"'}.join(",") + '\n' + mappings += mappings_map.subMap(fields).values().collect{ '"' + it + '"'}.join(",") + + // Write mappings to file + def mappings_file = task.workDir.resolve("${meta.id}.mappings.csv") + mappings_file.text = mappings +} diff --git a/modules/local/sra_to_samplesheet/nextflow.config b/modules/local/sra_to_samplesheet/nextflow.config new file mode 100644 index 00000000..da241c1a --- /dev/null +++ b/modules/local/sra_to_samplesheet/nextflow.config @@ -0,0 +1,8 @@ +process { + withName: SRA_TO_SAMPLESHEET { + publishDir = [ + path: { "${params.outdir}/samplesheet" }, + enabled: false + ] + } +} diff --git a/modules/local/sra_to_samplesheet/tests/main.nf.test b/modules/local/sra_to_samplesheet/tests/main.nf.test new file mode 100644 index 00000000..ed765158 --- /dev/null +++ b/modules/local/sra_to_samplesheet/tests/main.nf.test @@ -0,0 +1,27 @@ +nextflow_process { + + name "Test process: SRA_TO_SAMPLESHEET" + script "../main.nf" + process "SRA_TO_SAMPLESHEET" + + test("Should run without failures") { + + when { + process { + """ + input[0] = [id:'ERX1188904_ERR1109373', run_accession:'ERR1109373', experiment_accession:'ERX1188904', sample_accession:'SAMEA3643867', experiment_alias:'ena-EXPERIMENT-CAM-03-11-2015-17:01:52:847-7', run_alias:'ena-RUN-CAM-03-11-2015-17:01:52:847-7', sample_alias:'sample_56', study_alias:'ena-STUDY-CAM-02-11-2015-17:42:24:189-13', library_layout:'PAIRED', experiment_title:'Illumina HiSeq 2500 paired end sequencing', sample_title:'RNA-Seq reads mapped onto L. Boulardi Toti-like virus genome', sample_description:'RNA-Seq reads mapped onto L. 
Boulardi Toti-like virus genome', fastq_md5:'8d7d7b854d0207d1226477a30103fade;9fd57225d6c07a31843276d6df9b15c0;5a62e8f785687dce890cfb4fe3e607f9', fastq_ftp:'ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373_2.fastq.gz', fastq_1:'./results/fastq/ERX1188904_ERR1109373_1.fastq.gz', fastq_2:'./results/fastq/ERX1188904_ERR1109373_2.fastq.gz', md5_1:'9fd57225d6c07a31843276d6df9b15c0', md5_2:'5a62e8f785687dce890cfb4fe3e607f9', single_end:false] + input[1] = 'rnaseq' + input[2] = 'auto' + input[3] = 'experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description' + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/local/sra_to_samplesheet/tests/main.nf.test.snap b/modules/local/sra_to_samplesheet/tests/main.nf.test.snap new file mode 100644 index 00000000..568f3ea7 --- /dev/null +++ b/modules/local/sra_to_samplesheet/tests/main.nf.test.snap @@ -0,0 +1,117 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + { + "id": "ERX1188904_ERR1109373", + "run_accession": "ERR1109373", + "experiment_accession": "ERX1188904", + "sample_accession": "SAMEA3643867", + "experiment_alias": "ena-EXPERIMENT-CAM-03-11-2015-17:01:52:847-7", + "run_alias": "ena-RUN-CAM-03-11-2015-17:01:52:847-7", + "sample_alias": "sample_56", + "study_alias": "ena-STUDY-CAM-02-11-2015-17:42:24:189-13", + "library_layout": "PAIRED", + "experiment_title": "Illumina HiSeq 2500 paired end sequencing", + "sample_title": "RNA-Seq reads mapped onto L. Boulardi Toti-like virus genome", + "sample_description": "RNA-Seq reads mapped onto L. 
Boulardi Toti-like virus genome", + "fastq_md5": "8d7d7b854d0207d1226477a30103fade;9fd57225d6c07a31843276d6df9b15c0;5a62e8f785687dce890cfb4fe3e607f9", + "fastq_ftp": "ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373_2.fastq.gz", + "fastq_1": "./results/fastq/ERX1188904_ERR1109373_1.fastq.gz", + "fastq_2": "./results/fastq/ERX1188904_ERR1109373_2.fastq.gz", + "md5_1": "9fd57225d6c07a31843276d6df9b15c0", + "md5_2": "5a62e8f785687dce890cfb4fe3e607f9", + "single_end": false + }, + "ERX1188904_ERR1109373.samplesheet.csv:md5,e7898191d57258e049ee7129d36f5c08" + ] + ], + "1": [ + [ + { + "id": "ERX1188904_ERR1109373", + "run_accession": "ERR1109373", + "experiment_accession": "ERX1188904", + "sample_accession": "SAMEA3643867", + "experiment_alias": "ena-EXPERIMENT-CAM-03-11-2015-17:01:52:847-7", + "run_alias": "ena-RUN-CAM-03-11-2015-17:01:52:847-7", + "sample_alias": "sample_56", + "study_alias": "ena-STUDY-CAM-02-11-2015-17:42:24:189-13", + "library_layout": "PAIRED", + "experiment_title": "Illumina HiSeq 2500 paired end sequencing", + "sample_title": "RNA-Seq reads mapped onto L. Boulardi Toti-like virus genome", + "sample_description": "RNA-Seq reads mapped onto L. 
Boulardi Toti-like virus genome", + "fastq_md5": "8d7d7b854d0207d1226477a30103fade;9fd57225d6c07a31843276d6df9b15c0;5a62e8f785687dce890cfb4fe3e607f9", + "fastq_ftp": "ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373_2.fastq.gz", + "fastq_1": "./results/fastq/ERX1188904_ERR1109373_1.fastq.gz", + "fastq_2": "./results/fastq/ERX1188904_ERR1109373_2.fastq.gz", + "md5_1": "9fd57225d6c07a31843276d6df9b15c0", + "md5_2": "5a62e8f785687dce890cfb4fe3e607f9", + "single_end": false + }, + "ERX1188904_ERR1109373.mappings.csv:md5,d09ddb4f0709675e5dfe1eadf12c608f" + ] + ], + "mappings": [ + [ + { + "id": "ERX1188904_ERR1109373", + "run_accession": "ERR1109373", + "experiment_accession": "ERX1188904", + "sample_accession": "SAMEA3643867", + "experiment_alias": "ena-EXPERIMENT-CAM-03-11-2015-17:01:52:847-7", + "run_alias": "ena-RUN-CAM-03-11-2015-17:01:52:847-7", + "sample_alias": "sample_56", + "study_alias": "ena-STUDY-CAM-02-11-2015-17:42:24:189-13", + "library_layout": "PAIRED", + "experiment_title": "Illumina HiSeq 2500 paired end sequencing", + "sample_title": "RNA-Seq reads mapped onto L. Boulardi Toti-like virus genome", + "sample_description": "RNA-Seq reads mapped onto L. 
Boulardi Toti-like virus genome", + "fastq_md5": "8d7d7b854d0207d1226477a30103fade;9fd57225d6c07a31843276d6df9b15c0;5a62e8f785687dce890cfb4fe3e607f9", + "fastq_ftp": "ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373_2.fastq.gz", + "fastq_1": "./results/fastq/ERX1188904_ERR1109373_1.fastq.gz", + "fastq_2": "./results/fastq/ERX1188904_ERR1109373_2.fastq.gz", + "md5_1": "9fd57225d6c07a31843276d6df9b15c0", + "md5_2": "5a62e8f785687dce890cfb4fe3e607f9", + "single_end": false + }, + "ERX1188904_ERR1109373.mappings.csv:md5,d09ddb4f0709675e5dfe1eadf12c608f" + ] + ], + "samplesheet": [ + [ + { + "id": "ERX1188904_ERR1109373", + "run_accession": "ERR1109373", + "experiment_accession": "ERX1188904", + "sample_accession": "SAMEA3643867", + "experiment_alias": "ena-EXPERIMENT-CAM-03-11-2015-17:01:52:847-7", + "run_alias": "ena-RUN-CAM-03-11-2015-17:01:52:847-7", + "sample_alias": "sample_56", + "study_alias": "ena-STUDY-CAM-02-11-2015-17:42:24:189-13", + "library_layout": "PAIRED", + "experiment_title": "Illumina HiSeq 2500 paired end sequencing", + "sample_title": "RNA-Seq reads mapped onto L. Boulardi Toti-like virus genome", + "sample_description": "RNA-Seq reads mapped onto L. 
Boulardi Toti-like virus genome", + "fastq_md5": "8d7d7b854d0207d1226477a30103fade;9fd57225d6c07a31843276d6df9b15c0;5a62e8f785687dce890cfb4fe3e607f9", + "fastq_ftp": "ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR110/003/ERR1109373/ERR1109373_2.fastq.gz", + "fastq_1": "./results/fastq/ERX1188904_ERR1109373_1.fastq.gz", + "fastq_2": "./results/fastq/ERX1188904_ERR1109373_2.fastq.gz", + "md5_1": "9fd57225d6c07a31843276d6df9b15c0", + "md5_2": "5a62e8f785687dce890cfb4fe3e607f9", + "single_end": false + }, + "ERX1188904_ERR1109373.samplesheet.csv:md5,e7898191d57258e049ee7129d36f5c08" + ] + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:51:38.244046" + } +} \ No newline at end of file diff --git a/modules/nf-core/multiqc/environment.yml b/modules/nf-core/custom/sratoolsncbisettings/environment.yml similarity index 100% rename from modules/nf-core/multiqc/environment.yml rename to modules/nf-core/custom/sratoolsncbisettings/environment.yml diff --git a/modules/nf-core/custom/sratoolsncbisettings/main.nf b/modules/nf-core/custom/sratoolsncbisettings/main.nf new file mode 100644 index 00000000..577117ed --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/main.nf @@ -0,0 +1,23 @@ +process CUSTOM_SRATOOLSNCBISETTINGS { + tag 'ncbi-settings' + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/sra-tools:3.0.8--h9f5acd7_0' : + 'biocontainers/sra-tools:3.0.8--h9f5acd7_0' }" + + input: + val ids + + output: + path('*.mkfg') , emit: ncbi_settings + path 'versions.yml', emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + config = "/LIBS/GUID = \"${UUID.randomUUID().toString()}\"\\n/libs/cloud/report_instance_identity = \"true\"\\n" + template 'detect_ncbi_settings.sh' +} diff --git a/modules/nf-core/custom/sratoolsncbisettings/meta.yml b/modules/nf-core/custom/sratoolsncbisettings/meta.yml new file mode 100644 index 00000000..46a6cd32 --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/meta.yml @@ -0,0 +1,28 @@ +name: "custom_sratoolsncbisettings" +description: Test for the presence of suitable NCBI settings or create them on the fly. +keywords: + - NCBI + - settings + - sra-tools + - prefetch + - fasterq-dump +tools: + - "sratools": + description: "SRA Toolkit and SDK from NCBI" + homepage: https://github.com/ncbi/sra-tools + documentation: https://github.com/ncbi/sra-tools/wiki + tool_dev_url: https://github.com/ncbi/sra-tools + licence: ["Public Domain"] +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - ncbi_settings: + type: file + description: An NCBI user settings file. + pattern: "*.mkfg" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/modules/nf-core/custom/sratoolsncbisettings/templates/detect_ncbi_settings.sh b/modules/nf-core/custom/sratoolsncbisettings/templates/detect_ncbi_settings.sh new file mode 100644 index 00000000..cfe3a324 --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/templates/detect_ncbi_settings.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +set -u + + +# Get the expected NCBI settings path and define the environment variable +# `NCBI_SETTINGS`. 
+eval "$(vdb-config -o n NCBI_SETTINGS | sed 's/[" ]//g')" + +# If the user settings do not exist yet, create a file suitable for `prefetch` +# and `fasterq-dump`. If an existing settings file does not contain the required +# values, error out with a helpful message. +if [[ ! -f "${NCBI_SETTINGS}" ]]; then + printf '!{config}' > 'user-settings.mkfg' +else + prefetch --help &> /dev/null + if [[ $? = 78 ]]; then + echo "You have an existing vdb-config at '${NCBI_SETTINGS}' but it is"\ + "missing the required entries for /LIBS/GUID and"\ + "/libs/cloud/report_instance_identity."\ + "Feel free to add the following to your settings file:" >&2 + echo "$(printf '!{config}')" >&2 + exit 1 + fi + fasterq-dump --help &> /dev/null + if [[ $? = 78 ]]; then + echo "You have an existing vdb-config at '${NCBI_SETTINGS}' but it is"\ + "missing the required entries for /LIBS/GUID and"\ + "/libs/cloud/report_instance_identity."\ + "Feel free to add the following to your settings file:" >&2 + echo "$(printf '!{config}')" >&2 + exit 1 + fi + if [[ "${NCBI_SETTINGS}" != *.mkfg ]]; then + echo "The detected settings '${NCBI_SETTINGS}' do not have the required"\ + "file extension '.mkfg'." 
>&2 + exit 1 + fi + cp "${NCBI_SETTINGS}" ./ +fi + +cat <<-END_VERSIONS > versions.yml +"!{task.process}": + sratools: $(vdb-config --version 2>&1 | grep -Eo '[0-9.]+') +END_VERSIONS diff --git a/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test b/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test new file mode 100644 index 00000000..e9ea68dc --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test @@ -0,0 +1,38 @@ +nextflow_process { + + name "Test Process CUSTOM_SRATOOLSNCBISETTINGS" + script "../main.nf" + process "CUSTOM_SRATOOLSNCBISETTINGS" + config "modules/nf-core/custom/sratoolsncbisettings/tests/nextflow.config" + + test("Should run without failures") { + + when { + params { + settings_path = '/tmp/.ncbi' + settings_file = "${params.settings_path}/user-settings.mkfg" + } + + process { + """ + input[0] = ["SRX6725035"] + file(params.settings_path).mkdirs() + def settings = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) + settings.copyTo(params.settings_file) + """ + } + } + + then { + assert process.success + assert snapshot( + process.out.versions + ).match() + + with(process.out.ncbi_settings) { + assert path(get(0)).readLines().any { it.contains('/LIBS/GUID') } + assert path(get(0)).readLines().any { it.contains('/libs/cloud/report_instance_identity') } + } + } + } +} diff --git a/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test.snap b/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test.snap new file mode 100644 index 00000000..5e314f0b --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/tests/main.nf.test.snap @@ -0,0 +1,14 @@ +{ + "Should run without failures": { + "content": [ + [ + "versions.yml:md5,3d6ee88cce1ee517e198633f062589a8" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:47:15.824443" + } +} \ No newline at end of file diff --git 
a/modules/nf-core/custom/sratoolsncbisettings/tests/nextflow.config b/modules/nf-core/custom/sratoolsncbisettings/tests/nextflow.config new file mode 100644 index 00000000..df5def04 --- /dev/null +++ b/modules/nf-core/custom/sratoolsncbisettings/tests/nextflow.config @@ -0,0 +1,14 @@ +params.settings_path = '/tmp/.ncbi' +params.settings_file = "${params.settings_path}/user-settings.mkfg" + +env.NCBI_SETTINGS = params.settings_file + +process { + withName: CUSTOM_SRATOOLSNCBISETTINGS { + containerOptions = { + (workflow.containerEngine == 'singularity') ? + "-B ${params.settings_path}:${params.settings_path}" : + "-v ${params.settings_path}:${params.settings_path}" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/fastqc/main.nf b/modules/nf-core/fastqc/main.nf deleted file mode 100644 index d8989f48..00000000 --- a/modules/nf-core/fastqc/main.nf +++ /dev/null @@ -1,64 +0,0 @@ -process FASTQC { - tag "$meta.id" - label 'process_medium' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/fastqc:0.12.1--hdfd78af_0' : - 'biocontainers/fastqc:0.12.1--hdfd78af_0' }" - - input: - tuple val(meta), path(reads) - - output: - tuple val(meta), path("*.html"), emit: html - tuple val(meta), path("*.zip") , emit: zip - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - // Make list of old name and new name pairs to use for renaming in the bash while loop - def old_new_pairs = reads instanceof Path || reads.size() == 1 ? 
[[ reads, "${prefix}.${reads.extension}" ]] : reads.withIndex().collect { entry, index -> [ entry, "${prefix}_${index + 1}.${entry.extension}" ] } - def rename_to = old_new_pairs*.join(' ').join(' ') - def renamed_files = old_new_pairs.collect{ old_name, new_name -> new_name }.join(' ') - - // The total amount of allocated RAM by FastQC is equal to the number of threads defined (--threads) time the amount of RAM defined (--memory) - // https://github.com/s-andrews/FastQC/blob/1faeea0412093224d7f6a07f777fad60a5650795/fastqc#L211-L222 - // Dividing the task.memory by task.cpu allows to stick to requested amount of RAM in the label - def memory_in_mb = MemoryUnit.of("${task.memory}").toUnit('MB') / task.cpus - // FastQC memory value allowed range (100 - 10000) - def fastqc_memory = memory_in_mb > 10000 ? 10000 : (memory_in_mb < 100 ? 100 : memory_in_mb) - - """ - printf "%s %s\\n" $rename_to | while read old_name new_name; do - [ -f "\${new_name}" ] || ln -s \$old_name \$new_name - done - - fastqc \\ - $args \\ - --threads $task.cpus \\ - --memory $fastqc_memory \\ - $renamed_files - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ - - stub: - def prefix = task.ext.prefix ?: "${meta.id}" - """ - touch ${prefix}.html - touch ${prefix}.zip - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - fastqc: \$( fastqc --version | sed '/FastQC v/!d; s/.*v//' ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/fastqc/meta.yml b/modules/nf-core/fastqc/meta.yml deleted file mode 100644 index 4827da7a..00000000 --- a/modules/nf-core/fastqc/meta.yml +++ /dev/null @@ -1,66 +0,0 @@ -name: fastqc -description: Run FastQC on sequenced reads -keywords: - - quality control - - qc - - adapters - - fastq -tools: - - fastqc: - description: | - FastQC gives general quality metrics about your reads. 
- It provides information about the quality score distribution - across your reads, the per base sequence content (%A/C/G/T). - You get information about adapter contamination and other - overrepresented sequences. - homepage: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/ - documentation: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/ - licence: ["GPL-2.0-only"] - identifier: biotools:fastqc -input: - - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - reads: - type: file - description: | - List of input FastQ files of size 1 and 2 for single-end and paired-end data, - respectively. -output: - - html: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. [ id:'test', single_end:false ] - - "*.html": - type: file - description: FastQC report - pattern: "*_{fastqc.html}" - - zip: - - meta: - type: map - description: | - Groovy Map containing sample information - e.g. 
[ id:'test', single_end:false ] - - "*.zip": - type: file - description: FastQC report archive - pattern: "*_{fastqc.zip}" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" -maintainers: - - "@drpatelh" - - "@grst" - - "@ewels" - - "@FelixKrueger" diff --git a/modules/nf-core/fastqc/tests/main.nf.test b/modules/nf-core/fastqc/tests/main.nf.test deleted file mode 100644 index e9d79a07..00000000 --- a/modules/nf-core/fastqc/tests/main.nf.test +++ /dev/null @@ -1,309 +0,0 @@ -nextflow_process { - - name "Test Process FASTQC" - script "../main.nf" - process "FASTQC" - - tag "modules" - tag "modules_nfcore" - tag "fastqc" - - test("sarscov2 single-end [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [ id: 'test', single_end:true ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - // NOTE The report contains the date inside it, which means that the md5sum is stable per day, but not longer than that. So you can't md5sum it. - // looks like this:
Mon 2 Oct 2023
test.gz
- // https://github.com/nf-core/modules/pull/3903#issuecomment-1743620039 - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 paired-end [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, - { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, - { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, - { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, - { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 interleaved [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 paired-end [bam]") { 
- - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert process.out.html[0][1] ==~ ".*/test_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/test_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 multiple [fastq]") { - - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert process.out.html[0][1][0] ==~ ".*/test_1_fastqc.html" }, - { assert process.out.html[0][1][1] ==~ ".*/test_2_fastqc.html" }, - { assert process.out.html[0][1][2] ==~ ".*/test_3_fastqc.html" }, - { assert process.out.html[0][1][3] ==~ ".*/test_4_fastqc.html" }, - { assert process.out.zip[0][1][0] ==~ ".*/test_1_fastqc.zip" }, - { assert process.out.zip[0][1][1] ==~ ".*/test_2_fastqc.zip" }, - { assert process.out.zip[0][1][2] ==~ ".*/test_3_fastqc.zip" }, - { assert process.out.zip[0][1][3] ==~ ".*/test_4_fastqc.zip" }, - { assert path(process.out.html[0][1][0]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][1]).text.contains("File typeConventional base calls") }, 
- { assert path(process.out.html[0][1][2]).text.contains("File typeConventional base calls") }, - { assert path(process.out.html[0][1][3]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 custom_prefix") { - - when { - process { - """ - input[0] = Channel.of([ - [ id:'mysample', single_end:true ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert process.out.html[0][1] ==~ ".*/mysample_fastqc.html" }, - { assert process.out.zip[0][1] ==~ ".*/mysample_fastqc.zip" }, - { assert path(process.out.html[0][1]).text.contains("File typeConventional base calls") }, - { assert snapshot(process.out.versions).match() } - ) - } - } - - test("sarscov2 single-end [fastq] - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [ id: 'test', single_end:true ], - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("sarscov2 paired-end [fastq] - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("sarscov2 interleaved [fastq] - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - 
file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_interleaved.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("sarscov2 paired-end [bam] - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/bam/test.paired_end.sorted.bam', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("sarscov2 multiple [fastq] - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [id: 'test', single_end: false], // meta map - [ file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_2.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_1.fastq.gz', checkIfExists: true), - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test2_2.fastq.gz', checkIfExists: true) ] - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } - - test("sarscov2 custom_prefix - stub") { - - options "-stub" - when { - process { - """ - input[0] = Channel.of([ - [ id:'mysample', single_end:true ], // meta map - file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) - ]) - """ - } - } - - then { - assertAll ( - { assert process.success }, - { assert snapshot(process.out).match() } - ) - } - } -} diff --git a/modules/nf-core/fastqc/tests/main.nf.test.snap b/modules/nf-core/fastqc/tests/main.nf.test.snap deleted file mode 100644 index 
d5db3092..00000000 --- a/modules/nf-core/fastqc/tests/main.nf.test.snap +++ /dev/null @@ -1,392 +0,0 @@ -{ - "sarscov2 custom_prefix": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:16.374038" - }, - "sarscov2 single-end [fastq] - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": true - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": true - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "test", - "single_end": true - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "test", - "single_end": true - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:24.993809" - }, - "sarscov2 custom_prefix - stub": { - "content": [ - { - "0": [ - [ - { - "id": "mysample", - "single_end": true - }, - "mysample.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "mysample", - "single_end": true - }, - "mysample.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "mysample", - "single_end": true - }, - "mysample.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "mysample", - "single_end": true - }, - "mysample.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:03:10.93942" - }, - "sarscov2 interleaved [fastq]": { 
- "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:01:42.355718" - }, - "sarscov2 paired-end [bam]": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:01:53.276274" - }, - "sarscov2 multiple [fastq]": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:05.527626" - }, - "sarscov2 paired-end [fastq]": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:01:31.188871" - }, - "sarscov2 paired-end [fastq] - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:34.273566" - }, - "sarscov2 multiple [fastq] - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - 
"versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:03:02.304411" - }, - "sarscov2 single-end [fastq]": { - "content": [ - [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:01:19.095607" - }, - "sarscov2 interleaved [fastq] - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:44.640184" - }, - "sarscov2 paired-end [bam] - stub": { - "content": [ - { - "0": [ - [ - { - "id": "test", - "single_end": false - }, - "test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "1": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "2": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "html": [ - [ - { - "id": "test", - "single_end": false - }, - 
"test.html:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "versions": [ - "versions.yml:md5,e1cc25ca8af856014824abd842e93978" - ], - "zip": [ - [ - { - "id": "test", - "single_end": false - }, - "test.zip:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ] - } - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.3" - }, - "timestamp": "2024-07-22T11:02:53.550742" - } -} \ No newline at end of file diff --git a/modules/nf-core/fastqc/tests/tags.yml b/modules/nf-core/fastqc/tests/tags.yml deleted file mode 100644 index 7834294b..00000000 --- a/modules/nf-core/fastqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -fastqc: - - modules/nf-core/fastqc/** diff --git a/modules/nf-core/multiqc/main.nf b/modules/nf-core/multiqc/main.nf deleted file mode 100644 index cc0643e1..00000000 --- a/modules/nf-core/multiqc/main.nf +++ /dev/null @@ -1,63 +0,0 @@ -process MULTIQC { - label 'process_single' - - conda "${moduleDir}/environment.yml" - container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? - 'https://depot.galaxyproject.org/singularity/multiqc:1.25.1--pyhdfd78af_0' : - 'biocontainers/multiqc:1.25.1--pyhdfd78af_0' }" - - input: - path multiqc_files, stageAs: "?/*" - path(multiqc_config) - path(extra_multiqc_config) - path(multiqc_logo) - path(replace_names) - path(sample_names) - - output: - path "*multiqc_report.html", emit: report - path "*_data" , emit: data - path "*_plots" , optional:true, emit: plots - path "versions.yml" , emit: versions - - when: - task.ext.when == null || task.ext.when - - script: - def args = task.ext.args ?: '' - def prefix = task.ext.prefix ? "--filename ${task.ext.prefix}.html" : '' - def config = multiqc_config ? "--config $multiqc_config" : '' - def extra_config = extra_multiqc_config ? "--config $extra_multiqc_config" : '' - def logo = multiqc_logo ? "--cl-config 'custom_logo: \"${multiqc_logo}\"'" : '' - def replace = replace_names ? 
"--replace-names ${replace_names}" : '' - def samples = sample_names ? "--sample-names ${sample_names}" : '' - """ - multiqc \\ - --force \\ - $args \\ - $config \\ - $prefix \\ - $extra_config \\ - $logo \\ - $replace \\ - $samples \\ - . - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ - - stub: - """ - mkdir multiqc_data - mkdir multiqc_plots - touch multiqc_report.html - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - multiqc: \$( multiqc --version | sed -e "s/multiqc, version //g" ) - END_VERSIONS - """ -} diff --git a/modules/nf-core/multiqc/meta.yml b/modules/nf-core/multiqc/meta.yml deleted file mode 100644 index b16c1879..00000000 --- a/modules/nf-core/multiqc/meta.yml +++ /dev/null @@ -1,78 +0,0 @@ -name: multiqc -description: Aggregate results from bioinformatics analyses across many samples into - a single report -keywords: - - QC - - bioinformatics tools - - Beautiful stand-alone HTML report -tools: - - multiqc: - description: | - MultiQC searches a given directory for analysis logs and compiles a HTML report. - It's a general use tool, perfect for summarising the output from numerous bioinformatics tools. - homepage: https://multiqc.info/ - documentation: https://multiqc.info/docs/ - licence: ["GPL-3.0-or-later"] - identifier: biotools:multiqc -input: - - - multiqc_files: - type: file - description: | - List of reports / files recognised by MultiQC, for example the html and zip output of FastQC - - - multiqc_config: - type: file - description: Optional config yml for MultiQC - pattern: "*.{yml,yaml}" - - - extra_multiqc_config: - type: file - description: Second optional config yml for MultiQC. Will override common sections - in multiqc_config. 
- pattern: "*.{yml,yaml}" - - - multiqc_logo: - type: file - description: Optional logo file for MultiQC - pattern: "*.{png}" - - - replace_names: - type: file - description: | - Optional two-column sample renaming file. First column a set of - patterns, second column a set of corresponding replacements. Passed via - MultiQC's `--replace-names` option. - pattern: "*.{tsv}" - - - sample_names: - type: file - description: | - Optional TSV file with headers, passed to the MultiQC --sample_names - argument. - pattern: "*.{tsv}" -output: - - report: - - "*multiqc_report.html": - type: file - description: MultiQC report file - pattern: "multiqc_report.html" - - data: - - "*_data": - type: directory - description: MultiQC data dir - pattern: "multiqc_data" - - plots: - - "*_plots": - type: file - description: Plots created by MultiQC - pattern: "*_data" - - versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" -authors: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" -maintainers: - - "@abhi18av" - - "@bunop" - - "@drpatelh" - - "@jfy133" diff --git a/modules/nf-core/multiqc/tests/main.nf.test b/modules/nf-core/multiqc/tests/main.nf.test deleted file mode 100644 index 33316a7d..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test +++ /dev/null @@ -1,92 +0,0 @@ -nextflow_process { - - name "Test Process MULTIQC" - script "../main.nf" - process "MULTIQC" - - tag "modules" - tag "modules_nfcore" - tag "multiqc" - - config "./nextflow.config" - - test("sarscov2 single-end [fastqc]") { - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = [] - input[2] = [] - input[3] = [] - input[4] = [] - input[5] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ 
".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_single") } - ) - } - - } - - test("sarscov2 single-end [fastqc] [config]") { - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = Channel.of(file("https://github.com/nf-core/tools/raw/dev/nf_core/pipeline-template/assets/multiqc_config.yml", checkIfExists: true)) - input[2] = [] - input[3] = [] - input[4] = [] - input[5] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert process.out.report[0] ==~ ".*/multiqc_report.html" }, - { assert process.out.data[0] ==~ ".*/multiqc_data" }, - { assert snapshot(process.out.versions).match("multiqc_versions_config") } - ) - } - } - - test("sarscov2 single-end [fastqc] - stub") { - - options "-stub" - - when { - process { - """ - input[0] = Channel.of(file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastqc/test_fastqc.zip', checkIfExists: true)) - input[1] = [] - input[2] = [] - input[3] = [] - input[4] = [] - input[5] = [] - """ - } - } - - then { - assertAll( - { assert process.success }, - { assert snapshot(process.out.report.collect { file(it).getName() } + - process.out.data.collect { file(it).getName() } + - process.out.plots.collect { file(it).getName() } + - process.out.versions ).match("multiqc_stub") } - ) - } - - } -} diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap deleted file mode 100644 index 2fcbb5ff..00000000 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ /dev/null @@ -1,41 +0,0 @@ -{ - "multiqc_versions_single": { - "content": [ - [ - "versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-10-02T17:51:46.317523" - }, - "multiqc_stub": { - "content": [ - [ - "multiqc_report.html", - 
"multiqc_data", - "multiqc_plots", - "versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-10-02T17:52:20.680978" - }, - "multiqc_versions_config": { - "content": [ - [ - "versions.yml:md5,41f391dcedce7f93ca188f3a3ffa0916" - ] - ], - "meta": { - "nf-test": "0.9.0", - "nextflow": "24.04.4" - }, - "timestamp": "2024-10-02T17:52:09.185842" - } -} \ No newline at end of file diff --git a/modules/nf-core/multiqc/tests/nextflow.config b/modules/nf-core/multiqc/tests/nextflow.config deleted file mode 100644 index c537a6a3..00000000 --- a/modules/nf-core/multiqc/tests/nextflow.config +++ /dev/null @@ -1,5 +0,0 @@ -process { - withName: 'MULTIQC' { - ext.prefix = null - } -} diff --git a/modules/nf-core/multiqc/tests/tags.yml b/modules/nf-core/multiqc/tests/tags.yml deleted file mode 100644 index bea6c0d3..00000000 --- a/modules/nf-core/multiqc/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -multiqc: - - modules/nf-core/multiqc/** diff --git a/modules/nf-core/sratools/fasterqdump/environment.yml b/modules/nf-core/sratools/fasterqdump/environment.yml new file mode 100644 index 00000000..dd0faa56 --- /dev/null +++ b/modules/nf-core/sratools/fasterqdump/environment.yml @@ -0,0 +1,8 @@ +name: sratools_fasterqdump +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::sra-tools=2.11.0 + - conda-forge::pigz=2.6 diff --git a/modules/nf-core/sratools/fasterqdump/main.nf b/modules/nf-core/sratools/fasterqdump/main.nf new file mode 100644 index 00000000..e7cf157a --- /dev/null +++ b/modules/nf-core/sratools/fasterqdump/main.nf @@ -0,0 +1,55 @@ +process SRATOOLS_FASTERQDUMP { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' : + 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" + + input: + tuple val(meta), path(sra) + path ncbi_settings + path certificate + + output: + tuple val(meta), path('*.fastq.gz'), emit: reads + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def outfile = meta.single_end ? "${prefix}.fastq" : prefix + def key_file = '' + if (certificate.toString().endsWith('.jwt')) { + key_file += " --perm ${certificate}" + } else if (certificate.toString().endsWith('.ngc')) { + key_file += " --ngc ${certificate}" + } + """ + export NCBI_SETTINGS="\$PWD/${ncbi_settings}" + + fasterq-dump \\ + $args \\ + --threads $task.cpus \\ + --outfile $outfile \\ + ${key_file} \\ + ${sra} + + pigz \\ + $args2 \\ + --no-name \\ + --processes $task.cpus \\ + *.fastq + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + sratools: \$(fasterq-dump --version 2>&1 | grep -Eo '[0-9.]+') + pigz: \$( pigz --version 2>&1 | sed 's/pigz //g' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/sratools/fasterqdump/meta.yml b/modules/nf-core/sratools/fasterqdump/meta.yml new file mode 100644 index 00000000..b5e0175a --- /dev/null +++ b/modules/nf-core/sratools/fasterqdump/meta.yml @@ -0,0 +1,53 @@ +name: sratools_fasterqdump +description: Extract sequencing reads in FASTQ format from a given NCBI Sequence Read Archive (SRA). 
+keywords: + - sequencing + - FASTQ + - dump +tools: + - sratools: + description: SRA Toolkit and SDK from NCBI + homepage: https://github.com/ncbi/sra-tools + documentation: https://github.com/ncbi/sra-tools/wiki + tool_dev_url: https://github.com/ncbi/sra-tools + licence: ["Public Domain"] +input: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - sra: + type: directory + description: Directory containing ETL data for the given SRA. + pattern: "*/*.sra" + - ncbi_settings: + type: file + description: > + An NCBI user settings file. + + pattern: "*.mkfg" + - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + + pattern: "*.cart" +output: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - reads: + type: file + description: Extracted FASTQ file or files if the sequencing reads are paired-end. 
+ pattern: "*.fastq.gz" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/modules/nf-core/sratools/fasterqdump/nextflow.config b/modules/nf-core/sratools/fasterqdump/nextflow.config new file mode 100644 index 00000000..f98b140d --- /dev/null +++ b/modules/nf-core/sratools/fasterqdump/nextflow.config @@ -0,0 +1,10 @@ +process { + withName: SRATOOLS_FASTERQDUMP { + ext.args = '--split-files --include-technical' + publishDir = [ + path: { "${params.outdir}/fastq" }, + mode: params.publish_dir_mode, + pattern: "*.fastq.gz" + ] + } +} \ No newline at end of file diff --git a/modules/nf-core/sratools/fasterqdump/sratools-fasterqdump.diff b/modules/nf-core/sratools/fasterqdump/sratools-fasterqdump.diff new file mode 100644 index 00000000..089862bf --- /dev/null +++ b/modules/nf-core/sratools/fasterqdump/sratools-fasterqdump.diff @@ -0,0 +1,62 @@ +Changes in module 'nf-core/sratools/fasterqdump' +--- modules/nf-core/sratools/fasterqdump/main.nf ++++ modules/nf-core/sratools/fasterqdump/main.nf +@@ -4,8 +4,8 @@ + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+- 'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:2f4a4c900edd6801ff0068c2b3048b4459d119eb-0' : +- 'biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:2f4a4c900edd6801ff0068c2b3048b4459d119eb-0' }" ++ 'https://depot.galaxyproject.org/singularity/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' : ++ 'quay.io/biocontainers/mulled-v2-5f89fe0cd045cb1d615630b9261a1d17943a9b6a:6a9ff0e76ec016c3d0d27e0c0d362339f2d787e6-0' }" + + input: + tuple val(meta), path(sra) + +--- /dev/null ++++ modules/nf-core/sratools/fasterqdump/nextflow.config +@@ -0,0 +1,10 @@ ++process { ++ withName: SRATOOLS_FASTERQDUMP { ++ ext.args = '--split-files --include-technical' ++ publishDir = [ ++ path: { "${params.outdir}/fastq" }, ++ mode: params.publish_dir_mode, ++ pattern: "*.fastq.gz" ++ ] ++ } ++} +--- modules/nf-core/sratools/fasterqdump/environment.yml ++++ modules/nf-core/sratools/fasterqdump/environment.yml +@@ -4,5 +4,5 @@ + - bioconda + - defaults + dependencies: +- - bioconda::sra-tools=3.0.8 ++ - bioconda::sra-tools=2.11.0 + - conda-forge::pigz=2.6 + +--- modules/nf-core/sratools/fasterqdump/tests/main.nf.test ++++ modules/nf-core/sratools/fasterqdump/tests/main.nf.test +@@ -3,11 +3,8 @@ + script "../main.nf" + config "./nextflow.config" + process "SRATOOLS_FASTERQDUMP" +- tag "modules" +- tag "modules_nfcore" +- tag "untar" +- tag "sratools" +- tag "sratools/fasterqdump" ++ ++ tag "UNTAR" + + test("Single-end") { + + +--- modules/nf-core/sratools/fasterqdump/tests/tags.yml ++++ /dev/null +@@ -1,2 +0,0 @@ +-sratools/fasterqdump: +- - modules/nf-core/sratools/fasterqdump/** + +************************************************************ diff --git a/modules/nf-core/sratools/fasterqdump/tests/main.nf.test b/modules/nf-core/sratools/fasterqdump/tests/main.nf.test new file mode 100644 index 00000000..695394d4 --- /dev/null +++ 
b/modules/nf-core/sratools/fasterqdump/tests/main.nf.test @@ -0,0 +1,70 @@ +nextflow_process { + name "Test Process SRATOOLS_FASTERQDUMP" + script "../main.nf" + config "./nextflow.config" + process "SRATOOLS_FASTERQDUMP" + + tag "UNTAR" + + test("Single-end") { + + setup { + run("UNTAR") { + script "modules/nf-core/untar/main.nf" + process { + """ + input[0] = Channel.of([ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/sra/SRR13255544.tar.gz', checkIfExists: true) ]) + """ + } + } + } + + when { + process { + """ + input[0] = UNTAR.out.untar.map{ meta, files -> [ [ id:'test_single_end', single_end:true ], files]} + input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } + + test("Paired-end") { + + setup { + run("UNTAR") { + script "modules/nf-core/untar/main.nf" + process { + """ + input[0] = Channel.of([ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/sra/SRR11140744.tar.gz', checkIfExists: true) ]) + """ + } + } + } + + when { + process { + """ + input[0] = UNTAR.out.untar.map{ meta, files -> [ [ id:'test_paired_end', single_end:false ], files]} + input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) + input[2] = [] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/sratools/fasterqdump/tests/main.nf.test.snap b/modules/nf-core/sratools/fasterqdump/tests/main.nf.test.snap new file mode 100644 index 00000000..ce0f9800 --- /dev/null +++ b/modules/nf-core/sratools/fasterqdump/tests/main.nf.test.snap @@ -0,0 +1,78 @@ +{ + "Single-end": { + "content": [ + { + "0": [ + [ + { + "id": "test_single_end", + "single_end": true + }, + 
"test_single_end.fastq.gz:md5,674d78c1cc3c1308d6d39d6369a42887" + ] + ], + "1": [ + "versions.yml:md5,6ff2d50b15c3f0eb9c72cd13a4a20295" + ], + "reads": [ + [ + { + "id": "test_single_end", + "single_end": true + }, + "test_single_end.fastq.gz:md5,674d78c1cc3c1308d6d39d6369a42887" + ] + ], + "versions": [ + "versions.yml:md5,6ff2d50b15c3f0eb9c72cd13a4a20295" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-28T15:25:52.837288" + }, + "Paired-end": { + "content": [ + { + "0": [ + [ + { + "id": "test_paired_end", + "single_end": false + }, + [ + "test_paired_end_1.fastq.gz:md5,8573015c91d099b6e30789f8bab2f43c", + "test_paired_end_2.fastq.gz:md5,37e6f719a022dc3c9994c80fbc20c311" + ] + ] + ], + "1": [ + "versions.yml:md5,6ff2d50b15c3f0eb9c72cd13a4a20295" + ], + "reads": [ + [ + { + "id": "test_paired_end", + "single_end": false + }, + [ + "test_paired_end_1.fastq.gz:md5,8573015c91d099b6e30789f8bab2f43c", + "test_paired_end_2.fastq.gz:md5,37e6f719a022dc3c9994c80fbc20c311" + ] + ] + ], + "versions": [ + "versions.yml:md5,6ff2d50b15c3f0eb9c72cd13a4a20295" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-28T15:26:42.466223" + } +} \ No newline at end of file diff --git a/modules/nf-core/sratools/fasterqdump/tests/nextflow.config b/modules/nf-core/sratools/fasterqdump/tests/nextflow.config new file mode 100644 index 00000000..23e4100b --- /dev/null +++ b/modules/nf-core/sratools/fasterqdump/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: SRATOOLS_FASTERQDUMP { + ext.args = '' + } +} \ No newline at end of file diff --git a/modules/nf-core/sratools/prefetch/environment.yml b/modules/nf-core/sratools/prefetch/environment.yml new file mode 100644 index 00000000..0abad336 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/environment.yml @@ -0,0 +1,8 @@ +name: sratools_prefetch +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - 
bioconda::sra-tools=3.1.0 + - conda-forge::curl=8.5.0 diff --git a/modules/nf-core/sratools/prefetch/main.nf b/modules/nf-core/sratools/prefetch/main.nf new file mode 100644 index 00000000..170f1753 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/main.nf @@ -0,0 +1,35 @@ +process SRATOOLS_PREFETCH { + tag "$id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/sra-tools:3.1.0--h9f5acd7_0' : + 'biocontainers/sra-tools:3.1.0--h9f5acd7_0' }" + + input: + tuple val(meta), val(id) + path ncbi_settings + path certificate + + output: + tuple val(meta), path(id), emit: sra + path 'versions.yml' , emit: versions + + when: + task.ext.when == null || task.ext.when + + shell: + args = task.ext.args ?: '' + args2 = task.ext.args2 ?: '5 1 100' // + if (certificate) { + if (certificate.toString().endsWith('.jwt')) { + args += " --perm ${certificate}" + } + else if (certificate.toString().endsWith('.ngc')) { + args += " --ngc ${certificate}" + } + } + + template 'retry_with_backoff.sh' +} diff --git a/modules/nf-core/sratools/prefetch/meta.yml b/modules/nf-core/sratools/prefetch/meta.yml new file mode 100644 index 00000000..ff54229f --- /dev/null +++ b/modules/nf-core/sratools/prefetch/meta.yml @@ -0,0 +1,56 @@ +name: sratools_prefetch +description: Download sequencing data from the NCBI Sequence Read Archive (SRA). +keywords: + - sequencing + - fastq + - prefetch +tools: + - sratools: + description: SRA Toolkit and SDK from NCBI + homepage: https://github.com/ncbi/sra-tools + documentation: https://github.com/ncbi/sra-tools/wiki + tool_dev_url: https://github.com/ncbi/sra-tools + licence: ["Public Domain"] +input: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - id: + type: string + description: > + A string denoting an SRA id. 
+ + - ncbi_settings: + type: file + description: > + An NCBI user settings file. + + pattern: "*.mkfg" + - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + + pattern: "*.cart" +output: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - sra: + type: directory + description: > + Directory containing the ETL data for the given SRA id. + + pattern: "*/*.sra" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Midnighter" +maintainers: + - "@Midnighter" diff --git a/modules/nf-core/sratools/prefetch/nextflow.config b/modules/nf-core/sratools/prefetch/nextflow.config new file mode 100644 index 00000000..a2ca8848 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/nextflow.config @@ -0,0 +1,8 @@ +process { + withName: SRATOOLS_PREFETCH { + publishDir = [ + path: { "${params.outdir}/sra" }, + enabled: false + ] + } +} \ No newline at end of file diff --git a/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh new file mode 100755 index 00000000..bfee6070 --- /dev/null +++ b/modules/nf-core/sratools/prefetch/templates/retry_with_backoff.sh @@ -0,0 +1,65 @@ +#!/usr/bin/env bash + +set -u + +retry_with_backoff() { + local max_attempts=${1} + local delay=${2} + local max_time=${3} + local attempt=1 + local output= + local status= + + # Remove the first three arguments to this function in order to access + # the 'real' command with `${@}`. + shift 3 + + while [ ${attempt} -le ${max_attempts} ]; do + output=$("${@}") + status=${?} + + if [ ${status} -eq 0 ]; then + break + fi + + if [ ${attempt} -lt ${max_attempts} ]; then + echo "Failed attempt ${attempt} of ${max_attempts}. Retrying in ${delay} s." 
>&2 + sleep ${delay} + elif [ ${attempt} -eq ${max_attempts} ]; then + echo "Failed after ${attempt} attempts." >&2 + return ${status} + fi + + attempt=$(( ${attempt} + 1 )) + delay=$(( ${delay} * 2 )) + if [ ${delay} -ge ${max_time} ]; then + delay=${max_time} + fi + done + + echo "${output}" +} + +export NCBI_SETTINGS="$PWD/!{ncbi_settings}" + +retry_with_backoff !{args2} \ + prefetch \ + !{args} \ + !{id} + +# check file integrity using vdb-validate or (when archive contains no checksums) md5sum +vdb-validate !{id} > vdb-validate_result.txt 2>&1 || exit 1 +if grep -q "checksums missing" vdb-validate_result.txt; then + VALID_MD5SUMS=$(curl --silent --fail --location --retry 3 --retry-delay 60 'https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?filetype=run&acc=!{id}') + LOCAL_MD5SUMS=$(md5sum !{id}/* | cut -f1 -d' ') + if ! grep -q -F -f <(echo "$LOCAL_MD5SUMS") <(echo "$VALID_MD5SUMS"); then + echo "MD5 sum check failed" 1>&2 + exit 1 + fi +fi + +cat <<-END_VERSIONS > versions.yml +"!{task.process}": + sratools: $(prefetch --version 2>&1 | grep -Eo '[0-9.]+') + curl: $(curl --version | head -n 1 | sed 's/^curl //; s/ .*$//') +END_VERSIONS diff --git a/modules/nf-core/sratools/prefetch/tests/main.nf.test b/modules/nf-core/sratools/prefetch/tests/main.nf.test new file mode 100644 index 00000000..f59de7cb --- /dev/null +++ b/modules/nf-core/sratools/prefetch/tests/main.nf.test @@ -0,0 +1,49 @@ +nextflow_process { + name "Test Process SRATOOLS_PREFETCH" + script "../main.nf" + process "SRATOOLS_PREFETCH" + tag "modules" + tag "modules_nfcore" + tag "sratools" + tag "sratools/prefetch" + + test("sratools/prefetch") { + + when { + process { + """ + input[0] = Channel.of([ [ id:'test', single_end:false ], 'DRR000774' ]) + input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) + input[2] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } 
+ + test("sratools/prefetch with sralite") { + + when { + process { + """ + input[0] = Channel.of([ [ id:'test', single_end:false ], 'SRR1170046' ]) + input[1] = file(params.modules_testdata_base_path + 'generic/config/ncbi_user_settings.mkfg', checkIfExists: true) + input[2] = [] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + } +} diff --git a/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap b/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap new file mode 100644 index 00000000..67795ebb --- /dev/null +++ b/modules/nf-core/sratools/prefetch/tests/main.nf.test.snap @@ -0,0 +1,80 @@ +{ + "sratools/prefetch with sralite": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "SRR1170046.sralite:md5,7acfce556ca0951aff49d780899c105b" + ] + ] + ], + "1": [ + "versions.yml:md5,83d1b23f5ff5b2ad1b96d17d7d7594ee" + ], + "sra": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "SRR1170046.sralite:md5,7acfce556ca0951aff49d780899c105b" + ] + ] + ], + "versions": [ + "versions.yml:md5,83d1b23f5ff5b2ad1b96d17d7d7594ee" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:49:02.309737" + }, + "sratools/prefetch": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "DRR000774.sra:md5,7647dba20c89c0e3d7ad13842f060eb0" + ] + ] + ], + "1": [ + "versions.yml:md5,83d1b23f5ff5b2ad1b96d17d7d7594ee" + ], + "sra": [ + [ + { + "id": "test", + "single_end": false + }, + [ + "DRR000774.sra:md5,7647dba20c89c0e3d7ad13842f060eb0" + ] + ] + ], + "versions": [ + "versions.yml:md5,83d1b23f5ff5b2ad1b96d17d7d7594ee" + ] + } + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:48:37.428307" + } +} diff --git a/modules/nf-core/sratools/prefetch/tests/tags.yml b/modules/nf-core/sratools/prefetch/tests/tags.yml new file mode 100644 index 
00000000..52110bfd --- /dev/null +++ b/modules/nf-core/sratools/prefetch/tests/tags.yml @@ -0,0 +1,2 @@ +sratools/prefetch: + - modules/nf-core/sratools/prefetch/** diff --git a/modules/nf-core/untar/environment.yml b/modules/nf-core/untar/environment.yml new file mode 100644 index 00000000..0c9cbb10 --- /dev/null +++ b/modules/nf-core/untar/environment.yml @@ -0,0 +1,11 @@ +name: untar + +channels: + - conda-forge + - bioconda + - defaults + +dependencies: + - conda-forge::grep=3.11 + - conda-forge::sed=4.7 + - conda-forge::tar=1.34 diff --git a/modules/nf-core/untar/main.nf b/modules/nf-core/untar/main.nf new file mode 100644 index 00000000..8a75bb95 --- /dev/null +++ b/modules/nf-core/untar/main.nf @@ -0,0 +1,63 @@ +process UNTAR { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$prefix"), emit: untar + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args2 = task.ext.args2 ?: '' + prefix = task.ext.prefix ?: ( meta.id ? 
"${meta.id}" : archive.baseName.toString().replaceFirst(/\.tar$/, "")) + + """ + mkdir $prefix + + ## Ensures --strip-components only applied when top level of tar contents is a directory + ## If just files or multiple directories, place all in prefix + if [[ \$(tar -taf ${archive} | grep -o -P "^.*?\\/" | uniq | wc -l) -eq 1 ]]; then + tar \\ + -C $prefix --strip-components 1 \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + else + tar \\ + -C $prefix \\ + -xavf \\ + $args \\ + $archive \\ + $args2 + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.toString().replaceFirst(/\.[^\.]+(.gz)?$/, "")) + """ + mkdir $prefix + touch ${prefix}/file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + untar: \$(echo \$(tar --version 2>&1) | sed 's/^.*(GNU tar) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/untar/meta.yml b/modules/nf-core/untar/meta.yml new file mode 100644 index 00000000..a9a2110f --- /dev/null +++ b/modules/nf-core/untar/meta.yml @@ -0,0 +1,46 @@ +name: untar +description: Extract files. +keywords: + - untar + - uncompress + - extract +tools: + - untar: + description: | + Extract tar.gz files. + documentation: https://www.gnu.org/software/tar/manual/ + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be untar + pattern: "*.{tar}.{gz}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - untar: + type: directory + description: Directory containing contents of archive + pattern: "*/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@matthdsm" + - "@jfy133" diff --git a/modules/nf-core/untar/tests/main.nf.test b/modules/nf-core/untar/tests/main.nf.test new file mode 100644 index 00000000..98b769ad --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test @@ -0,0 +1,45 @@ +nextflow_process { + + name "Test Process UNTAR" + script "../main.nf" + process "UNTAR" + + test("test_untar") { + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'genomics/sarscov2/genome/db/kraken2.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar") }, + ) + } + + } + + test("test_untar_onlyfiles") { + + when { + process { + """ + input[0] = [ [], file(params.modules_testdata_base_path + 'generic/tar/hello.tar.gz', checkIfExists: true) ] + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot(process.out.untar).match("test_untar_onlyfiles") }, + ) + } + + } + +} diff --git a/modules/nf-core/untar/tests/main.nf.test.snap b/modules/nf-core/untar/tests/main.nf.test.snap new file mode 100644 index 00000000..64550292 --- /dev/null +++ b/modules/nf-core/untar/tests/main.nf.test.snap @@ -0,0 +1,42 @@ +{ + "test_untar_onlyfiles": { + "content": [ + [ + [ + [ + + ], + [ + "hello.txt:md5,e59ff97941044f85df5297e1c302d260" + ] + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:49:41.320643" + }, + "test_untar": { + "content": [ + [ + [ + [ + + ], + [ + "hash.k2d:md5,8b8598468f54a7087c203ad0190555d9", + 
"opts.k2d:md5,a033d00cf6759407010b21700938f543", + "taxo.k2d:md5,094d5891cdccf2f1468088855c214b2c" + ] + ] + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:49:33.795172" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 1b2505f9..d4c92c7a 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,21 +9,15 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags // Input options - input = null - - // References - genome = null - igenomes_base = 's3://ngi-igenomes/igenomes/' - igenomes_ignore = false - - // MultiQC options - multiqc_config = null - multiqc_title = null - multiqc_logo = null - max_multiqc_email_size = '25.MB' - multiqc_methods_description = null + input = null + nf_core_pipeline = null + nf_core_rnaseq_strandedness = 'auto' + ena_metadata_fields = null + sample_mapping_fields = 'experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description' + download_method = 'ftp' + skip_fastq_download = false + dbgap_key = null // Boilerplate options outdir = null @@ -40,16 +34,20 @@ params { pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/' // Config options - config_profile_name = null - config_profile_description = null + config_profile_name = null + config_profile_description = null - custom_config_version = 'master' - custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" - config_profile_contact = null - config_profile_url = null + custom_config_version = 'master' + custom_config_base = "https://raw.githubusercontent.com/nf-core/configs/${params.custom_config_version}" + config_profile_contact = null + config_profile_url = null // Schema validation default options - validate_params = true + validate_params = true + + // Deprecated options + // See: 
https://github.com/nf-core/fetchngs/pull/279/files#r1494459480 + force_sratools_download = false } // Load base.config by default for all pipelines @@ -154,8 +152,8 @@ profiles { executor.cpus = 4 executor.memory = 8.GB } - test { includeConfig 'conf/test.config' } - test_full { includeConfig 'conf/test_full.config' } + test { includeConfig 'conf/test.config' } + test_full { includeConfig 'conf/test_full.config' } } // Load nf-core custom profiles from different Institutions @@ -174,9 +172,6 @@ podman.registry = 'quay.io' singularity.registry = 'quay.io' charliecloud.registry = 'quay.io' -// Load igenomes.config if required -includeConfig !params.igenomes_ignore ? 'conf/igenomes.config' : 'conf/igenomes_ignored.config' - // Export these variables to prevent local Python/R libraries from conflicting with those in the container // The JULIA depot path has been adjusted to a fixed path `/usr/local/share/julia` that needs to be used for packages in the container. // See https://apeltzer.github.io/post/03-julia-lang-nextflow/ for details on that. Once we have a common agreement on where to keep Julia packages, this is adjustable. 
@@ -227,7 +222,7 @@ manifest { mainScript = 'main.nf' nextflowVersion = '!>=24.04.2' version = '1.13.0dev' - doi = '' + doi = '10.5281/zenodo.5070524' } // Nextflow plugins @@ -267,4 +262,4 @@ validation { } // Load modules.config for DSL2 module specific options -includeConfig 'conf/modules.config' +includeConfig './workflows/sra/nextflow.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index d88fc3f9..68671c68 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -18,67 +18,67 @@ "exists": true, "schema": "assets/schema_input.json", "mimetype": "text/csv", - "pattern": "^\\S+\\.csv$", - "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/fetchngs/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "pattern": "^\\S+\\.(csv|tsv|txt)$", + "fa_icon": "fas fa-file-excel", + "description": "File containing SRA/ENA/GEO/DDBJ identifiers one per line to download their associated metadata and FastQ files." }, - "outdir": { + "ena_metadata_fields": { "type": "string", - "format": "directory-path", - "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", - "fa_icon": "fas fa-folder-open" + "fa_icon": "fas fa-columns", + "description": "Comma-separated list of ENA metadata fields to fetch before downloading data.", + "help_text": "The default list of fields used by the pipeline can be found at the top of the [`bin/sra_ids_to_runinfo.py`](https://github.com/nf-core/fetchngs/blob/master/bin/sra_ids_to_runinfo.py) script within the pipeline repo. This pipeline requires a minimal set of fields to download FastQ files i.e. 
`'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5'`. Full list of accepted metadata fields can be obtained from the [ENA API](https://www.ebi.ac.uk/ena/portal/api/returnFields?dataPortal=ena&format=tsv&result=read_run)." }, - "email": { + "sample_mapping_fields": { "type": "string", - "description": "Email address for completion summary.", - "fa_icon": "fas fa-envelope", - "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", - "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" + "fa_icon": "fas fa-columns", + "description": "Comma-separated list of ENA metadata fields used to create a separate 'id_mappings.csv' and 'multiqc_config.yml' with selected fields that can be used to rename samples in general and in MultiQC.", + "default": "experiment_accession,run_accession,sample_accession,experiment_alias,run_alias,sample_alias,experiment_title,sample_title,sample_description" }, - "multiqc_title": { + "nf_core_pipeline": { "type": "string", - "description": "MultiQC report title. Printed as page header, used for filename if not otherwise specified.", - "fa_icon": "fas fa-file-signature" - } - } - }, - "reference_genome_options": { - "title": "Reference genome options", - "type": "object", - "fa_icon": "fas fa-dna", - "description": "Reference genome related files and options required for the workflow.", - "properties": { - "genome": { + "fa_icon": "fab fa-apple", + "description": "Name of supported nf-core pipeline e.g. 'rnaseq'. 
A samplesheet for direct use with the pipeline will be created with the appropriate columns.", + "enum": ["rnaseq", "atacseq", "viralrecon", "taxprofiler"] + }, + "nf_core_rnaseq_strandedness": { "type": "string", - "description": "Name of iGenomes reference.", - "fa_icon": "fas fa-book", - "help_text": "If using a reference genome configured in the pipeline using iGenomes, use this parameter to give the ID for the reference. This is then used to build the full paths for all required reference genome files e.g. `--genome GRCh38`. \n\nSee the [nf-core website docs](https://nf-co.re/usage/reference_genomes) for more details." + "fa_icon": "fas fa-dna", + "description": "Value for 'strandedness' entry added to samplesheet created when using '--nf_core_pipeline rnaseq'.", + "help_text": "The default is 'auto' which can be used with nf-core/rnaseq v3.10 onwards to auto-detect strandedness during the pipeline execution.", + "default": "auto" }, - "fasta": { + "download_method": { "type": "string", - "format": "file-path", - "exists": true, - "mimetype": "text/plain", - "pattern": "^\\S+\\.fn?a(sta)?(\\.gz)?$", - "description": "Path to FASTA genome file.", - "help_text": "This parameter is *mandatory* if `--genome` is not specified. If you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.", - "fa_icon": "far fa-file-code" + "default": "ftp", + "fa_icon": "fas fa-download", + "enum": ["aspera", "ftp", "sratools"], + "description": "Method to download FastQ files. Available options are 'aspera', 'ftp' or 'sratools'. Default is 'ftp'.", + "help_text": "FTP and Aspera CLI download FastQ files directly from the ENA FTP whereas sratools uses sra-tools to download *.sra files and convert to FastQ." 
}, - "igenomes_ignore": { + "skip_fastq_download": { "type": "boolean", - "description": "Do not load the iGenomes reference config.", - "fa_icon": "fas fa-ban", - "hidden": true, - "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + "fa_icon": "fas fa-fast-forward", + "description": "Only download metadata for public data database ids and don't download the FastQ files." + }, + "dbgap_key": { + "type": "string", + "fa_icon": "fas fa-address-card", + "help_text": "Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit. Users with granted access to controlled data can download the JWT cart file for the study from the SRA Run Selector upon logging in. The JWT file can only be used on cloud platforms and is valid for 1 hour upon creation.", + "format": "file-path", + "description": "dbGaP repository key." }, - "igenomes_base": { + "outdir": { "type": "string", "format": "directory-path", - "description": "The base path to the igenomes reference files", - "fa_icon": "fas fa-ban", - "hidden": true, - "default": "s3://ngi-igenomes/igenomes/" + "description": "The output directory where the results will be saved. You have to use absolute paths to storage on Cloud infrastructure.", + "fa_icon": "fas fa-folder-open" + }, + "email": { + "type": "string", + "description": "Email address for completion summary.", + "fa_icon": "fas fa-envelope", + "help_text": "Set this parameter to your e-mail address to get a summary e-mail with details of the run sent to you when the workflow exits. 
If set in your user config file (`~/.nextflow/config`) then you don't need to specify this on the command line for every run.", + "pattern": "^([a-zA-Z0-9_\\-\\.]+)@([a-zA-Z0-9_\\-\\.]+)\\.([a-zA-Z]{2,5})$" } } }, @@ -166,14 +166,6 @@ "fa_icon": "fas fa-remove-format", "hidden": true }, - "max_multiqc_email_size": { - "type": "string", - "description": "File size limit when attaching MultiQC reports to summary emails.", - "pattern": "^\\d+(\\.\\d+)?\\.?\\s*(K|M|G|T)?B$", - "default": "25.MB", - "fa_icon": "fas fa-file-upload", - "hidden": true - }, "monochrome_logs": { "type": "boolean", "description": "Do not use coloured log outputs.", @@ -187,24 +179,6 @@ "help_text": "Incoming hook URL for messaging service. Currently, MS Teams and Slack are supported.", "hidden": true }, - "multiqc_config": { - "type": "string", - "format": "file-path", - "description": "Custom config file to supply to MultiQC.", - "fa_icon": "fas fa-cog", - "hidden": true - }, - "multiqc_logo": { - "type": "string", - "description": "Custom logo file to supply to MultiQC. File name must also be set in the MultiQC config file", - "fa_icon": "fas fa-image", - "hidden": true - }, - "multiqc_methods_description": { - "type": "string", - "description": "Custom MultiQC yaml file containing HTML including a methods description.", - "fa_icon": "fas fa-cog" - }, "validate_params": { "type": "boolean", "description": "Boolean whether to validate parameters against the schema at runtime", @@ -220,20 +194,36 @@ "hidden": true } } + }, + "deprecated_options": { + "title": "Deprecated options", + "type": "object", + "description": "List of parameters that have been deprecated.", + "default": "", + "fa_icon": "fas fa-calendar-times", + "properties": { + "force_sratools_download": { + "type": "boolean", + "fa_icon": "fas fa-times-circle", + "description": "This parameter has been deprecated. 
Please use '--download_method sratools' instead.", + "enum": [false], + "hidden": true + } + } } }, "allOf": [ { "$ref": "#/$defs/input_output_options" }, - { - "$ref": "#/$defs/reference_genome_options" - }, { "$ref": "#/$defs/institutional_config_options" }, { "$ref": "#/$defs/generic_options" + }, + { + "$ref": "#/$defs/deprecated_options" } ] } diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 00000000..6f5e2c47 --- /dev/null +++ b/nf-test.config @@ -0,0 +1,10 @@ +config { + // Location of nf-tests + testsDir "." + + // nf-test directory used to create temporary files for each test + workDir System.getenv("NFT_WORKDIR") ?: ".nf-test" + + // Location of an optional nextflow.config file specific for executing pipeline tests + configFile "tests/nextflow.config" +} diff --git a/subworkflows/local/utils_nfcore_fetchngs_pipeline/main.nf b/subworkflows/local/utils_nfcore_fetchngs_pipeline/main.nf index d5794430..666b4492 100644 --- a/subworkflows/local/utils_nfcore_fetchngs_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_fetchngs_pipeline/main.nf @@ -3,19 +3,19 @@ // /* -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - IMPORT FUNCTIONS / MODULES / SUBWORKFLOWS -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +======================================================================================== + IMPORT MODULES/SUBWORKFLOWS +======================================================================================== */ -include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' -include { paramsSummaryMap } from 'plugin/nf-schema' -include { samplesheetToList } from 'plugin/nf-schema' include { completionEmail } from '../../nf-core/utils_nfcore_pipeline' include { completionSummary } from '../../nf-core/utils_nfcore_pipeline' include { imNotification } from '../../nf-core/utils_nfcore_pipeline' -include { UTILS_NFCORE_PIPELINE } from 
'../../nf-core/utils_nfcore_pipeline' +include { paramsSummaryMap } from 'plugin/nf-schema' +include { samplesheetToList } from 'plugin/nf-schema' include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipeline' +include { UTILS_NFCORE_PIPELINE } from '../../nf-core/utils_nfcore_pipeline' +include { UTILS_NFSCHEMA_PLUGIN } from '../../nf-core/utils_nfschema_plugin' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -26,17 +26,16 @@ include { UTILS_NEXTFLOW_PIPELINE } from '../../nf-core/utils_nextflow_pipelin workflow PIPELINE_INITIALISATION { take: - version // boolean: Display version and exit - validate_params // boolean: Boolean whether to validate parameters against the schema at runtime - monochrome_logs // boolean: Do not use coloured log outputs - nextflow_cli_args // array: List of positional nextflow CLI args - outdir // string: The output directory where the results will be saved - input // string: Path to input samplesheet + version // boolean: Display version and exit + help // boolean: Display help text + validate_params // boolean: Boolean whether to validate parameters against the schema at runtime + monochrome_logs // boolean: Do not use coloured log outputs + nextflow_cli_args // array: List of positional nextflow CLI args + outdir // string: The output directory where the results will be saved + input // string: File containing SRA/ENA/GEO/DDBJ identifiers one per line to download their associated metadata and FastQ files main: - ch_versions = Channel.empty() - // // Print version and exit if required and dump pipeline parameters to JSON file // @@ -64,37 +63,25 @@ workflow PIPELINE_INITIALISATION { ) // - // Custom validation for pipeline parameters - // - validateInputParameters() - - // - // Create channel from input file provided through params.input + // Auto-detect input id type // + ch_input = file(input) + if (isSraId(ch_input)) { + sraCheckENAMetadataFields(ena_metadata_fields) 
+ } else { + error('Ids provided via --input not recognised please make sure they are either SRA / ENA / GEO / DDBJ ids!') + } + // Read in ids from --input file Channel - .fromList(samplesheetToList(params.input, "${projectDir}/assets/schema_input.json")) - .map { - meta, fastq_1, fastq_2 -> - if (!fastq_2) { - return [ meta.id, meta + [ single_end:true ], [ fastq_1 ] ] - } else { - return [ meta.id, meta + [ single_end:false ], [ fastq_1, fastq_2 ] ] - } - } - .groupTuple() - .map { samplesheet -> - validateInputSamplesheet(samplesheet) - } - .map { - meta, fastqs -> - return [ meta, fastqs.flatten() ] - } - .set { ch_samplesheet } + .from(ch_input) + .splitCsv(header:false, sep:'', strip:true) + .map { it[0] } + .unique() + .set { ch_ids } emit: - samplesheet = ch_samplesheet - versions = ch_versions + ids = ch_ids } /* @@ -112,7 +99,6 @@ workflow PIPELINE_COMPLETION { outdir // path: Path to output directory where results will be published monochrome_logs // boolean: Disable ANSI colour codes in log output hook_url // string: hook URL for notifications - multiqc_report // string: Path to MultiQC report main: summary_params = paramsSummaryMap(workflow, parameters_schema: "nextflow_schema.json") @@ -128,8 +114,7 @@ workflow PIPELINE_COMPLETION { email_on_fail, plaintext_email, outdir, - monochrome_logs, - multiqc_report.toList() + monochrome_logs ) } @@ -137,6 +122,8 @@ workflow PIPELINE_COMPLETION { if (hook_url) { imNotification(summary_params, hook_url) } + + sraCurateSamplesheetWarn() } workflow.onError { @@ -149,115 +136,55 @@ workflow PIPELINE_COMPLETION { FUNCTIONS ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// -// Check and validate pipeline parameters -// -def validateInputParameters() { - genomeExistsError() -} // -// Validate channels from input samplesheet +// Check if input ids are from the SRA // -def validateInputSamplesheet(input) { - def (metas, fastqs) = input[1..2] - - // Check that multiple runs 
of the same sample are of the same datatype i.e. single-end / paired-end - def endedness_ok = metas.collect{ meta -> meta.single_end }.unique().size == 1 - if (!endedness_ok) { - error("Please check input samplesheet -> Multiple runs of a sample must be of the same datatype i.e. single-end or paired-end: ${metas[0].id}") +def isSraId(input) { + def is_sra = false + def total_ids = 0 + def no_match_ids = [] + def pattern = /^(((SR|ER|DR)[APRSX])|(SAM(N|EA|D))|(PRJ(NA|EB|DB))|(GS[EM]))(\d+)$/ + input.eachLine { line -> + total_ids += 1 + if (!(line =~ pattern)) { + no_match_ids << line + } } - return [ metas[0], fastqs ] -} -// -// Get attribute from genome config file e.g. fasta -// -def getGenomeAttribute(attribute) { - if (params.genomes && params.genome && params.genomes.containsKey(params.genome)) { - if (params.genomes[ params.genome ].containsKey(attribute)) { - return params.genomes[ params.genome ][ attribute ] + def num_match = total_ids - no_match_ids.size() + if (num_match > 0) { + if (num_match == total_ids) { + is_sra = true + } else { + error("Mixture of ids provided via --input: ${no_match_ids.join(', ')}\nPlease provide either SRA / ENA / GEO / DDBJ ids!") } } - return null + return is_sra } // -// Exit pipeline if incorrect --genome key provided +// Check and validate parameters // -def genomeExistsError() { - if (params.genomes && params.genome && !params.genomes.containsKey(params.genome)) { - def error_string = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n" + - " Genome '${params.genome}' not found in any config files provided to the pipeline.\n" + - " Currently, the available genome keys are:\n" + - " ${params.genomes.keySet().join(", ")}\n" + - "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~" - error(error_string) +def sraCheckENAMetadataFields(ena_metadata_fields) { + // Check minimal ENA fields are provided to download FastQ files + def valid_ena_metadata_fields = 
['run_accession', 'experiment_accession', 'library_layout', 'fastq_ftp', 'fastq_md5'] + def actual_ena_metadata_fields = ena_metadata_fields ? ena_metadata_fields.split(',').collect{ it.trim().toLowerCase() } : valid_ena_metadata_fields + if (!actual_ena_metadata_fields.containsAll(valid_ena_metadata_fields)) { + error("Invalid option: '${ena_metadata_fields}'. Minimally required fields for '--ena_metadata_fields': '${valid_ena_metadata_fields.join(',')}'") } } // -// Generate methods description for MultiQC +// Print a warning after pipeline has completed // -def toolCitationText() { - // TODO nf-core: Optionally add in-text citation tools to this list. - // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "Tool (Foo et al. 2023)" : "", - // Uncomment function in methodsDescriptionText to render in MultiQC report - def citation_text = [ - "Tools used in the workflow included:", - "FastQC (Andrews 2010),", - "MultiQC (Ewels et al. 2016)", - "." - ].join(' ').trim() - - return citation_text -} - -def toolBibliographyText() { - // TODO nf-core: Optionally add bibliographic entries to this list. - // Can use ternary operators to dynamically construct based conditions, e.g. params["run_xyz"] ? "
  • Author (2023) Pub name, Journal, DOI
  • " : "", - // Uncomment function in methodsDescriptionText to render in MultiQC report - def reference_text = [ - "
  • Andrews S, (2010) FastQC, URL: https://www.bioinformatics.babraham.ac.uk/projects/fastqc/).
  • ", - "
  • Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354
  • " - ].join(' ').trim() - - return reference_text -} - -def methodsDescriptionText(mqc_methods_yaml) { - // Convert to a named map so can be used as with familar NXF ${workflow} variable syntax in the MultiQC YML file - def meta = [:] - meta.workflow = workflow.toMap() - meta["manifest_map"] = workflow.manifest.toMap() - - // Pipeline DOI - if (meta.manifest_map.doi) { - // Using a loop to handle multiple DOIs - // Removing `https://doi.org/` to handle pipelines using DOIs vs DOI resolvers - // Removing ` ` since the manifest.doi is a string and not a proper list - def temp_doi_ref = "" - def manifest_doi = meta.manifest_map.doi.tokenize(",") - manifest_doi.each { doi_ref -> - temp_doi_ref += "(doi: ${doi_ref.replace("https://doi.org/", "").replace(" ", "")}), " - } - meta["doi_text"] = temp_doi_ref.substring(0, temp_doi_ref.length() - 2) - } else meta["doi_text"] = "" - meta["nodoi_text"] = meta.manifest_map.doi ? "" : "
  • If available, make sure to update the text to include the Zenodo DOI of version of the pipeline used.
  • " - - // Tool references - meta["tool_citations"] = "" - meta["tool_bibliography"] = "" - - // TODO nf-core: Only uncomment below if logic in toolCitationText/toolBibliographyText has been filled! - // meta["tool_citations"] = toolCitationText().replaceAll(", \\.", ".").replaceAll("\\. \\.", ".").replaceAll(", \\.", ".") - // meta["tool_bibliography"] = toolBibliographyText() - - - def methods_text = mqc_methods_yaml.text - - def engine = new groovy.text.SimpleTemplateEngine() - def description_html = engine.createTemplate(methods_text).make(meta) - - return description_html.toString() +def sraCurateSamplesheetWarn() { + log.warn "=============================================================================\n" + + " Please double-check the samplesheet that has been auto-created by the pipeline.\n\n" + + " Public databases don't reliably hold information such as strandedness\n" + + " information, controls etc\n\n" + + " All of the sample metadata obtained from the ENA has been appended\n" + + " as additional columns to help you manually curate the samplesheet before\n" + + " running nf-core/other pipelines.\n" + + "===================================================================================" } diff --git a/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.function.nf.test b/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.function.nf.test new file mode 100644 index 00000000..f2e3f12a --- /dev/null +++ b/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.function.nf.test @@ -0,0 +1,79 @@ + +nextflow_function { + + name "Test Functions" + script "subworkflows/local/utils_nfcore_fetchngs_pipeline/main.nf" + tag "UTILS_NFCORE_FETCHNGS_PIPELINE" + + test("Function isSraId") { + + function "isSraId" + + when { + function { + """ + input[0] = 'DRR000774' + """ + } + } + + then { + assertAll( + { assert function.success }, + { assert snapshot(function.result).match() } + ) + } + } + + test("Function sraCheckENAMetadataFields 
[success]") { + + function "sraCheckENAMetadataFields" + + when { + function { + """ + input[0] = 'run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5' + """ + } + } + + then { + assertAll( + { assert function.success }, + { assert snapshot(function.result).match() } + ) + } + } + + test("Function sraCheckENAMetadataFields [failure]") { + + function "sraCheckENAMetadataFields" + + when { + function { + """ + input[0] = 'run_accession,experiment_accession,library_layout,fastq_ftp' + """ + } + } + + then { + assertAll( + { assert !function.success } + ) + } + } + + test("Function sraCurateSamplesheetWarn") { + + function "sraCurateSamplesheetWarn" + + then { + assertAll( + { assert function.success }, + { assert snapshot(function.result).match() } + ) + } + } + +} diff --git a/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.function.nf.test.snap b/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.function.nf.test.snap new file mode 100644 index 00000000..99fb20e7 --- /dev/null +++ b/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.function.nf.test.snap @@ -0,0 +1,28 @@ +{ + "Function sraCurateSamplesheetWarn": { + "content": null, + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:55:41.001798" + }, + "Function sraCheckENAMetadataFields [success]": { + "content": null, + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:55:33.679255" + }, + "Function isSraId": { + "content": [ + true + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T11:55:29.999289" + } +} \ No newline at end of file diff --git a/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.workflow_pipeline_completion.nf.test b/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.workflow_pipeline_completion.nf.test new file mode 100644 index 00000000..b94f72cf --- /dev/null +++ 
b/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.workflow_pipeline_completion.nf.test @@ -0,0 +1,36 @@ +nextflow_workflow { + + name "Test Workflow PIPELINE_COMPLETION" + script "subworkflows/local/utils_nfcore_fetchngs_pipeline/main.nf" + workflow "PIPELINE_COMPLETION" + tag "UTILS_NFCORE_FETCHNGS_PIPELINE" + + test("Should run") { + + when { + workflow { + """ + email = null + email_on_fail = null + plaintext_email = false + outdir = 'results' + monochrome_logs = false + hook_url = null + + input[0] = email + input[1] = email_on_fail + input[2] = plaintext_email + input[3] = outdir + input[4] = monochrome_logs + input[5] = hook_url + """ + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } +} diff --git a/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.workflow_pipeline_initialisation.nf.test b/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.workflow_pipeline_initialisation.nf.test new file mode 100644 index 00000000..e04aa726 --- /dev/null +++ b/subworkflows/local/utils_nfcore_fetchngs_pipeline/tests/main.workflow_pipeline_initialisation.nf.test @@ -0,0 +1,39 @@ +nextflow_workflow { + + name "Test Workflow PIPELINE_INITIALISATION" + script "subworkflows/local/utils_nfcore_fetchngs_pipeline/main.nf" + workflow "PIPELINE_INITIALISATION" + tag "UTILS_NFCORE_FETCHNGS_PIPELINE" + + test("Should run") { + + when { + workflow { + """ + version = false + help = false + validate_params = false + monochrome_logs = false + nextflow_cli_args = [] + outdir = 'results' + ena_metadata_fields = null + + input[0] = version + input[1] = help + input[2] = validate_params + input[3] = monochrome_logs + input[4] = nextflow_cli_args + input[5] = outdir + input[6] = 'https://raw.githubusercontent.com/nf-core/test-datasets/2732b911c57e607fa7aea5ba0c3d91b25bafb662/testdata/v1.12.0/sra_ids_test.csv' + input[7] = ena_metadata_fields + """ + } + } + + then { + assertAll( + { assert workflow.success } + ) + } + } +} diff --git 
a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf new file mode 100644 index 00000000..fbeacf4a --- /dev/null +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/main.nf @@ -0,0 +1,39 @@ +include { CUSTOM_SRATOOLSNCBISETTINGS } from '../../../modules/nf-core/custom/sratoolsncbisettings/main' +include { SRATOOLS_PREFETCH } from '../../../modules/nf-core/sratools/prefetch/main' +include { SRATOOLS_FASTERQDUMP } from '../../../modules/nf-core/sratools/fasterqdump/main' + +// +// Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). +// +workflow FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS { + take: + ch_sra_ids // channel: [ val(meta), val(id) ] + ch_dbgap_key // channel: [ path(dbgap_key) ] + + main: + + ch_versions = Channel.empty() + + // + // Detect existing NCBI user settings or create new ones. + // + CUSTOM_SRATOOLSNCBISETTINGS ( ch_sra_ids.collect() ) + ch_ncbi_settings = CUSTOM_SRATOOLSNCBISETTINGS.out.ncbi_settings + ch_versions = ch_versions.mix(CUSTOM_SRATOOLSNCBISETTINGS.out.versions) + + // + // Prefetch sequencing reads in SRA format. + // + SRATOOLS_PREFETCH ( ch_sra_ids, ch_ncbi_settings, ch_dbgap_key ) + ch_versions = ch_versions.mix(SRATOOLS_PREFETCH.out.versions.first()) + + // + // Convert the SRA format into one or more compressed FASTQ files. 
+ // + SRATOOLS_FASTERQDUMP ( SRATOOLS_PREFETCH.out.sra, ch_ncbi_settings, ch_dbgap_key ) + ch_versions = ch_versions.mix(SRATOOLS_FASTERQDUMP.out.versions.first()) + + emit: + reads = SRATOOLS_FASTERQDUMP.out.reads // channel: [ val(meta), [ reads ] ] + versions = ch_versions // channel: [ versions.yml ] +} diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml new file mode 100644 index 00000000..1b968acc --- /dev/null +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/meta.yml @@ -0,0 +1,52 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: fastq_download_prefetch_fasterqdump_sratools +description: Download FASTQ sequencing reads from the NCBI's Sequence Read Archive (SRA). +keywords: + - SRA + - NCBI + - sequencing + - fastq + - prefetch + - fasterq-dump +components: + - custom/sratoolsncbisettings + - sratools/prefetch + - sratools/fasterqdump +input: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - id: + type: string + description: > + SRA run identifier. + + - certificate: + type: file + description: > + Path to a JWT cart file used to access protected dbGAP data on SRA using the sra-toolkit + + pattern: "*.cart" +# TODO Update when we decide on a standard for subworkflow docs +output: + - meta: + type: map + description: > + Groovy Map containing sample information e.g. [ id:'test', single_end:false ] + + - reads: + type: file + description: Extracted FASTQ file or files if the sequencing reads are paired-end. 
+ pattern: "*.fastq.gz" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@Midnighter" + - "@drpatelh" +maintainers: + - "@Midnighter" + - "@drpatelh" diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/nextflow.config b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/nextflow.config new file mode 100644 index 00000000..de803a38 --- /dev/null +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/nextflow.config @@ -0,0 +1,2 @@ +includeConfig '../../../modules/nf-core/sratools/prefetch/nextflow.config' +includeConfig '../../../modules/nf-core/sratools/fasterqdump/nextflow.config' diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/tests/main.nf.test b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/tests/main.nf.test new file mode 100644 index 00000000..5f74ed20 --- /dev/null +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/tests/main.nf.test @@ -0,0 +1,43 @@ +nextflow_workflow { + + name "Test workflow: fastq_download_prefetch_fasterqdump_sratools/main.nf" + script "../main.nf" + workflow "FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS" + tag "subworkflows" + tag "subworkflows_nfcore" + tag "custom/sratoolsncbisettings" + tag "sratools/prefetch" + tag "sratools/fasterqdump" + tag "subworkflows/fastq_download_prefetch_fasterqdump_sratools" + + test("Parameters: default") { + + when { + workflow { + """ + input[0] = Channel.of( + [[ id:'test_single_end', single_end:true ], 'DRR000774'], + [[ id:'test_paired_end', single_end:false ], 'SRR11140744'] + ) + input[1] = [] + """ + } + } + + then { + def pelines1 = path(workflow.out.reads[0][1][0]).linesGzip + def pelines2 = path(workflow.out.reads[0][1][1]).linesGzip + def selines = path(workflow.out.reads[1][1]).linesGzip + assertAll( + { assert workflow.success }, + { assert 
snapshot(pelines1[0..5]).match("test_pe_reads_1_lines") }, + { assert snapshot(pelines1.size()).match("test_pe_reads_1_size") }, + { assert snapshot(pelines2[0..5]).match("test_pe_reads_2_lines") }, + { assert snapshot(pelines2.size()).match("test_pe_reads_2_size") }, + { assert snapshot(selines[0..5]).match("test_se_reads_lines") }, + { assert snapshot(selines.size()).match("test_se_reads_size") }, + { assert snapshot(workflow.out.versions).match("versions") } + ) + } + } +} diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/tests/main.nf.test.snap b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/tests/main.nf.test.snap new file mode 100644 index 00000000..0c0be2a4 --- /dev/null +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/tests/main.nf.test.snap @@ -0,0 +1,97 @@ +{ + "test_se_reads_size": { + "content": [ + 19996 + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T12:02:56.176292" + }, + "test_pe_reads_2_lines": { + "content": [ + [ + "@SRR11140744.1 M01472:285:000000000-CYHNP:1:1101:12117:3295 length=251", + "ACAGGACACGAGTAACTCGTCTATCTTCTGCTGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACGTGGCTTTGGAGACTCCGTGGAGGAGGTCTTATCAGAGGCACGTCAACATCTTAAA", + "+SRR11140744.1 M01472:285:000000000-CYHNP:1:1101:12117:3295 length=251", + "ABAAAFBFFBDBGGGGGGGGGGHHHHHHHHHHCHGHGGGHHHGGHGGHGHGGGHFHHHHHHHHGGGGGHHHHHHHHHFHHHHGHHHGHGGGGGEFGDGHHGFGGGHHHHHGHHGGHHFHHHHGHHHHHHHHHHHHHHGFFGGHHHHHHGGHHGGHHHHHEGHHHHHHHGHHGHHFHHHHHGGGGGGGGGGGGAGGG9BEFFFFFFFFFFFFFFEEFFFFFFFA.FFFFFFFEFEFFFFFFF.BFFFFFFFB", + "@SRR11140744.2 M01472:285:000000000-CYHNP:1:1101:20752:3564 length=238", + 
"GTGCACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTCTATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAGGTTTCGTCCGGGTGTGACCGAAAGGTAAGATGGAGAGCCTTGTCCCTGGTTTCAACGAGAAAACACACGTCCAACTCAGTTTGCCTGTTTTACAGGTTCGCGACGTGCTCGTACG" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T12:02:56.166207" + }, + "test_pe_reads_2_size": { + "content": [ + 2011460 + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T12:02:56.168869" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,1a2218ff913fc33408bffccb081b5048", + "versions.yml:md5,2f3b3a13b36dabf13f09327613d5558d", + "versions.yml:md5,53d6e983afde3a28add2ffc6b7eba4f3" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-28T15:19:18.755939" + }, + "test_pe_reads_1_size": { + "content": [ + 2013376 + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "24.01.0" + }, + "timestamp": "2024-02-28T15:19:18.677234" + }, + "test_se_reads_lines": { + "content": [ + [ + "@DRR000774.1 1 length=421", + "ACGCAGGTGCCAGCAGCCGCGGTAATACGTAGGATCCGAGCGTTGTCCGGATTTATTGGGCGTAAAGGGTGCGTAGGCGGCTTGTCAAGTCTCATGTGAAATCTCCCGGCTCAACTGGGAGGGTCATGGGAAACTGATGAGCTCGAGGGCAGTAGAGGGAAGCGGAATTCCGAGAGTAGTGGTGAAATGCGTAGATACTCGGAGGAACACCAGTGGCGAAAGCGGCTTCCTGGACTGTACCTGACGCTGAGGCACGAAAGCGTGGGGAGCAAACCGGATTAGATACCCGGGTAGTCCACGCCCTAAACGATGGATACTAGATATAGGGGGTATCGACCCTCTGTGTCGAAGCTAACGCATTAAGTATCCCGCCTGAGGAGTACGGCCGCAAGGCTAAAACTTAAGGAATTGACGGCTGCGT", + "+DRR000774.1 1 length=421", + "FFFFFFFFFFFIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIIHHFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF:88FFF888???DBBBBB666F222ADDDFFF::;FFFFFFFFFFFFFFFFFFFFFFFFFFFF9:::FFFFCCCFFFFDDDFFFFF<<<<<8888886623//38><83238@B@@<;855557,,,,,,,0/0;;8:==DDDDDDDDD9:", + "@DRR000774.2 2 length=126", + "ACGCAGGTGCCAGCAGCCGCGGTAATACGGAGGGAGCTAGCGTTGTTCGGAATTACTGGGCGTAAAGCGCACGTAGGCGGCTTTTCAAGTCAGGGGTGGAAATACCCGGGGCCGTCAACCCGACCG" + ] + ], + "meta": { + 
"nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T12:02:56.171227" + }, + "test_pe_reads_1_lines": { + "content": [ + [ + "@SRR11140744.1 M01472:285:000000000-CYHNP:1:1101:12117:3295 length=251", + "ACATAGGGCTGTTCAAGTTGAGGCAAAACGCCTTTTTCAACTTCTACTAAGCCACAAGTGCCATCTTTAAGATGTTGACGTGCCTCTGATAAGACCTCCTCCACGGAGTCTCCAAAGCCACGTACGAGCACGTCGCGAACCTGTAAAACAGGCAAACTGAGTTGGACGTGTGTTTTCTCGTTGAAACCAGGGACAAGGCTCTCCATCTTACCTTTCGGTCACACCCGGACGAAACCTAGATGTGCTGATGA", + "+SRR11140744.1 M01472:285:000000000-CYHNP:1:1101:12117:3295 length=251", + "BCCCCFFFFFCFGGGGGGGGGGHGGHHHHGGGHGHHHHHHHHHHHHHHHHHHHHHHHGHHHHHHHHHHHHHHHHHHHHHGGGHHHHHGHHGHHHHHHHHHHHHHGGGGGHHHHHHHHHHHHGHHHGGGGGHGHHGGGGGGGHHHHHHHHHHHGGHHHHHFHHHHHHHGGGHHHHHHHHHGGGHHHHHHHHGGGGGGGFGGGGGGGGGGGGGGGGGGGGGGFFFFFFFFFDFFFFFFFFFFFFFFFFFFFFB", + "@SRR11140744.2 M01472:285:000000000-CYHNP:1:1101:20752:3564 length=238", + "CGTACGAGCACGTCGCGAACCTGTAAAACAGGCAAACTGAGTTGGACGTGTGTTTTCTCGTTGAAACCAGGGACAAGGCTCTCCATCTTACCTTTCGGTCACACCCGGACGAAACCTAGATGTGCTGATGATCGGCTGCAACACGGACGAAACCGTAAGCAGCCTGCAGAAGATAGACGAGTTACTCGTGTCCTGTCAACGACAGTAATTAGTTATTAATTATACTGCGTGAGTGCAC" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.1" + }, + "timestamp": "2024-02-28T12:02:56.161354" + } +} diff --git a/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/tests/tags.yml b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/tests/tags.yml new file mode 100644 index 00000000..03028c32 --- /dev/null +++ b/subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/fastq_download_prefetch_fasterqdump_sratools: + - subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/** diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.function.nf.test b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.function.nf.test index 68718e4f..e49d617f 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.function.nf.test 
+++ b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.function.nf.test @@ -4,9 +4,7 @@ nextflow_function { name "Test Functions" script "subworkflows/nf-core/utils_nextflow_pipeline/main.nf" config "subworkflows/nf-core/utils_nextflow_pipeline/tests/nextflow.config" - tag 'subworkflows' - tag 'utils_nextflow_pipeline' - tag 'subworkflows/utils_nextflow_pipeline' + tag "UTILS_NEXTFLOW_PIPELINE" test("Test Function getWorkflowVersion") { diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test index ca964ce8..13782037 100644 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test +++ b/subworkflows/nf-core/utils_nextflow_pipeline/tests/main.workflow.nf.test @@ -4,9 +4,6 @@ nextflow_workflow { script "../main.nf" config "subworkflows/nf-core/utils_nextflow_pipeline/tests/nextflow.config" workflow "UTILS_NEXTFLOW_PIPELINE" - tag 'subworkflows' - tag 'utils_nextflow_pipeline' - tag 'subworkflows/utils_nextflow_pipeline' test("Should run no inputs") { diff --git a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml deleted file mode 100644 index f8476112..00000000 --- a/subworkflows/nf-core/utils_nextflow_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nextflow_pipeline: - - subworkflows/nf-core/utils_nextflow_pipeline/** diff --git a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml b/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml deleted file mode 100644 index ac8523c9..00000000 --- a/subworkflows/nf-core/utils_nfcore_pipeline/tests/tags.yml +++ /dev/null @@ -1,2 +0,0 @@ -subworkflows/utils_nfcore_pipeline: - - subworkflows/nf-core/utils_nfcore_pipeline/** diff --git a/tests/main.nf.test b/tests/main.nf.test new file mode 100644 index 00000000..dc3662cd --- /dev/null +++ b/tests/main.nf.test @@ -0,0 +1,81 @@ 
+nextflow_pipeline {
+
+    name "Test pipeline"
+    script "../main.nf"
+    tag "PIPELINE"
+
+    test("Run with profile test") {
+
+        when {
+            params {
+                outdir = "$outputDir"
+                input = 'https://raw.githubusercontent.com/nf-core/test-datasets/2732b911c57e607fa7aea5ba0c3d91b25bafb662/testdata/v1.12.0/sra_ids_test.csv'
+            }
+        }
+
+        then {
+            assert workflow.success
+
+            assertAll(
+                { assert new File("$outputDir/samplesheet/samplesheet.csv").readLines().size() == 15 },  // header line + the 14 rows checked two assertions below
+                { assert new File("$outputDir/samplesheet/samplesheet.csv").readLines()*.split(',')[0].take(4) == ['"sample"', '"fastq_1"', '"fastq_2"', '"run_accession"'] },  // first four header columns
+                { assert new File("$outputDir/samplesheet/samplesheet.csv").readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX024467"', '"DRX026011"', '"ERX1234253"', '"SRX10940790"', '"SRX11047067"', '"SRX17709227"', '"SRX17709228"', '"SRX6725035"', '"SRX9315476"', '"SRX9504942"', '"SRX9504942"', '"SRX9504942"', '"SRX9504942"', '"SRX9626017"'] },  // first column of every line, in file order
+                { assert new File("$outputDir/samplesheet/samplesheet.csv").text.contains('Illumina HiSeq 2500') },
+                { assert new File("$outputDir/custom/user-settings.mkfg").exists() },
+                { assert new File("$outputDir/fastq/md5/DRX024467_DRR026872.fastq.gz.md5").exists() },  // from here: checksum files under fastq/md5/
+                { assert new File("$outputDir/fastq/md5/DRX026011_DRR028935_1.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/DRX026011_DRR028935_2.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/ERX1234253_ERR1160846.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX17709227_SRR21711856.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX17709228_SRR21711855.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX6725035_SRR9984183.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX9504942_SRR13055517_1.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX9504942_SRR13055517_2.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX9504942_SRR13055518_1.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX9504942_SRR13055518_2.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX9504942_SRR13055519_1.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX9504942_SRR13055519_2.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX9504942_SRR13055520_1.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX9504942_SRR13055520_2.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX9626017_SRR13191702_1.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/md5/SRX9626017_SRR13191702_2.fastq.gz.md5").exists() },
+                { assert new File("$outputDir/fastq/DRX024467_DRR026872.fastq.gz").exists() },  // from here: FASTQ files under fastq/
+                { assert new File("$outputDir/fastq/DRX026011_DRR028935_1.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/DRX026011_DRR028935_2.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/ERX1234253_ERR1160846.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX10940790_SRR14593545_1.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX10940790_SRR14593545_2.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX11047067_SRR14709033.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX17709227_SRR21711856.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX17709228_SRR21711855.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX6725035_SRR9984183.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9315476_SRR12848126_1.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9315476_SRR12848126_2.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9504942_SRR13055517_1.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9504942_SRR13055517_2.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9504942_SRR13055518_1.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9504942_SRR13055518_2.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9504942_SRR13055519_1.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9504942_SRR13055519_2.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9504942_SRR13055520_1.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9504942_SRR13055520_2.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9626017_SRR13191702_1.fastq.gz").exists() },
+                { assert new File("$outputDir/fastq/SRX9626017_SRR13191702_2.fastq.gz").exists() },
+                { assert new File("$outputDir/metadata/DRR026872.runinfo_ftp.tsv").exists() },  // from here: run-info tables under metadata/
+                { assert new File("$outputDir/metadata/DRR028935.runinfo_ftp.tsv").exists() },
+                { assert new File("$outputDir/metadata/ERR1160846.runinfo_ftp.tsv").exists() },
+                { assert new File("$outputDir/metadata/GSE214215.runinfo_ftp.tsv").exists() },
+                { assert new File("$outputDir/metadata/GSM4907283.runinfo_ftp.tsv").exists() },
+                { assert new File("$outputDir/metadata/SRR12848126.runinfo_ftp.tsv").exists() },
+                { assert new File("$outputDir/metadata/SRR13191702.runinfo_ftp.tsv").exists() },
+                { assert new File("$outputDir/metadata/SRR14593545.runinfo_ftp.tsv").exists() },
+                { assert new File("$outputDir/metadata/SRR14709033.runinfo_ftp.tsv").exists() },
+                { assert new File("$outputDir/metadata/SRR9984183.runinfo_ftp.tsv").exists() },
+                { assert new File("$outputDir/pipeline_info/nf_core_fetchngs_software_mqc_versions.yml").exists() },  // collated software versions file
+                { assert new File("$outputDir/samplesheet/id_mappings.csv").exists() },
+                { assert new File("$outputDir/samplesheet/multiqc_config.yml").exists() },
+                { assert new File("$outputDir/samplesheet/samplesheet.csv").exists() }
+            )
+        }
+    }
+}
diff --git a/tests/nextflow.config b/tests/nextflow.config
new file mode 100644
index 00000000..d75ad44b
--- /dev/null
+++ b/tests/nextflow.config
@@ -0,0 +1,27 @@
+params {
+    // Base directory for nf-core/modules test data
+    modules_testdata_base_path = 's3://ngi-igenomes/testdata/nf-core/modules/'
+
+    // Base directory for nf-core/fetchngs test data
+    pipelines_testdata_base_path = 's3://ngi-igenomes/testdata/nf-core/pipelines/fetchngs/1.15.0/'
+}
+
+// Impose sensible resource limits for testing
+process {
+    withName: '.*' {  // pattern matches every process name
+        cpus = 2
+        memory = 3.GB
+        time = 2.h
+    }
+}
+
+// Impose same minimum Nextflow version as the pipeline for testing
+manifest {
+    nextflowVersion = '!>=23.04.0'
+}
+
+// Disable all Nextflow reporting options
+timeline { enabled = false }
+report { enabled = false }
+trace { enabled = false }
+dag { enabled = false }
diff --git a/tower.yml b/tower.yml
index 787aedfe..7b7fd106 100644
--- a/tower.yml
+++ b/tower.yml
@@ -1,5 +1,7 @@
 reports:
-  multiqc_report.html:
-    display: "MultiQC HTML report"
   samplesheet.csv:
     display: "Auto-created samplesheet with collated metadata and FASTQ paths"
+  id_mappings.csv:
+    display: "File with database identifier mappings that can be used to rename samples"
+  multiqc_config.yml:
+    display: "MultiQC config file for bulk renaming of sample names from database ids"
diff --git a/workflows/fetchngs.nf b/workflows/fetchngs.nf
deleted file mode 100644
index 6eb7189d..00000000
--- a/workflows/fetchngs.nf
+++ /dev/null
@@ -1,97 +0,0 @@
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    IMPORT MODULES / SUBWORKFLOWS / FUNCTIONS
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-include { FASTQC } from '../modules/nf-core/fastqc/main'
-include { MULTIQC } from '../modules/nf-core/multiqc/main'
-include { paramsSummaryMap } from 'plugin/nf-schema'
-include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline'
-include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline'
-include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_fetchngs_pipeline'
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    RUN MAIN WORKFLOW
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
-
-workflow FETCHNGS {
-
-    take:
-    ch_samplesheet // channel: samplesheet read in from --input
-    main:
-
-    ch_versions = Channel.empty()
-    ch_multiqc_files = Channel.empty()
-    //
-    // MODULE: Run FastQC
-    //
-    FASTQC (
-        ch_samplesheet
-    )
-    ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]})
-    ch_versions = ch_versions.mix(FASTQC.out.versions.first())
-
-    //
-    // Collate and save software versions
-    //
-    softwareVersionsToYAML(ch_versions)
-        .collectFile(
-            storeDir: "${params.outdir}/pipeline_info",
-            name: 'nf_core_' + 'pipeline_software_' + 'mqc_' + 'versions.yml',
-            sort: true,
-            newLine: true
-        ).set { ch_collated_versions }
-
-
-    //
-    // MODULE: MultiQC
-    //
-    ch_multiqc_config = Channel.fromPath(
-        "$projectDir/assets/multiqc_config.yml", checkIfExists: true)
-    ch_multiqc_custom_config = params.multiqc_config ?
-        Channel.fromPath(params.multiqc_config, checkIfExists: true) :
-        Channel.empty()
-    ch_multiqc_logo = params.multiqc_logo ?
-        Channel.fromPath(params.multiqc_logo, checkIfExists: true) :
-        Channel.empty()
-
-    summary_params = paramsSummaryMap(
-        workflow, parameters_schema: "nextflow_schema.json")
-    ch_workflow_summary = Channel.value(paramsSummaryMultiqc(summary_params))
-    ch_multiqc_files = ch_multiqc_files.mix(
-        ch_workflow_summary.collectFile(name: 'workflow_summary_mqc.yaml'))
-    ch_multiqc_custom_methods_description = params.multiqc_methods_description ?
-        file(params.multiqc_methods_description, checkIfExists: true) :
-        file("$projectDir/assets/methods_description_template.yml", checkIfExists: true)
-    ch_methods_description = Channel.value(
-        methodsDescriptionText(ch_multiqc_custom_methods_description))
-
-    ch_multiqc_files = ch_multiqc_files.mix(ch_collated_versions)
-    ch_multiqc_files = ch_multiqc_files.mix(
-        ch_methods_description.collectFile(
-            name: 'methods_description_mqc.yaml',
-            sort: true
-        )
-    )
-
-    MULTIQC (
-        ch_multiqc_files.collect(),
-        ch_multiqc_config.toList(),
-        ch_multiqc_custom_config.toList(),
-        ch_multiqc_logo.toList(),
-        [],
-        []
-    )
-
-    emit:multiqc_report = MULTIQC.out.report.toList() // channel: /path/to/multiqc_report.html
-    versions = ch_versions // channel: [ path(versions.yml) ]
-
-}
-
-/*
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-    THE END
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-*/
diff --git a/workflows/sra/main.nf b/workflows/sra/main.nf
new file mode 100644
index 00000000..0c8cac0c
--- /dev/null
+++ b/workflows/sra/main.nf
@@ -0,0 +1,196 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    IMPORT LOCAL MODULES/SUBWORKFLOWS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+include { MULTIQC_MAPPINGS_CONFIG } from '../../modules/local/multiqc_mappings_config'
+include { SRA_FASTQ_FTP } from '../../modules/local/sra_fastq_ftp'
+include { SRA_IDS_TO_RUNINFO } from '../../modules/local/sra_ids_to_runinfo'
+include { SRA_RUNINFO_TO_FTP } from '../../modules/local/sra_runinfo_to_ftp'
+include { ASPERA_CLI } from '../../modules/local/aspera_cli'
+include { SRA_TO_SAMPLESHEET } from '../../modules/local/sra_to_samplesheet'
+include { softwareVersionsToYAML } from '../../subworkflows/nf-core/utils_nfcore_pipeline'
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    IMPORT NF-CORE SUBWORKFLOWS
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+include { FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS } from '../../subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools'
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    RUN MAIN WORKFLOW
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+workflow SRA {
+
+    take:
+    ids // channel: [ ids ]
+
+    main:
+    ch_versions = Channel.empty()
+
+    //
+    // MODULE: Get SRA run information for public database ids
+    //
+    SRA_IDS_TO_RUNINFO (
+        ids,
+        params.ena_metadata_fields ?: ''
+    )
+    ch_versions = ch_versions.mix(SRA_IDS_TO_RUNINFO.out.versions.first())
+
+    //
+    // MODULE: Parse SRA run information, create file containing FTP links and read into workflow as [ meta, [reads] ]
+    //
+    SRA_RUNINFO_TO_FTP (
+        SRA_IDS_TO_RUNINFO.out.tsv
+    )
+    ch_versions = ch_versions.mix(SRA_RUNINFO_TO_FTP.out.versions.first())
+
+    SRA_RUNINFO_TO_FTP
+        .out
+        .tsv
+        .splitCsv(header:true, sep:'\t')  // one record per TSV row, keyed by the header names
+        .map {
+            meta ->
+                def meta_clone = meta.clone()
+                meta_clone.single_end = meta_clone.single_end.toBoolean()  // TSV string -> boolean
+                return meta_clone
+        }
+        .unique()  // drop duplicated records
+        .set { ch_sra_metadata }
+
+    if (!params.skip_fastq_download) {
+
+        ch_sra_metadata
+            .branch {
+                meta ->
+                    def download_method = 'ftp'  // default route; may be overridden by the two checks below
+                    // meta.fastq_aspera is a metadata string with ENA fasp links supported by Aspera
+                    // For single-end: 'fasp.sra.ebi.ac.uk:/vol1/fastq/ERR116/006/ERR1160846/ERR1160846.fastq.gz'
+                    // For paired-end: 'fasp.sra.ebi.ac.uk:/vol1/fastq/SRR130/020/SRR13055520/SRR13055520_1.fastq.gz;fasp.sra.ebi.ac.uk:/vol1/fastq/SRR130/020/SRR13055520/SRR13055520_2.fastq.gz'
+                    if (meta.fastq_aspera && params.download_method == 'aspera') {
+                        download_method = 'aspera'
+                    }
+                    if ((!meta.fastq_aspera && !meta.fastq_1) || params.download_method == 'sratools') {
+                        download_method = 'sratools'
+                    }
+
+                    aspera: download_method == 'aspera'
+                        return [ meta, meta.fastq_aspera.tokenize(';').take(2) ]  // at most the first two ';'-separated links
+                    ftp: download_method == 'ftp'
+                        return [ meta, [ meta.fastq_1, meta.fastq_2 ] ]
+                    sratools: download_method == 'sratools'
+                        return [ meta, meta.run_accession ]
+            }
+            .set { ch_sra_reads }
+
+        //
+        // MODULE: If FTP link is provided in run information then download FastQ directly via FTP and validate with md5sums
+        //
+        SRA_FASTQ_FTP (
+            ch_sra_reads.ftp
+        )
+        ch_versions = ch_versions.mix(SRA_FASTQ_FTP.out.versions.first())
+
+        //
+        // SUBWORKFLOW: Download sequencing reads without FTP links using sra-tools.
+        //
+        FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS (
+            ch_sra_reads.sratools,
+            params.dbgap_key ? file(params.dbgap_key, checkIfExists: true) : []  // empty list when no dbGaP key is given
+        )
+        ch_versions = ch_versions.mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.versions.first())
+
+        //
+        // MODULE: If Aspera link is provided in run information then download FastQ directly via Aspera CLI and validate with md5sums
+        //
+        ASPERA_CLI (
+            ch_sra_reads.aspera,
+            'era-fasp'  // NOTE(review): presumably the Aspera user name; confirm against the ASPERA_CLI module
+        )
+        ch_versions = ch_versions.mix(ASPERA_CLI.out.versions.first())
+
+        // Isolate FASTQ channel which will be added to emit block
+        SRA_FASTQ_FTP
+            .out
+            .fastq
+            .mix(FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS.out.reads)
+            .mix(ASPERA_CLI.out.fastq)
+            .map {
+                meta, fastq ->
+                    def reads = fastq instanceof List ? fastq.flatten() : [ fastq ]  // normalise to a flat list
+                    def meta_clone = meta.clone()
+
+                    meta_clone.fastq_1 = reads[0] ? "${params.outdir}/fastq/${reads[0].getName()}" : ''
+                    meta_clone.fastq_2 = reads[1] && !meta.single_end ? "${params.outdir}/fastq/${reads[1].getName()}" : ''
+
+                    return meta_clone
+            }
+            .set { ch_sra_metadata }  // rebinds ch_sra_metadata to records whose fastq_1/fastq_2 point into params.outdir
+    }
+
+    //
+    // MODULE: Stage FastQ files downloaded by SRA together and auto-create a samplesheet
+    //
+    SRA_TO_SAMPLESHEET (
+        ch_sra_metadata,
+        params.nf_core_pipeline ?: '',
+        params.nf_core_rnaseq_strandedness ?: 'auto',
+        params.sample_mapping_fields
+    )
+
+    // Merge samplesheets and mapping files across all samples
+    SRA_TO_SAMPLESHEET
+        .out
+        .samplesheet
+        .map { it[1] }
+        .collectFile(name:'tmp_samplesheet.csv', newLine: true, keepHeader: true, sort: { it.baseName })
+        .map { it.text.tokenize('\n').join('\n') }  // tokenize() skips empty tokens, so blank lines are dropped
+        .collectFile(name:'samplesheet.csv', storeDir: "${params.outdir}/samplesheet")
+        .set { ch_samplesheet }
+
+    SRA_TO_SAMPLESHEET
+        .out
+        .mappings
+        .map { it[1] }
+        .collectFile(name:'tmp_id_mappings.csv', newLine: true, keepHeader: true, sort: { it.baseName })
+        .map { it.text.tokenize('\n').join('\n') }  // same blank-line clean-up as for the samplesheet
+        .collectFile(name:'id_mappings.csv', storeDir: "${params.outdir}/samplesheet")
+        .set { ch_mappings }
+
+    //
+    // MODULE: Create a MultiQC config file with sample name mappings
+    //
+    ch_sample_mappings_yml = Channel.empty()  // stays empty unless params.sample_mapping_fields is set
+    if (params.sample_mapping_fields) {
+        MULTIQC_MAPPINGS_CONFIG (
+            ch_mappings
+        )
+        ch_versions = ch_versions.mix(MULTIQC_MAPPINGS_CONFIG.out.versions)
+        ch_sample_mappings_yml = MULTIQC_MAPPINGS_CONFIG.out.yml
+    }
+
+    //
+    // Collate and save software versions
+    //
+    softwareVersionsToYAML(ch_versions)
+        .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_fetchngs_software_mqc_versions.yml', sort: true, newLine: true)
+
+    emit:
+    samplesheet = ch_samplesheet
+    mappings = ch_mappings
+    sample_mappings = ch_sample_mappings_yml
+    sra_metadata = ch_sra_metadata
+    versions = ch_versions.unique()
+}
+
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    THE END
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
diff
--git a/workflows/sra/nextflow.config b/workflows/sra/nextflow.config
new file mode 100644
index 00000000..d242c238
--- /dev/null
+++ b/workflows/sra/nextflow.config
@@ -0,0 +1,8 @@
+includeConfig "../../modules/local/multiqc_mappings_config/nextflow.config"
+includeConfig "../../modules/local/aspera_cli/nextflow.config"
+includeConfig "../../modules/local/sra_fastq_ftp/nextflow.config"
+includeConfig "../../modules/local/sra_ids_to_runinfo/nextflow.config"
+includeConfig "../../modules/local/sra_runinfo_to_ftp/nextflow.config"
+includeConfig "../../modules/local/sra_to_samplesheet/nextflow.config"
+includeConfig "../../modules/nf-core/sratools/prefetch/nextflow.config"
+includeConfig "../../subworkflows/nf-core/fastq_download_prefetch_fasterqdump_sratools/nextflow.config"
diff --git a/workflows/sra/tests/main.nf.test b/workflows/sra/tests/main.nf.test
new file mode 100644
index 00000000..1e2856c1
--- /dev/null
+++ b/workflows/sra/tests/main.nf.test
@@ -0,0 +1,52 @@
+nextflow_workflow {
+
+    name "Test workflow: sra/main.nf"
+    script "../main.nf"
+    workflow "SRA"
+    tag "SRA_DEFAULT"
+
+    // Dependencies
+    tag "SRA_IDS_TO_RUNINFO"
+    tag "SRA_RUNINFO_TO_FTP"
+    tag "SRA_FASTQ_FTP"
+    tag "SRA_TO_SAMPLESHEET"
+    tag "MULTIQC_MAPPINGS_CONFIG"
+
+    test("Parameters: default") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.of("DRX026011", "ERX1234253", "SRX6725035")
+                """
+            }
+        }
+
+        then {
+            assert workflow.success
+
+            assertAll(
+                {
+                    with(workflow.out.samplesheet) {
+                        assert path(get(0)).readLines().size() == 4  // header line + one row per input id
+                        assert path(get(0)).readLines()*.split(',')[0].take(4) == ['"sample"', '"fastq_1"', '"fastq_2"', '"run_accession"']
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.mappings) {
+                        assert path(get(0)).readLines().size() == 4
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.sample_mappings) {
+                        assert path(get(0)[0]).md5 == "1ac06bb95b503703430e74660bbdd768"  // checksum of the emitted sample_mappings file
+                    }
+                }
+            )
+        }
+    }
+}
diff --git a/workflows/sra/tests/sra_custom_ena_metadata_fields.nf.test b/workflows/sra/tests/sra_custom_ena_metadata_fields.nf.test
new file mode 100644
index 00000000..c8d17876
--- /dev/null
+++ b/workflows/sra/tests/sra_custom_ena_metadata_fields.nf.test
@@ -0,0 +1,57 @@
+nextflow_workflow {
+
+    name "Test workflow: sra/main.nf"
+    script "../main.nf"
+    workflow "SRA"
+    tag "SRA_CUSTOM_ENA_METADATA_FIELDS"
+
+    // Dependencies
+    tag "SRA_IDS_TO_RUNINFO"
+    tag "SRA_RUNINFO_TO_FTP"
+    tag "SRA_FASTQ_FTP"
+    tag "SRA_TO_SAMPLESHEET"
+    tag "MULTIQC_MAPPINGS_CONFIG"
+
+    test("Parameters: --nf_core_pipeline rnaseq --ena_metadata_fields ... --sample_mapping_fields ...") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.of("DRX026011", "ERX1234253", "SRX6725035")
+                """
+            }
+            params {
+                nf_core_pipeline = "rnaseq"
+                ena_metadata_fields = "run_accession,experiment_accession,library_layout,fastq_ftp,fastq_md5"
+                sample_mapping_fields = "run_accession,library_layout"
+            }
+        }
+
+        then {
+            assert workflow.success
+
+            assertAll(
+                {
+                    with(workflow.out.samplesheet) {
+                        assert path(get(0)).readLines().size() == 4  // header line + one row per input id
+                        assert path(get(0)).readLines()*.split(',')[0].take(5) == ['"sample"', '"fastq_1"', '"fastq_2"', '"strandedness"' , '"run_accession"']
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('SINGLE')
+                    }
+                },
+                {
+                    with(workflow.out.mappings) {
+                        assert path(get(0)).readLines().size() == 4
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('SINGLE')
+                    }
+                },
+                {
+                    with(workflow.out.sample_mappings) {
+                        assert path(get(0)[0]).md5 == "3b70bc9658eab4ba2f4ec98cb749ac9d"  // checksum of the emitted sample_mappings file
+                    }
+                }
+            )
+        }
+    }
+}
diff --git a/workflows/sra/tests/sra_download_method_aspera.nf.test b/workflows/sra/tests/sra_download_method_aspera.nf.test
new file mode 100644
index 00000000..d431a571
--- /dev/null
+++ b/workflows/sra/tests/sra_download_method_aspera.nf.test
@@ -0,0 +1,55 @@
+nextflow_workflow {
+
+    name "Test workflow: sra/main.nf"
+    script "../main.nf"
+    workflow "SRA"
+    tag "SRA_DOWNLOAD_METHOD_ASPERA"
+
+    // Dependencies
+    tag "SRA_IDS_TO_RUNINFO"
+    tag "SRA_RUNINFO_TO_FTP"
+    tag "ASPERA_CLI"
+    tag "SRA_TO_SAMPLESHEET"
+    tag "MULTIQC_MAPPINGS_CONFIG"
+
+    test("Parameters: --download_method aspera") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.of("DRX026011", "ERX1234253", "SRX6725035")
+                """
+            }
+            params {
+                download_method = 'aspera'
+            }
+        }
+
+        then {
+            assert workflow.success
+
+            assertAll(
+                {
+                    with(workflow.out.samplesheet) {
+                        assert path(get(0)).readLines().size() == 4  // header line + one row per input id
+                        assert path(get(0)).readLines()*.split(',')[0].take(4) == ['"sample"', '"fastq_1"', '"fastq_2"', '"run_accession"']
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.mappings) {
+                        assert path(get(0)).readLines().size() == 4
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.sample_mappings) {
+                        assert path(get(0)[0]).md5 == "1ac06bb95b503703430e74660bbdd768"  // checksum of the emitted sample_mappings file
+                    }
+                }
+            )
+        }
+    }
+}
diff --git a/workflows/sra/tests/sra_download_method_sratools.nf.test b/workflows/sra/tests/sra_download_method_sratools.nf.test
new file mode 100644
index 00000000..e4f8c2d2
--- /dev/null
+++ b/workflows/sra/tests/sra_download_method_sratools.nf.test
@@ -0,0 +1,55 @@
+nextflow_workflow {
+
+    name "Test workflow: sra/main.nf"
+    script "../main.nf"
+    workflow "SRA"
+    tag "SRA_DOWNLOAD_METHOD_SRATOOLS"
+
+    // Dependencies
+    tag "FASTQ_DOWNLOAD_PREFETCH_FASTERQDUMP_SRATOOLS"
+    tag "SRA_IDS_TO_RUNINFO"
+    tag "SRA_RUNINFO_TO_FTP"
+    tag "SRA_TO_SAMPLESHEET"
+    tag "MULTIQC_MAPPINGS_CONFIG"
+
+    test("Parameters: --download_method sratools") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.of("DRX026011", "ERX1234253", "SRX6725035")
+                """
+            }
+            params {
+                download_method = 'sratools'
+            }
+        }
+
+        then {
+            assert workflow.success
+
+            assertAll(
+                {
+                    with(workflow.out.samplesheet) {
+                        assert path(get(0)).readLines().size() == 4  // header line + one row per input id
+                        assert path(get(0)).readLines()*.split(',')[0].take(4) == ['"sample"', '"fastq_1"', '"fastq_2"', '"run_accession"']
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.mappings) {
+                        assert path(get(0)).readLines().size() == 4
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.sample_mappings) {
+                        assert path(get(0)[0]).md5 == "1ac06bb95b503703430e74660bbdd768"  // checksum of the emitted sample_mappings file
+                    }
+                }
+            )
+        }
+    }
+}
diff --git a/workflows/sra/tests/sra_nf_core_pipeline_atacseq.nf.test b/workflows/sra/tests/sra_nf_core_pipeline_atacseq.nf.test
new file mode 100644
index 00000000..72f8d595
--- /dev/null
+++ b/workflows/sra/tests/sra_nf_core_pipeline_atacseq.nf.test
@@ -0,0 +1,55 @@
+nextflow_workflow {
+
+    name "Test workflow: sra/main.nf"
+    script "../main.nf"
+    workflow "SRA"
+    tag "SRA_NF_CORE_PIPELINE_ATACSEQ"
+
+    // Dependencies
+    tag "SRA_IDS_TO_RUNINFO"
+    tag "SRA_RUNINFO_TO_FTP"
+    tag "SRA_FASTQ_FTP"
+    tag "SRA_TO_SAMPLESHEET"
+    tag "MULTIQC_MAPPINGS_CONFIG"
+
+    test("Parameters: --nf_core_pipeline atacseq") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.of("DRX026011", "ERX1234253", "SRX6725035")
+                """
+            }
+            params {
+                nf_core_pipeline = "atacseq"
+            }
+        }
+
+        then {
+            assert workflow.success
+
+            assertAll(
+                {
+                    with(workflow.out.samplesheet) {
+                        assert path(get(0)).readLines().size() == 4  // header line + one row per input id
+                        assert path(get(0)).readLines()*.split(',')[0].take(5) == ['"sample"', '"fastq_1"', '"fastq_2"', '"replicate"', '"run_accession"']
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.mappings) {
+                        assert path(get(0)).readLines().size() == 4
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.sample_mappings) {
+                        assert path(get(0)[0]).md5 == "1ac06bb95b503703430e74660bbdd768"  // checksum of the emitted sample_mappings file
+                    }
+                }
+            )
+        }
+    }
+}
diff --git a/workflows/sra/tests/sra_nf_core_pipeline_rnaseq.nf.test b/workflows/sra/tests/sra_nf_core_pipeline_rnaseq.nf.test
new file mode 100644
index 00000000..2e20902a
--- /dev/null
+++ b/workflows/sra/tests/sra_nf_core_pipeline_rnaseq.nf.test
@@ -0,0 +1,55 @@
+nextflow_workflow {
+
+    name "Test workflow: sra/main.nf"
+    script "../main.nf"
+    workflow "SRA"
+    tag "SRA_NF_CORE_PIPELINE_RNASEQ"
+
+    // Dependencies
+    tag "SRA_IDS_TO_RUNINFO"
+    tag "SRA_RUNINFO_TO_FTP"
+    tag "SRA_FASTQ_FTP"
+    tag "SRA_TO_SAMPLESHEET"
+    tag "MULTIQC_MAPPINGS_CONFIG"
+
+    test("Parameters: --nf_core_pipeline rnaseq") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.of("DRX026011", "ERX1234253", "SRX6725035")
+                """
+            }
+            params {
+                nf_core_pipeline = "rnaseq"
+            }
+        }
+
+        then {
+            assert workflow.success
+
+            assertAll(
+                {
+                    with(workflow.out.samplesheet) {
+                        assert path(get(0)).readLines().size() == 4  // header line + one row per input id
+                        assert path(get(0)).readLines()*.split(',')[0].take(5) == ['"sample"', '"fastq_1"', '"fastq_2"', '"strandedness"' , '"run_accession"']
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.mappings) {
+                        assert path(get(0)).readLines().size() == 4
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.sample_mappings) {
+                        assert path(get(0)[0]).md5 == "1ac06bb95b503703430e74660bbdd768"  // checksum of the emitted sample_mappings file
+                    }
+                }
+            )
+        }
+    }
+}
diff --git a/workflows/sra/tests/sra_nf_core_pipeline_taxprofiler.nf.test b/workflows/sra/tests/sra_nf_core_pipeline_taxprofiler.nf.test
new file mode 100644
index 00000000..50650b79
--- /dev/null
+++ b/workflows/sra/tests/sra_nf_core_pipeline_taxprofiler.nf.test
@@ -0,0 +1,55 @@
+nextflow_workflow {
+
+    name "Test workflow: sra/main.nf"
+    script "../main.nf"
+    workflow "SRA"
+    tag "SRA_NF_CORE_PIPELINE_TAXPROFILER"
+
+    // Dependencies
+    tag "SRA_IDS_TO_RUNINFO"
+    tag "SRA_RUNINFO_TO_FTP"
+    tag "SRA_FASTQ_FTP"
+    tag "SRA_TO_SAMPLESHEET"
+    tag "MULTIQC_MAPPINGS_CONFIG"
+
+    test("Parameters: --nf_core_pipeline taxprofiler") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.of("DRX026011", "ERX1234253", "SRX6725035")
+                """
+            }
+            params {
+                nf_core_pipeline = "taxprofiler"
+            }
+        }
+
+        then {
+            assert workflow.success
+
+            assertAll(
+                {
+                    with(workflow.out.samplesheet) {
+                        assert path(get(0)).readLines().size() == 4  // header line + one row per input id
+                        assert path(get(0)).readLines()*.split(',')[0].take(5) == ['"sample"', '"fastq_1"', '"fastq_2"', '"fasta"', '"run_accession"']
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.mappings) {
+                        assert path(get(0)).readLines().size() == 4
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.sample_mappings) {
+                        assert path(get(0)[0]).md5 == "1ac06bb95b503703430e74660bbdd768"  // checksum of the emitted sample_mappings file
+                    }
+                }
+            )
+        }
+    }
+}
diff --git a/workflows/sra/tests/sra_nf_core_pipeline_viralrecon.nf.test b/workflows/sra/tests/sra_nf_core_pipeline_viralrecon.nf.test
new file mode 100644
index 00000000..1fe9cc41
--- /dev/null
+++ b/workflows/sra/tests/sra_nf_core_pipeline_viralrecon.nf.test
@@ -0,0 +1,55 @@
+nextflow_workflow {
+
+    name "Test workflow: sra/main.nf"
+    script "../main.nf"
+    workflow "SRA"
+    tag "SRA_NF_CORE_PIPELINE_VIRALRECON"
+
+    // Dependencies
+    tag "SRA_IDS_TO_RUNINFO"
+    tag "SRA_RUNINFO_TO_FTP"
+    tag "SRA_FASTQ_FTP"
+    tag "SRA_TO_SAMPLESHEET"
+    tag "MULTIQC_MAPPINGS_CONFIG"
+
+    test("Parameters: --nf_core_pipeline viralrecon") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.of("DRX026011", "ERX1234253", "SRX6725035")
+                """
+            }
+            params {
+                nf_core_pipeline = "viralrecon"
+            }
+        }
+
+        then {
+            assert workflow.success
+
+            assertAll(
+                {
+                    with(workflow.out.samplesheet) {
+                        assert path(get(0)).readLines().size() == 4  // header line + one row per input id
+                        assert path(get(0)).readLines()*.split(',')[0].take(4) == ['"sample"', '"fastq_1"', '"fastq_2"', '"run_accession"']
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.mappings) {
+                        assert path(get(0)).readLines().size() == 4
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.sample_mappings) {
+                        assert path(get(0)[0]).md5 == "1ac06bb95b503703430e74660bbdd768"  // checksum of the emitted sample_mappings file
+                    }
+                }
+            )
+        }
+    }
+}
diff --git a/workflows/sra/tests/sra_skip_fastq_download.nf.test b/workflows/sra/tests/sra_skip_fastq_download.nf.test
new file mode 100644
index 00000000..8663cf3f
--- /dev/null
+++ b/workflows/sra/tests/sra_skip_fastq_download.nf.test
@@ -0,0 +1,54 @@
+nextflow_workflow {
+
+    name "Test workflow: sra/main.nf"
+    script "../main.nf"
+    workflow "SRA"
+    tag "SRA_SKIP_FASTQ_DOWNLOAD"
+
+    // Dependencies
+    tag "SRA_IDS_TO_RUNINFO"
+    tag "SRA_RUNINFO_TO_FTP"
+    tag "SRA_TO_SAMPLESHEET"
+    tag "MULTIQC_MAPPINGS_CONFIG"
+
+    test("Parameters: --skip_fastq_download") {
+
+        when {
+            workflow {
+                """
+                input[0] = Channel.of("DRX026011", "ERX1234253", "SRX6725035")
+                """
+            }
+            params {
+                skip_fastq_download = true
+            }
+        }
+
+        then {
+            assert workflow.success
+
+            assertAll(
+                {
+                    with(workflow.out.samplesheet) {
+                        assert path(get(0)).readLines().size() == 4  // header line + one row per input id
+                        assert path(get(0)).readLines()*.split(',')[0].take(4) == ['"sample"', '"fastq_1"', '"fastq_2"', '"run_accession"']
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.mappings) {
+                        assert path(get(0)).readLines().size() == 4
+                        assert path(get(0)).readLines()*.split(',').collect { it[0] } == ['"sample"', '"DRX026011"', '"ERX1234253"', '"SRX6725035"']
+                        assert path(get(0)).text.contains('Illumina HiSeq 2500')
+                    }
+                },
+                {
+                    with(workflow.out.sample_mappings) {
+                        assert path(get(0)[0]).md5 == "1ac06bb95b503703430e74660bbdd768"  // checksum of the emitted sample_mappings file
+                    }
+                }
+            )
+        }
+    }
+}