diff --git a/.github/workflows/awsfulltest.yml b/.github/workflows/awsfulltest.yml index 33ee6733..4f6e999f 100644 --- a/.github/workflows/awsfulltest.yml +++ b/.github/workflows/awsfulltest.yml @@ -12,12 +12,12 @@ jobs: name: Run AWS full tests if: github.repository == 'nf-core/scrnaseq' runs-on: ubuntu-latest + strategy: + matrix: + aligner: ["alevin", "kallisto", "star", "cellranger", "universc"] steps: - name: Launch workflow via tower uses: seqeralabs/action-tower-launch@v2 - # TODO nf-core: You can customise AWS full pipeline tests as required - # Add full size test data (but still relatively small datasets for few samples) - # on the `test_full.config` test runs with only one set of parameters with: workspace_id: ${{ secrets.TOWER_WORKSPACE_ID }} access_token: ${{ secrets.TOWER_ACCESS_TOKEN }} diff --git a/.github/workflows/awstest.yml b/.github/workflows/awstest.yml index edc7b3a3..ab0d7d3c 100644 --- a/.github/workflows/awstest.yml +++ b/.github/workflows/awstest.yml @@ -9,6 +9,9 @@ jobs: name: Run AWS tests if: github.repository == 'nf-core/scrnaseq' runs-on: ubuntu-latest + strategy: + matrix: + aligner: ["alevin", "kallisto", "star", "cellranger", "universc"] steps: # Launch workflow using Tower CLI tool action - name: Launch workflow via tower @@ -21,7 +24,8 @@ jobs: workdir: s3://${{ secrets.AWS_S3_BUCKET }}/work/scrnaseq/work-${{ github.sha }} parameters: | { - "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/scrnaseq/results-test-${{ github.sha }}" + "outdir": "s3://${{ secrets.AWS_S3_BUCKET }}/scrnaseq/results-test-${{ github.sha }}/aligner_${{ matrix.aligner }}", + "aligner": "${{ matrix.aligner }}" } profiles: test diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 7b27c964..746f26a2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,5 +1,5 @@ -name: nf-core CI # This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors +name: nf-core CI on: push: branches: @@ -7,9 +7,16 @@ on: pull_request: release: types: [published] + merge_group: + types: + - checks_requested + branches: + - master + - dev env: NXF_ANSI_LOG: false + NFTEST_VER: "0.8.1" concurrency: group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}" @@ -17,16 +24,21 @@ concurrency: jobs: test: - name: Run pipeline with test data - # Only run on push if this is the nf-core dev branch (merged PRs) + name: "aligner: ${{ matrix.profile }} ; NF: ${{ matrix.NXF_VER }}" if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'nf-core/scrnaseq') }}" runs-on: ubuntu-latest strategy: + fail-fast: false matrix: NXF_VER: - "23.04.0" - "latest-everything" + profile: ["alevin", "cellranger", "kallisto", "star"] + steps: + - name: Disk space cleanup + uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 + - name: Check out pipeline code uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 @@ -35,12 +47,37 @@ jobs: with: version: "${{ matrix.NXF_VER }}" - - name: Disk space cleanup - uses: jlumbroso/free-disk-space@54081f138730dfa15788a46383842cd2f914a1be # v1.3.1 - - name: Run pipeline with test data # TODO nf-core: You can customise CI pipeline run tests as required # For example: adding multiple test runs with different parameters # Remember that you can parallelise this by using strategy.matrix + - name: Cache nf-test installation + id: cache-software + uses: actions/cache@v3 + with: + path: | + /usr/local/bin/nf-test + /home/runner/.nf-test/nf-test.jar + key: ${{ runner.os }}-${{ env.NFTEST_VER }}-nftest + + - name: Install nf-test + if: steps.cache-software.outputs.cache-hit != 'true' + run: | + wget -qO- https://code.askimed.com/install/nf-test | bash + sudo mv nf-test /usr/local/bin/ + + - name: Run nf-test run: | - nextflow run ${GITHUB_WORKSPACE} -profile test,docker --outdir ./results + nf-test test tests/main_pipeline_${{ matrix.profile }}.test --junitxml=test.xml + + - name: Output log on failure + if: failure() + run: | + sudo apt install bat > /dev/null + batcat --decorations=always --color=always ${{ github.workspace }}/.nf-test/tests/*/meta/nextflow.log + + - name: Publish Test Report + uses: mikepenz/action-junit-report@v3 + if: always() # always run even if the previous step fails + with: + report_paths: test.xml diff --git a/.gitignore b/.gitignore index 5124c9ac..a9ba2c52 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,7 @@ results/ testing/ testing* *.pyc +log/ +reports/ +testme.sh +.nf-test/ diff --git a/.nf-core.yml b/.nf-core.yml index 3805dc81..738ad918 100644 --- a/.nf-core.yml +++ b/.nf-core.yml @@ -1 +1,5 @@ repository_type: pipeline +lint: + template_strings: False + files_unchanged: + - .github/ISSUE_TEMPLATE/bug_report.yml diff --git a/CHANGELOG.md b/CHANGELOG.md index 72746e18..b42791db 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,12 +5,113 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## v2.6.0dev - [date] -Initial release of nf-core/scrnaseq, created with the [nf-core](https://nf-co.re/) template. +- [[#91](https://github.com/nf-core/scrnaseq/issues/91)] - Change from pytests to nf-test -### `Added` +## v2.5.1 -### `Fixed` +- Template update to v2.12 ([#298](https://github.com/nf-core/scrnaseq/pull/298)). +- Fix that cellranger workflow couldn't be run and enable CI for this workflow ([#288](https://github.com/nf-core/scrnaseq/pull/288)). +- Update modules ([#288]()https://github.com/nf-core/scrnaseq/pull/288). -### `Dependencies` +## v2.5.0 - 2024-01-02 -### `Deprecated` +- Update template to v2.11.1 ([#279](https://github.com/nf-core/scrnaseq/pull/279)) +- Add support for paired GEX+ATAC sequencing using cellranger-arc ([#274](https://github.com/nf-core/scrnaseq/pull/274)) +- Increase default runtime limits for some processes ([#281](https://github.com/nf-core/scrnaseq/pull/281), [#284](https://github.com/nf-core/scrnaseq/pull/284)) +- Better support for custom protocols ([#273](https://github.com/nf-core/scrnaseq/pull/273)). + - The universc protocol is now specified via the `--protocol` flag + - Any protocol specified is now passed to the respective aligner + - Added a section to the documentation + +## v2.4.1 - 2023-09-28 + +- Fix whitelist logic for dropseq ([#267](https://github.com/nf-core/scrnaseq/pull/267)) +- Fix false-positive filename check in cellranger module ([#261](https://github.com/nf-core/scrnaseq/pull/261)) +- Template update to v2.10 ([#269](https://github.com/nf-core/scrnaseq/pull/269)) + +## v2.4.0 - 2023-08-16 Lime Platinum Crab + +- Fix typo causing empty version imformation for mtx_conversion subworkflow ([#254](https://github.com/nf-core/scrnaseq/pull/254)) +- Add `singularity.registry = 'quay.io'` and bump NF version to 23.04.0 ([#237](https://github.com/nf-core/scrnaseq/pull/237)) +- Fixed issue with file collisions while using cellranger ([#232](https://github.com/nf-core/scrnaseq/pull/232)) +- Fix issue where multiqc inputs tried to access objects that did not exist ([#239](https://github.com/nf-core/scrnaseq/pull/239)) +- Removed `public_aws_ecr` profile ([#242](https://github.com/nf-core/scrnaseq/pull/242)) +- Include cellranger in MultiQC report ([#244](https://github.com/nf-core/scrnaseq/pull/244)) +- Nf-core template update to v2.9 ([#245](https://github.com/nf-core/scrnaseq/pull/245)) +- Update cellranger and fastqc module ([#246](https://github.com/nf-core/scrnaseq/pull/246)). + The [updated cellranger module](https://github.com/nf-core/modules/pull/3537) now automatically renames input FASTQ + files to match the expected naming conventions. + +## v2.3.2 - 2023-06-07 Sepia Samarium Salmon + +- Move containers for pipeline to quay.io ([#233](https://github.com/nf-core/scrnaseq/pull/233)) + +## v2.3.1 - 2023-06-02 Yellow Strontium Pinscher + +- Add `public_aws_ecr` config for using the AWS mirror of containers where possible ([#225](https://github.com/nf-core/scrnaseq/pull/225)) + +## v2.3.0 Steelblue Waspaloy Dachshund + +- Fix problem on samplesheet check related to amount of columns ([[#211](https://github.com/nf-core/scrnaseq/issues/211)]) +- Fixed bug in starsolo output cardinality. + +## v2.2.0 + +- Added support to output 10x count files in text format. +- Add gene symbols to count matrices +- Added UniverSC aligner to implement open-source version of Cell Ranger with wrapper for 40 technologies +- Update cellranger to v7.1.0 ([#205](https://github.com/nf-core/scrnaseq/pull/205)). + +### Fixes + +- Autocanceling previous CI runs when new changes are pushed. +- Fixed [#193](https://github.com/nf-core/scrnaseq/issues/177) by updating the Seurat container directive +- Fixed [#177](https://github.com/nf-core/scrnaseq/issues/177) by adjusting the channels generation and usage when skipping fastqc +- Fixed [#173](https://github.com/nf-core/scrnaseq/issues/173) by adjusting parameter type and adding them to modules.config +- Fixed [#170](https://github.com/nf-core/scrnaseq/issues/170) by adding UniverSC subworkflow using new module +- Fixed [#196](https://github.com/nf-core/scrnaseq/issues/196) by adjusting runtime requirements for AlevinQC +- Fixed [#191](https://github.com/nf-core/scrnaseq/issues/191) by updating simpleAF containers to latest version + +## v2.1.0 - 2022-10-06 "Green Mercury Siberian Husky" + +- Alevin workflow updated to use Alevin-Fry via simpleaf - thanks to @rob-p for supporting this and @fmalmeida implementing the support + +### Fixes + +- Fixed Kallistobustools workflow [#123](https://github.com/nf-core/scrnaseq/issues/123) by upgrading to nf-core/modules module +- Fixed matrix conversion error when running STAR with --soloFeatures GeneFull [#135](https://github.com/nf-core/scrnaseq/pull/135) +- Fixed seurat matrix conversion error when running with conda profile [#136](https://github.com/nf-core/scrnaseq/pull/136) +- Fixed Kallistobustools module [#116](https://github.com/nf-core/scrnaseq/issues/116). By updating nf-core module and making sure conversion modules take into account the different outputs produced by kallisto standard and non-standard workflows. +- Updated pipeline template to [nf-core/tools 2.6](https://github.com/nf-core/tools/releases/tag/2.6) + +## v2.0.0 - 2022-06-17 "Gray Nickel Beagle" + +- Pipeline ported to dsl2 +- Template update with latest nf-core/tools v2.1 +- Added cellranger v.7.0.0 subworkflow +- Added full size tests + +### Fixes + +- Make sure pipeline runs on multiple samples [#77](https://github.com/nf-core/scrnaseq/pull/77) +- Fix issue where STARsolo always uses 10XV2 chemistry [#60](https://github.com/nf-core/scrnaseq/issues/60) + +## v1.1.0 - 2021-03-24 "Olive Mercury Corgi" + +- Template update with latest nf-core/tools v1.13.2 +- Parameters JSON Schema added [#42](https://github.com/nf-core/scrnaseq/issues/42) +- [25](https://github.com/nf-core/scrnaseq/issues/25) Fix small documentation error with wrong parameter for txp2gene + +### Fixes + +- [#20](https://github.com/nf-core/scrnaseq/issues/20) Fix Transcriptome Fasta argument not detected well +- [#21](https://github.com/nf-core/scrnaseq/issues/21) Fix `--kallisto_index` being ignored + +## v1.0.0 - 2019-11-28 "Tiny Aluminium Crab" + +Initial release of nf-core/scrnaseq, created with the [nf-core](http://nf-co.re/) template. +This includes the following workflow options: + +- Salmon Alevin + AlevinQC +- STARSolo +- Kallisto / BUStools diff --git a/CITATIONS.md b/CITATIONS.md index bd70fa4e..867bde34 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -18,6 +18,21 @@ > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. +* [Alevin](https://doi.org/10.1186/s13059-019-1670-y) + + > Srivastava, A., Malik, L., Smith, T. et al. Alevin efficiently estimates accurate gene abundances from dscRNA-seq data. Genome Biol 20, 65 (2019). + +* [Salmon](https://www.nature.com/articles/nmeth.4197) + + > Patro, R., Duggal, G., Love, M. et al. Salmon provides fast and bias-aware quantification of transcript expression. Nat Methods 14, 417–419 (2017). + +* [Kallisto/Bustools](https://www.nature.com/articles/s41587-021-00870-2) + + > Melsted, P., Booeshaghi, A.S., Liu, L. et al. Modular, efficient and constant-memory single-cell RNA-seq preprocessing. Nat Biotechnol 39, 813–818 (2021). + +* [StarSolo](https://www.biorxiv.org/content/10.1101/2021.05.05.442755v1) + > Benjamin Kaminow, Dinar Yunusov, Alexander Dobin. STARsolo: accurate, fast and versatile mapping/quantification of single-cell and single-nucleus RNA-seq data. BioRxiv 2021.05.05.442755 (2021). + ## Software packaging/containerisation tools - [Anaconda](https://anaconda.com) diff --git a/README.md b/README.md index d7e24902..23061375 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,10 @@ -[![GitHub Actions CI Status](https://github.com/nf-core/scrnaseq/actions/workflows/ci.yml/badge.svg)](https://github.com/nf-core/scrnaseq/actions/workflows/ci.yml) -[![GitHub Actions Linting Status](https://github.com/nf-core/scrnaseq/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/scrnaseq/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/scrnaseq/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.XXXXXXX-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.XXXXXXX) +[![GitHub Actions CI Status](https://github.com/nf-core/scrnaseq/workflows/nf-core%20CI/badge.svg)](https://github.com/nf-core/scrnaseq/actions?query=workflow%3A%22nf-core+CI%22) +[![GitHub Actions Linting Status](https://github.com/nf-core/scrnaseq/workflows/nf-core%20linting/badge.svg)](https://github.com/nf-core/scrnaseq/actions?query=workflow%3A%22nf-core+linting%22) +[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/scrnaseq/results) +[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.3568187-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.3568187) [![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com) [![Nextflow](https://img.shields.io/badge/nextflow%20DSL2-%E2%89%A523.04.0-23aa62.svg)](https://www.nextflow.io/) @@ -19,50 +21,49 @@ ## Introduction -**nf-core/scrnaseq** is a bioinformatics pipeline that ... +**nf-core/scrnaseq** is a bioinformatics best-practice analysis pipeline for processing 10x Genomics single-cell RNA-seq data. - +This is a community effort in building a pipeline capable to support: - - +- Alevin-Fry + AlevinQC +- STARSolo +- Kallisto + BUStools +- Cellranger +- UniverSC -1. Read QC ([`FastQC`](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/)) -2. Present QC for raw reads ([`MultiQC`](http://multiqc.info/)) +## Documentation + +The nf-core/scrnaseq pipeline comes with documentation about the pipeline [usage](https://nf-co.re/scrnaseq/usage), [parameters](https://nf-co.re/scrnaseq/parameters) and [output](https://nf-co.re/scrnaseq/output). + +![scrnaseq workflow](docs/images/scrnaseq_pipeline_v1.0_metro_clean.png) ## Usage > [!NOTE] > If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data. - - Now, you can run the pipeline using: - - ```bash nextflow run nf-core/scrnaseq \ -profile \ --input samplesheet.csv \ + --genome_fasta GRCm38.p6.genome.chr19.fa \ + --gtf gencode.vM19.annotation.chr19.gtf \ + --protocol 10XV2 \ + --aligner \ --outdir ``` @@ -72,6 +73,21 @@ nextflow run nf-core/scrnaseq \ For more details and further functionality, please refer to the [usage documentation](https://nf-co.re/scrnaseq/usage) and the [parameter documentation](https://nf-co.re/scrnaseq/parameters). +## Decision Tree for users + +The nf-core/scrnaseq pipeline features several paths to analyze your single cell data. Future additions will also be done soon, e.g. the addition of multi-ome analysis types. To aid users in analyzing their data, we have added a decision tree to help people decide on what type of analysis they want to run and how to choose appropriate parameters for that. + +```mermaid +graph TD + A[sc RNA] -->|alevin-fry| B(h5ad/seurat/mtx matrices) + A[sc RNA] -->|CellRanger| B(h5ad/seurat/mtx matrices) + A[sc RNA] -->|kbpython| B(h5ad/seurat/mtx matrices) + A[sc RNA] -->|STARsolo| B(h5ad/seurat/mtx matrices) + A[sc RNA] -->|Universc| B(h5ad/seurat/mtx matrices) +``` + +Options for the respective alignment method can be found [here](https://github.com/nf-core/scrnaseq/blob/dev/docs/usage.md#aligning-options) to choose between methods. + ## Pipeline output To see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/scrnaseq/results) tab on the nf-core website pipeline page. @@ -80,11 +96,15 @@ For more details about the output files and reports, please refer to the ## Credits -nf-core/scrnaseq was originally written by Bailey PJ, Botvinnik O, Marques de Almeida F, Peltzer A, Sturm G. +nf-core/scrnaseq was originally written by Bailey PJ, Botvinnik O, Marques de Almeida F, Gabernet G, Peltzer A, Sturm G. -We thank the following people for their extensive assistance in the development of this pipeline: +We thank the following people and teams for their extensive assistance in the development of this pipeline: - +- @heylf +- @KevinMenden +- @FloWuenne +- @rob-p +- [GHGA](https://www.ghga.de/) ## Contributions and Support @@ -94,17 +114,10 @@ For further information or help, don't hesitate to get in touch on the [Slack `# ## Citations - - +If you use nf-core/scrnaseq for your analysis, please cite it using the following doi: [10.5281/zenodo.3568187](https://doi.org/10.5281/zenodo.3568187) - +The basic benchmarks that were used as motivation for incorporating the three available modular workflows can be found in [this publication](https://www.biorxiv.org/content/10.1101/673285v2). -An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. - -You can cite the `nf-core` publication as follows: +We offer all three paths for the processing of scRNAseq data so it remains up to the user to decide which pipeline workflow is chosen for a particular analysis question. -> **The nf-core framework for community-curated bioinformatics pipelines.** -> -> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen. -> -> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x). +An extensive list of references for the tools used by the pipeline can be found in the [`CITATIONS.md`](CITATIONS.md) file. diff --git a/assets/methods_description_template.yml b/assets/methods_description_template.yml index c79ee0c1..107f3120 100644 --- a/assets/methods_description_template.yml +++ b/assets/methods_description_template.yml @@ -3,7 +3,6 @@ description: "Suggested text and references to use when describing pipeline usag section_name: "nf-core/scrnaseq Methods Description" section_href: "https://github.com/nf-core/scrnaseq" plot_type: "html" -## TODO nf-core: Update the HTML below to your preferred methods description, e.g. add publication citation for this pipeline ## You inject any metadata in the Nextflow '${workflow}' object data: |

Methods

diff --git a/assets/protocols.json b/assets/protocols.json new file mode 100644 index 00000000..23ff1328 --- /dev/null +++ b/assets/protocols.json @@ -0,0 +1,90 @@ +{ + "alevin": { + "10XV1": { + "protocol": "10xv1", + "whitelist": "assets/whitelist/10x_V1_barcode_whitelist.txt.gz" + }, + "10XV2": { + "protocol": "10xv2", + "whitelist": "assets/whitelist/10x_V2_barcode_whitelist.txt.gz" + }, + "10XV3": { + "protocol": "10xv3", + "whitelist": "assets/whitelist/10x_V3_barcode_whitelist.txt.gz" + }, + "dropseq": { + "protocol": "dropseq" + } + }, + "cellranger": { + "auto": { + "protocol": "auto" + }, + "10XV1": { + "protocol": "SC3Pv1" + }, + "10XV2": { + "protocol": "SC3Pv2" + }, + "10XV3": { + "protocol": "SC3Pv3" + } + }, + "star": { + "10XV1": { + "protocol": "CB_UMI_Simple", + "extra_args": "--soloUMIlen 10", + "whitelist": "assets/whitelist/10x_V1_barcode_whitelist.txt.gz" + }, + "10XV2": { + "protocol": "CB_UMI_Simple", + "extra_args": "--soloUMIlen 10", + "whitelist": "assets/whitelist/10x_V2_barcode_whitelist.txt.gz" + }, + "10XV3": { + "protocol": "CB_UMI_Simple", + "extra_args": "--soloUMIlen 12", + "whitelist": "assets/whitelist/10x_V3_barcode_whitelist.txt.gz" + }, + "dropseq": { + "protocol": "CB_UMI_Simple" + }, + "smartseq": { + "protocol": "SmartSeq" + } + }, + "kallisto": { + "10XV1": { + "protocol": "10XV1" + }, + "10XV2": { + "protocol": "10XV2" + }, + "10XV3": { + "protocol": "10XV3" + }, + "dropseq": { + "protocol": "DROPSEQ" + }, + "smartseq": { + "protocol": "SMARTSEQ" + } + }, + "universc": { + "auto": { + "protocol": "10x" + }, + "10XV1": { + "protocol": "10x-v1" + }, + "10XV2": { + "protocol": "10x-v2" + }, + "10XV3": { + "protocol": "10x-v3" + }, + "dropseq": { + "protocol": "dropseq" + } + } +} diff --git a/assets/samplesheet.csv b/assets/samplesheet.csv index 5f653ab7..d4623eea 100644 --- a/assets/samplesheet.csv +++ b/assets/samplesheet.csv @@ -1,3 +1,4 @@ -sample,fastq_1,fastq_2 -SAMPLE_PAIRED_END,/path/to/fastq/files/AEG588A1_S1_L002_R1_001.fastq.gz,/path/to/fastq/files/AEG588A1_S1_L002_R2_001.fastq.gz -SAMPLE_SINGLE_END,/path/to/fastq/files/AEG588A4_S4_L003_R1_001.fastq.gz, +sample,fastq_1,fastq_2,protocol,expected_cells +Sample_X,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R1_001.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_X_S1_L001_R2_001.fastq.gz,"10xV2","5000" +Sample_Y,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R1_001.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L001_R2_001.fastq.gz,"10xV2","5000" +Sample_Y,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R1_001.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/testdata/cellranger/Sample_Y_S1_L002_R2_001.fastq.gz,"10xV2","5000" diff --git a/assets/whitelist/10x_V1_barcode_whitelist.txt.gz b/assets/whitelist/10x_V1_barcode_whitelist.txt.gz new file mode 100644 index 00000000..bf838d62 Binary files /dev/null and b/assets/whitelist/10x_V1_barcode_whitelist.txt.gz differ diff --git a/assets/whitelist/10x_V2_barcode_whitelist.txt.gz b/assets/whitelist/10x_V2_barcode_whitelist.txt.gz new file mode 100644 index 00000000..e9203f21 Binary files /dev/null and b/assets/whitelist/10x_V2_barcode_whitelist.txt.gz differ diff --git a/assets/whitelist/10x_V3_barcode_whitelist.txt.gz b/assets/whitelist/10x_V3_barcode_whitelist.txt.gz new file mode 100644 index 00000000..836b665d Binary files /dev/null and b/assets/whitelist/10x_V3_barcode_whitelist.txt.gz differ diff --git a/bin/alevin_qc.r b/bin/alevin_qc.r new file mode 100755 index 00000000..ec1362f9 --- /dev/null +++ b/bin/alevin_qc.r @@ -0,0 +1,20 @@ +#!/usr/bin/env Rscript + +# Command line argument processing +args <- commandArgs(trailingOnly=TRUE) + +if (length(args) < 3) { + stop("Usage: alevin_qc.r ", call.=FALSE) +} + +require(alevinQC) +require(tximport) + +baseDir <- args[1] +sampleId <- args[2] +outDir <- args[3] + +alevinQCReport(baseDir = baseDir, sampleId = sampleId, + outputFile = "alevinReport.html", + outputFormat = "html_document", + outputDir = outDir, forceOverwrite = TRUE) diff --git a/bin/concat_h5ad.py b/bin/concat_h5ad.py new file mode 100755 index 00000000..43ea071a --- /dev/null +++ b/bin/concat_h5ad.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python + +# Set numba chache dir to current working directory (which is a writable mount also in containers) +import os + +os.environ["NUMBA_CACHE_DIR"] = "." + +import scanpy as sc, anndata as ad, pandas as pd +from pathlib import Path +import argparse + + +def read_samplesheet(samplesheet): + df = pd.read_csv(samplesheet) + df.set_index("sample") + + # samplesheet may contain replicates, when it has, + # group information from replicates and collapse with commas + # only keep unique values using set() + df = df.groupby(["sample"]).agg(lambda column: ",".join(set(column))) + + return df + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Concatenates h5ad files and merge metadata from samplesheet") + + parser.add_argument("-i", "--input", dest="input", help="Path to samplesheet.csv") + parser.add_argument("-o", "--out", dest="out", help="Output path.") + parser.add_argument( + "-s", + "--suffix", + dest="suffix", + help="Suffix of matrices to remove and get sample name", + ) + + args = vars(parser.parse_args()) + + # Open samplesheet as dataframe + df_samplesheet = read_samplesheet(args["input"]) + + # find all h5ad and append to dict + dict_of_h5ad = {str(path).replace(args["suffix"], ""): sc.read_h5ad(path) for path in Path(".").rglob("*.h5ad")} + + # concat h5ad files + adata = ad.concat(dict_of_h5ad, label="sample", merge="unique", index_unique="_") + + # merge with data.frame, on sample information + adata.obs = adata.obs.join(df_samplesheet, on="sample") + adata.write_h5ad(args["out"], compression="gzip") + + print("Wrote h5ad file to {}".format(args["out"])) diff --git a/bin/filter_gtf_for_genes_in_genome.py b/bin/filter_gtf_for_genes_in_genome.py new file mode 100755 index 00000000..485f8392 --- /dev/null +++ b/bin/filter_gtf_for_genes_in_genome.py @@ -0,0 +1,81 @@ +#!/usr/bin/env python +# Script originally written by Pranathi Vemuri (github.com/pranathivemuri) +# modified by Harshil Patel (github.com/drpatelh) + +from __future__ import print_function +import logging +from itertools import groupby +import argparse + +# Create a logger +logging.basicConfig(format="%(name)s - %(asctime)s %(levelname)s: %(message)s") +logger = logging.getLogger(__file__) +logger.setLevel(logging.INFO) + + +def is_header(line): + return line[0] == ">" + + +def extract_fasta_seq_names(fasta_name): + """ + modified from Brent Pedersen + Correct Way To Parse A Fasta File In Python + given a fasta file. yield tuples of header, sequence + from https://www.biostars.org/p/710/ + """ + # first open the file outside + fh = open(fasta_name) + + # ditch the boolean (x[0]) and just keep the header or sequence since + # we know they alternate. + faiter = (x[1] for x in groupby(fh, is_header)) + + for i, header in enumerate(faiter): + line = next(header) + if is_header(line): + # drop the ">" + headerStr = line[1:].strip().split()[0] + yield headerStr + + +def extract_genes_in_genome(fasta, gtf_in, gtf_out): + seq_names_in_genome = set(extract_fasta_seq_names(fasta)) + logger.info("Extracted chromosome sequence names from : %s" % fasta) + logger.info("All chromosome names: " + ", ".join(sorted(x for x in seq_names_in_genome))) + seq_names_in_gtf = set([]) + + n_total_lines = 0 + n_lines_in_genome = 0 + with open(gtf_out, "w") as f: + with open(gtf_in) as g: + for line in g.readlines(): + n_total_lines += 1 + seq_name_gtf = line.split("\t")[0] + seq_names_in_gtf.add(seq_name_gtf) + if seq_name_gtf in seq_names_in_genome: + n_lines_in_genome += 1 + f.write(line) + logger.info( + "Extracted %d / %d lines from %s matching sequences in %s" % (n_lines_in_genome, n_total_lines, gtf_in, fasta) + ) + logger.info("All sequence IDs from GTF: " + ", ".join(sorted(x for x in seq_name_gtf))) + + logger.info("Wrote matching lines to %s" % gtf_out) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="""Filter GTF only for features in the genome""") + parser.add_argument("--gtf", type=str, help="GTF file") + parser.add_argument("--fasta", type=str, help="Genome fasta file") + parser.add_argument( + "-o", + "--output", + dest="output", + default="genes_in_genome.gtf", + type=str, + help="GTF features on fasta genome sequences", + ) + + args = parser.parse_args() + extract_genes_in_genome(args.fasta, args.gtf, args.output) diff --git a/bin/generate_lib_csv.py b/bin/generate_lib_csv.py new file mode 100755 index 00000000..5c1c0c4f --- /dev/null +++ b/bin/generate_lib_csv.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +import argparse +import os + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Generate the lib.csv for cellranger-arc.") + + parser.add_argument("-t", "--sample_types", dest="sample_types", help="Comma seperated list of sample types.") + parser.add_argument("-n", "--sample_names", dest="sample_names", help="Comma seperated list of sample names.") + parser.add_argument("-f", "--fastq_folder", dest="fastq_folder", help="Folder of FASTQ files.") + parser.add_argument("-o", "--out", dest="out", help="Output path.") + + args = vars(parser.parse_args()) + + print(args) + + sample_types = args["sample_types"].split(",") + sample_names = args["sample_names"].split(",") + unique_samples_names = set(sample_names) + + lib_csv = open(args["out"], "w") + lib_csv.write("fastqs,sample,library_type") + + for i in range(0, len(sample_types)): + if sample_names[i] in unique_samples_names: + unique_samples_names.remove( + sample_names[i] + ) # this has to be done to account for different Lane files (e.g., L002) + if sample_types[i] == "gex": + lib_csv.write("\n{},{},{}".format(args["fastq_folder"], sample_names[i], "Gene Expression")) + else: + lib_csv.write("\n{},{},{}".format(args["fastq_folder"], sample_names[i], "Chromatin Accessibility")) + + lib_csv.close() + + print("Wrote lib.csv file to {}".format(args["out"])) diff --git a/bin/mtx_to_h5ad.py b/bin/mtx_to_h5ad.py new file mode 100755 index 00000000..3282122d --- /dev/null +++ b/bin/mtx_to_h5ad.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python + +# Set numba chache dir to current working directory (which is a writable mount also in containers) +import os + +os.environ["NUMBA_CACHE_DIR"] = "." + +import scanpy as sc +import pandas as pd +import argparse +from scipy import io +from anndata import AnnData + + +def _10x_h5_to_adata(mtx_h5: str, sample: str): + adata = sc.read_10x_h5(mtx_h5) + adata.var["gene_symbols"] = adata.var_names + adata.var.set_index("gene_ids", inplace=True) + adata.obs["sample"] = sample + + # reorder columns for 10x mtx files + adata.var = adata.var[["gene_symbols", "feature_types", "genome"]] + + return adata + + +def _mtx_to_adata( + mtx_file: str, + barcode_file: str, + feature_file: str, + sample: str, + aligner: str, +): + adata = sc.read_mtx(mtx_file) + if ( + aligner == "star" + ): # for some reason star matrix comes transposed and doesn't fit when values are appended directly + adata = adata.transpose() + + adata.obs_names = pd.read_csv(barcode_file, header=None, sep="\t")[0].values + adata.var_names = pd.read_csv(feature_file, header=None, sep="\t")[0].values + adata.obs["sample"] = sample + + return adata + + +def input_to_adata( + input_data: str, + barcode_file: str, + feature_file: str, + sample: str, + aligner: str, + txp2gene: str, + star_index: str, + verbose: bool = True, +): + if verbose and (txp2gene or star_index): + print("Reading in {}".format(input_data)) + + if aligner == "cellranger": + adata = _10x_h5_to_adata(input_data, sample) + else: + adata = _mtx_to_adata(input_data, barcode_file, feature_file, sample, aligner) + + if verbose and (txp2gene or star_index): + print("Reading in {}".format(txp2gene)) + + if txp2gene: + t2g = pd.read_table(txp2gene, header=None, names=["gene_id", "gene_symbol"], usecols=[1, 2]) + elif star_index: + t2g = pd.read_table( + f"{star_index}/geneInfo.tab", header=None, skiprows=1, names=["gene_id", "gene_symbol"], usecols=[0, 1] + ) + + if txp2gene or star_index: + t2g = t2g.drop_duplicates(subset="gene_id").set_index("gene_id") + adata.var["gene_symbol"] = t2g["gene_symbol"] + + return adata + + +def write_counts( + adata: AnnData, + out: str, + verbose: bool = False, +): + pd.DataFrame(adata.obs.index).to_csv(os.path.join(out, "barcodes.tsv"), sep="\t", index=False, header=None) + pd.DataFrame(adata.var).to_csv(os.path.join(out, "features.tsv"), sep="\t", index=True, header=None) + io.mmwrite(os.path.join(out, "matrix.mtx"), adata.X.T, field="integer") + + if verbose: + print("Wrote features.tsv, barcodes.tsv, and matrix.mtx files to {}".format(args["out"])) + + +def dump_versions(task_process): + import pkg_resources + + with open("versions.yml", "w") as f: + f.write(f"{task_process}:\n\t") + f.write("\n\t".join([f"{pkg.key}: {pkg.version}" for pkg in pkg_resources.working_set])) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Converts mtx output to h5ad.") + + parser.add_argument("-i", "--input_data", dest="input_data", help="Path to either mtx or mtx h5 file.") + parser.add_argument("-v", "--verbose", dest="verbose", help="Toggle verbose messages", default=False) + parser.add_argument("-f", "--feature", dest="feature", help="Path to feature file.", nargs="?", const="") + parser.add_argument("-b", "--barcode", dest="barcode", help="Path to barcode file.", nargs="?", const="") + parser.add_argument("-s", "--sample", dest="sample", help="Sample name") + parser.add_argument("-o", "--out", dest="out", help="Output path.") + parser.add_argument("-a", "--aligner", dest="aligner", help="Which aligner has been used?") + parser.add_argument("--task_process", dest="task_process", help="Task process name.") + parser.add_argument("--txp2gene", dest="txp2gene", help="Transcript to gene (t2g) file.", nargs="?", const="") + parser.add_argument( + "--star_index", dest="star_index", help="Star index folder containing geneInfo.tab.", nargs="?", const="" + ) + + args = vars(parser.parse_args()) + + # create the directory with the sample name + os.makedirs(os.path.dirname(args["out"]), exist_ok=True) + + adata = input_to_adata( + input_data=args["input_data"], + barcode_file=args["barcode"], + feature_file=args["feature"], + sample=args["sample"], + aligner=args["aligner"], + txp2gene=args["txp2gene"], + star_index=args["star_index"], + verbose=args["verbose"], + ) + + write_counts(adata=adata, out=args["sample"], verbose=args["verbose"]) + + adata.write_h5ad(args["out"], compression="gzip") + + print("Wrote h5ad file to {}".format(args["out"])) + + dump_versions(task_process=args["task_process"]) diff --git a/bin/mtx_to_seurat.R b/bin/mtx_to_seurat.R new file mode 100755 index 00000000..f2680838 --- /dev/null +++ b/bin/mtx_to_seurat.R @@ -0,0 +1,37 @@ +#!/usr/bin/env Rscript +library(Seurat) + +args <- commandArgs(trailingOnly=TRUE) + +mtx_file <- args[1] +barcode_file <- args[2] +feature_file <- args[3] +out.file <- args[4] +aligner <- args[5] + +if(aligner %in% c("kallisto", "alevin")) { + # for kallisto and alevin, the features file contains only one column and matrix needs to be transposed + expression.matrix <- ReadMtx( + mtx = mtx_file, features = feature_file, cells = barcode_file, feature.column = 1, mtx.transpose = TRUE + ) +} else { + expression.matrix <- ReadMtx( + mtx = mtx_file, features = feature_file, cells = barcode_file + ) +} + +seurat.object <- CreateSeuratObject(counts = expression.matrix) + +dir.create(basename(dirname(out.file)), showWarnings = FALSE) + +saveRDS(seurat.object, file = out.file) + + +yaml::write_yaml( +list( + 'MTX_TO_SEURAT'=list( + 'Seurat' = paste(packageVersion('Seurat'), collapse='.') + ) +), +"versions.yml" +) diff --git a/bin/postprocessing.r b/bin/postprocessing.r new file mode 100755 index 00000000..556f82e2 --- /dev/null +++ b/bin/postprocessing.r @@ -0,0 +1,50 @@ + +args = commandArgs(trailingOnly=TRUE) +if (length(args) < 2) { + stop("Usage: postprocessing.r ", call.=FALSE) +} + +base.path <- args[1] + + +# Parts of the function is taken from Seurat's Read10x parsing function +ReadAlevin <- function( base.path = NULL ){ + if (! dir.exists(base.path )){ + stop("Directory provided does not exist") + } + + barcode.loc <- paste0( base.path, "alevin/quants_mat_rows.txt" ) + gene.loc <- paste0( base.path, "alevin/quants_mat_cols.txt" ) + matrix.loc <- paste0( base.path, "alevin/quants_mat.csv" ) + if (!file.exists( barcode.loc )){ + stop("Barcode file missing") + } + if (! file.exists(gene.loc) ){ + stop("Gene name file missing") + } + if (! file.exists(matrix.loc )){ + stop("Expression matrix file missing") + } + matrix <- as.matrix(read.csv( matrix.loc, header=FALSE)) + matrix <- t(matrix[,1:ncol(matrix)-1]) + + cell.names <- readLines( barcode.loc ) + gene.names <- readLines( gene.loc ) + + colnames(matrix) <- cell.names + rownames(matrix) <- gene.names + matrix[is.na(matrix)] <- 0 + return(matrix) +} + +require("seurat") + +alv.data <- ReadAlevin(base.path) +dat <- CreateSeuratObject(raw.data = alv.data, min.cells = 3, min.genes = 200, project = "10X_rnaseq") +dat <- NormalizeData(object = dat, normalization.method = "LogNormalize", scale.factor = 10000) +dat <- FindVariableGenes(object = dat, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.0125, x.high.cutoff = 3, y.cutoff = 0.5) +dat <- ScaleData(object = dat) +dat <- RunPCA(object = dat, pc.genes = dat@var.genes, do.print = TRUE, pcs.print = 1:5, genes.print = 5) +dat <- FindClusters(object = dat, reduction.type = "pca", dims.use = 1:10, resolution = 0.6, print.output = 0, save.SNN = TRUE) +dat <- RunTSNE(object = dat, dims.use = 1:10, do.fast = TRUE) +TSNEPlot(object = dat) diff --git a/bin/t2g.py b/bin/t2g.py new file mode 100755 index 00000000..2e1953c4 --- /dev/null +++ b/bin/t2g.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python +# This was downloaded on 2019-06-23 from https://github.com/bustools/getting_started/releases/ +# All credit goes to the original authors from the Kallisto/BUStools team! +# BSD 2-Clause License +# +# Copyright (c) 2017, Nicolas Bray, Harold Pimentel, Páll Melsted and Lior Pachter +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import sys, argparse + + +def create_transcript_list(input, use_name=True, use_version=False): + r = {} + for line in input: + if len(line) == 0 or line[0] == "#": + continue + l = line.strip().split("\t") + if l[2] == "transcript": + info = l[8] + d = {} + for x in info.split("; "): + x = x.strip() + p = x.find(" ") + if p == -1: + continue + k = x[:p] + p = x.find('"', p) + p2 = x.find('"', p + 1) + v = x[p + 1 : p2] + d[k] = v + + if "transcript_id" not in d or "gene_id" not in d: + continue + + tid = d["transcript_id"].split(".")[0] + gid = d["gene_id"].split(".")[0] + if use_version: + if "transcript_version" not in d or "gene_version" not in d: + continue + + tid += "." + d["transcript_version"] + gid += "." + d["gene_version"] + gname = None + if use_name: + if "gene_name" not in d: + continue + gname = d["gene_name"] + + if tid in r: + continue + + r[tid] = (gid, gname) + return r + + +def print_output(output, r, use_name=True): + for tid in r: + if use_name: + output.write("%s\t%s\t%s\n" % (tid, r[tid][0], r[tid][1])) + else: + output.write("%s\t%s\n" % (tid, r[tid][0])) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + add_help=True, + description="Creates transcript to gene info from GTF files\nreads from standard input and writes to standard output", + ) + parser.add_argument( + "--use_version", + "-v", + action="store_true", + help="Use version numbers in transcript and gene ids", + ) + parser.add_argument("--skip_gene_names", "-s", action="store_true", help="Do not output gene names") + args = parser.parse_args() + + input = sys.stdin + r = create_transcript_list(input, use_name=not args.skip_gene_names, use_version=args.use_version) + output = sys.stdout + print_output(output, r) diff --git a/conf/base.config b/conf/base.config index d6df6f7e..eea930a9 100644 --- a/conf/base.config +++ b/conf/base.config @@ -10,7 +10,6 @@ process { - // TODO nf-core: Check the defaults for all processes cpus = { check_max( 1 * task.attempt, 'cpus' ) } memory = { check_max( 6.GB * task.attempt, 'memory' ) } time = { check_max( 4.h * task.attempt, 'time' ) } @@ -24,7 +23,6 @@ process { // These labels are used and recognised by default in DSL2 files hosted on nf-core/modules. // If possible, it would be nice to keep the same label naming convention when // adding in your local modules too. - // TODO nf-core: Customise requirements for specific processes. // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors withLabel:process_single { cpus = { check_max( 1 , 'cpus' ) } diff --git a/conf/modules.config b/conf/modules.config index e3ea8fa6..d4f3d531 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -11,7 +11,6 @@ */ process { - publishDir = [ path: { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" }, mode: params.publish_dir_mode, @@ -20,8 +19,16 @@ process { withName: FASTQC { ext.args = '--quiet' + time = { check_max( 120.h * task.attempt, 'time' ) } } + withName: SAMPLESHEET_CHECK { + publishDir = [ + path: { "${params.outdir}/pipeline_info" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } withName: CUSTOM_DUMPSOFTWAREVERSIONS { publishDir = [ path: { "${params.outdir}/pipeline_info" }, @@ -39,4 +46,170 @@ process { ] } + withName: 'MTX_TO_H5AD|CONCAT_H5AD|MTX_TO_SEURAT' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/mtx_conversions" }, + mode: params.publish_dir_mode + ] + } + withName: 'GTF_GENE_FILTER' { + publishDir = [ + path: { "${params.outdir}/gtf_filter" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename }, + enabled: false + ] + } +} + +if(params.aligner == "cellranger") { + process { + withName: CELLRANGER_MKGTF { + publishDir = [ + path: "${params.outdir}/${params.aligner}/mkgtf", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene" + } + withName: CELLRANGER_MKREF { + publishDir = [ + path: "${params.outdir}/${params.aligner}/mkref", + mode: params.publish_dir_mode + ] + } + withName: CELLRANGER_COUNT { + publishDir = [ + path: "${params.outdir}/${params.aligner}/count", + mode: params.publish_dir_mode + ] + ext.args = {"--chemistry ${meta.chemistry} " + (meta.expected_cells ? "--expect-cells ${meta.expected_cells}" : '')} + time = { check_max( 240.h * task.attempt, 'time' ) } + } + } +} + +if(params.aligner == "cellrangerarc") { + process { + withName: CELLRANGERARC_MKGTF { + publishDir = [ + path: "${params.outdir}/${params.aligner}/mkgtf", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene" + } + withName: CELLRANGERARC_MKREF { + publishDir = [ + path: "${params.outdir}/${params.aligner}/mkref", + mode: params.publish_dir_mode + ] + } + withName: CELLRANGERARC_COUNT { + publishDir = [ + path: "${params.outdir}/${params.aligner}/count", + mode: params.publish_dir_mode + ] + ext.args = {meta.expected_cells ? "--expect-cells ${meta.expected_cells}" : ''} + time = { check_max( 240.h * task.attempt, 'time' ) } + } + } +} + +if(params.aligner == "universc") { + process { + publishDir = { "${params.outdir}/${task.process.tokenize(':')[-1].tokenize('_')[0].toLowerCase()}" } + + withName: CELLRANGER_MKGTF { + publishDir = [ + path: "${params.outdir}/cellranger/mkgtf", + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + ext.args = "--attribute=gene_biotype:protein_coding --attribute=gene_biotype:lncRNA --attribute=gene_biotype:pseudogene" + container = "nf-core/universc:1.2.5.1" + } + withName: CELLRANGER_MKREF { + publishDir = [ + path: "${params.outdir}/cellranger/mkref", + mode: params.publish_dir_mode + ] + container = "nf-core/universc:1.2.5.1" + } + withName: UNIVERSC { + publishDir = [ + path: "${params.outdir}/universc", + mode: params.publish_dir_mode + ] + time = { check_max( 240.h * task.attempt, 'time' ) } + } + } +} + +if (params.aligner == "alevin") { + process { + withName: GFFREAD_TXP2GENE { + ext.args = "--table transcript_id,gene_id" + ext.prefix = { "${gff.baseName}_gffread" } + } + withName: 'SIMPLEAF_INDEX' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + enabled: params.save_reference + ] + ext.args = { "--rlen ${params.simpleaf_rlen}" } + } + withName: 'SIMPLEAF_QUANT' { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode + ] + ext.args = "-r cr-like" + } + //Fix for issue 196 + withName: 'ALEVINQC' { + time = '120.h' + } + } +} + +if (params.aligner == "star") { + process { + withName: STAR_ALIGN { + ext.args = "--readFilesCommand zcat --runDirPerm All_RWX --outWigType bedGraph --twopassMode Basic --outSAMtype BAM SortedByCoordinate" + } + withName: STAR_GENOMEGENERATE { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/genome_generate" }, + mode: params.publish_dir_mode, + enabled: params.save_reference + ] + } + withName: STAR_ALIGN { + publishDir = [ + path: { "${params.outdir}/${params.aligner}/${meta.id}" }, + mode: params.publish_dir_mode + ] + } + } +} + +if (params.aligner == 'kallisto') { + process { + withName: KALLISTOBUSTOOLS_REF { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode, + enabled: params.save_reference + ] + } + withName: KALLISTOBUSTOOLS_COUNT { + publishDir = [ + path: { "${params.outdir}/${params.aligner}" }, + mode: params.publish_dir_mode + ] + ext.args = "--workflow ${params.kb_workflow}" + } + } } diff --git a/conf/test.config b/conf/test.config index 05a1d648..45ee54c8 100644 --- a/conf/test.config +++ b/conf/test.config @@ -20,10 +20,19 @@ params { max_time = '6.h' // Input data - // TODO nf-core: Specify the paths to your test data on nf-core/test-datasets - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv' + input = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/samplesheet-2-0.csv' // Genome references - genome = 'R64-1-1' + fasta = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/GRCm38.p6.genome.chr19.fa' + gtf = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/gencode.vM19.annotation.chr19.gtf' + aligner = 'star' + protocol = '10XV2' + + validationSchemaIgnoreParams = 'genomes' +} + +process { + withName: '.*:CELLRANGER_COUNT' { + maxForks = 1 + } } diff --git a/conf/test_full.config b/conf/test_full.config index e50f487c..dd838978 100644 --- a/conf/test_full.config +++ b/conf/test_full.config @@ -2,23 +2,25 @@ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Nextflow config file for running full-size tests ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - Defines input files and everything required to run a full size pipeline test. + Defines input files and everything required to run a fast and simple pipeline test. Use as follows: - nextflow run nf-core/scrnaseq -profile test_full, --outdir + nextflow run nf-core/scrnaseq -profile test, --outdir ---------------------------------------------------------------------------------------- */ params { - config_profile_name = 'Full test profile' + config_profile_name = 'Full Test profile' config_profile_description = 'Full test dataset to check pipeline function' - // Input data for full size test - // TODO nf-core: Specify the paths to your full test data ( on nf-core/test-datasets or directly in repositories, e.g. SRA) - // TODO nf-core: Give any required params for the test so that command line flags are not needed - input = 'https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_full_illumina_amplicon.csv' + // Input data + input = 'https://raw.githubusercontent.com/nf-core/test-datasets/scrnaseq/samplesheet_2.0_full.csv' // Genome references - genome = 'R64-1-1' + genome = 'GRCh38' + aligner = 'star' + protocol = '10XV2' + + validationSchemaIgnoreParams = 'genomes' } diff --git a/docs/images/nfcore-scrnaseq_logo.png b/docs/images/nfcore-scrnaseq_logo.png new file mode 100644 index 00000000..23bad886 Binary files /dev/null and b/docs/images/nfcore-scrnaseq_logo.png differ diff --git a/docs/images/rnaseq_pipeline_V1.0-metro_clean.svg b/docs/images/rnaseq_pipeline_V1.0-metro_clean.svg new file mode 100644 index 00000000..a831236f --- /dev/null +++ b/docs/images/rnaseq_pipeline_V1.0-metro_clean.svg @@ -0,0 +1,3471 @@ + +image/svg+xmlCell RangerGene BioType Filteringcellranger mkgtfGenerate Reference Indexcellranger mkrefGenerate Count Matrixcellranger countGene BioType Filteringcellranger-arc mkgtfGenerate Reference Indexcellranger-arc mkrefGenerate Count Matrixcellranger-arc countCell Ranger ArcGene BioType Filteringcellranger-atac mkgtfGenerate Reference Indexcellranger-atac mkrefGenerate Count Matrixcellranger-atac countCell Ranger ATACSalmon-AlevinGenerate Reference Indexsalmon indexQuality ControlalevinQCGenerate Count Matrixsalmon alevinKallistoGenerate Reference Indexkallistobustools refGenerate Count Matrixkallistobustools countSTARsoloGenerate Count MatrixSTARGenerate Reference IndexSTARCount Matrixh5adDemuliplexed Count Matrixh5adConvert MTX to H5ADDemultiplexingReporthtmlMultiQCRaw Data QCFastQCIdentify Genotypescellsnp-liteGenotypingvireocsvSamplesheetLegendStartEndInput:GenomefastagtfReferenceAnnotationgtfReferenceAnnotationGenomefasta+scRNA-SeqscATAC-SeqMultiomefastqCell RangerUniverSCGene BioType Filteringcellranger mkgtfGenerate Reference Indexcellranger mkrefGenerate Count Matrixcellranger countGene BioType Filteringcellranger mkgtfGenerate Reference Indexcellranger mkrefGenerate Count MatrixuniverscSalmon-AlevinQuality ControlalevinQCGenerate Reference Indexsalmon indexGenerate Count Matrixsalmon alevinKallistoGenerate Reference Indexkallistobustools refGenerate Count Matrixkallistobustools countSTARsoloGenerate Count MatrixSTARGenerate Reference IndexSTARCount Matrixh5adConvert MTX to H5ADReporthtmlMultiQCRaw Data QCFastQCcsvSamplesheetLegendStartEndInput:GenomefastagtfReferenceAnnotationgtfReferenceAnnotationGenomefasta+fastq diff --git a/docs/images/scrnaseq_pipeline_v1.0_metro_clean.png b/docs/images/scrnaseq_pipeline_v1.0_metro_clean.png new file mode 100644 index 00000000..74692e2f Binary files /dev/null and b/docs/images/scrnaseq_pipeline_v1.0_metro_clean.png differ diff --git a/docs/images/scrnaseq_pipeline_v2.0_metro_clean.png b/docs/images/scrnaseq_pipeline_v2.0_metro_clean.png new file mode 100644 index 00000000..7329ee16 Binary files /dev/null and b/docs/images/scrnaseq_pipeline_v2.0_metro_clean.png differ diff --git a/docs/output.md b/docs/output.md index 22a1a79f..7e9f0cd8 100644 --- a/docs/output.md +++ b/docs/output.md @@ -4,42 +4,147 @@ This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. - - - ## Pipeline overview The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps: -- [FastQC](#fastqc) - Raw read QC -- [MultiQC](#multiqc) - Aggregate report describing results and QC from the whole pipeline -- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution +- [nf-core/scrnaseq: Output](#nf-corescrnaseq-output) + - [:warning: Please read this documentation on the nf-core website: https://nf-co.re/scrnaseq/output](#warning-please-read-this-documentation-on-the-nf-core-website-httpsnf-corescrnaseqoutput) + - [Introduction](#introduction) + - [Pipeline overview](#pipeline-overview) + - [FastQC](#fastqc) + - [Kallisto & Bustools Results](#kallisto--bustools-results) + - [STARsolo](#starsolo) + - [Salmon Alevin & AlevinQC](#salmon-alevin--alevinqc) + - [Cellranger](#cellranger) + - [Cellranger ARC](#cellranger-arc) + - [UniverSC](#universc) + - [Other output data](#other-output-data) + - [MultiQC](#multiqc) + - [Pipeline information](#pipeline-information) -### FastQC +## FastQC -
-Output files +See [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc) for details about FastQC. -- `fastqc/` - - `*_fastqc.html`: FastQC report containing quality metrics. - - `*_fastqc.zip`: Zip archive containing the FastQC report, tab-delimited data file and plot images. +The pipeline analyzes the raw data and generates for each file a FastQC report. All report are collected in MultiQC. -
+**Output directory: `results/fastqc`** -[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/). +- `.html` + - Contains the FastQC report. +- `.zip` + - Contains additional information, such as individual plots, and FastQC raw data. -![MultiQC - FastQC sequence counts plot](images/mqc_fastqc_counts.png) +:::note +The FastQC plots displayed in the MultiQC report shows the unprocessed reads. Depending on the protocol, +they may contain barcodes, UMIs and adapter sequences. +::: -![MultiQC - FastQC mean quality scores plot](images/mqc_fastqc_quality.png) +## Kallisto & Bustools Results -![MultiQC - FastQC adapter content plot](images/mqc_fastqc_adapter.png) +See [Kallisto](https://pachterlab.github.io/kallisto/about) for details about Kallisto and [Bustools](https://bustools.github.io/) for more information on BusTools. -:::note -The FastQC plots displayed in the MultiQC report shows _untrimmed_ reads. They may contain adapter sequence and potentially regions with low quality. -::: +The pipeline can analyze data from single cell rnaseq experiments and generates a set of folders with respective outputs from various steps of the analysis. For a detailed summary what the pipeline does specifically, please follow the [excellent tutorial](https://www.kallistobus.tools/getting_started.html) that also describes specific steps for downstream analysis of the generated matrices. + +**Output directory: `results/kallisto`** + +- `raw_bus` + - Contains the unconverted BUS formatted pseudo aligned data +- `sort_bus` + - Contains the same BUS formatted data, sorted and corrected with the supplied barcode whitelist +- `kallisto_gene_map` + - Contains the converted GTF gene map that is used by BUSTools for downstream analysis +- `bustools_counts` + - Contains two subdirectories + - `eqcount`: Containing the Transcript Compatibility Count (TCC) Matrix (`tcc.mtx`) + - `genecount`: Containing the Gene Count Matrix (`gene.mtx`) +- `bustools_metrics` \* Contains the JSON metrics generated by BUStools + +For details on how to load these into R and perform further downstream analysis, please refer to the [BusTools HowTo](https://github.com/BUStools/getting_started/blob/master/getting_started.ipynb). + +**Output directory: `results/reference_genome`** + +- `kallisto_index` + - Contains the index of the supplied (genome/transcriptome) fasta file + +## STARsolo + +**Output directory: `results/star`** + +- Files will be organized in one directory per sample +- Contains the mapped BAM files and output metrics created by STARsolo + +**Output directory: `results/reference_genome`** + +- `star_index` + - Contains the index of the supplied genome fasta file + +## Salmon Alevin & AlevinQC + +**Output directory: `results/alevin`** + +- `alevin` + - Contains the created Salmon Alevin pseudo-aligned output +- `alevinqc` + - Contains the QC report for the aforementioned Salmon Alevin output data + +**Output directory: `results/reference_genome`** + +- `salmon_index` + - Contains the indexed reference transcriptome for Salmon Alevin +- `alevin/txp2gene.tsv` + - The transcriptome to gene mapping TSV file utilized by Salmon Alevin + +## Cellranger + +Cell Ranger is a set of analysis scripts that processes 10X Chromium single cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis. See [Cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) for more information on Cellranger. + +**Output directory: `results/cellranger`** + +- Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by Cellranger + +## Cellranger ARC + +Cell Ranger ARC is a set of analysis pipelines that process Chromium Single Cell Multiome ATAC + Gene Expression sequencing data to generate a variety of analyses pertaining to gene expression (GEX), chromatin accessibility, and their linkage. Furthermore, since the ATAC and GEX measurements are on the very same cell, we are able to perform analyses that link chromatin accessibility and GEX. See [Cellranger ARC](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc) for more information on Cellranger. + +**Output directory: `results/cellrangerarc`** + +- Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by Cellranger ARC + +## UniverSC + +UniverSC is a wrapper that calls an open-source implementation of Cell Ranger v3.0.2 and adjusts run parameters for compatibility with a wide ranger of technologies. +Custom inputs and at least 40 preset technologies are supported. UniverSC is developed independently from 10X Genomics and all software are not subject +to the 10X Genomics End User License Agreement which restricts usage on other platforms. Therefore in principle UniverSC can be run on any scRNA-Seq technology +without restrictions to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis. +See [UniverSC](https://github.com/minoda-lab/universc) for more information on UniverSC. + +UniverSC has been published in _Nature Communications_. + +Battenberg, K., Kelly, S.T., Ras, R.A., Hetherington, N.A., Hayashi, K., and Minoda, A. (2022) A flexible cross-platform single-cell data processing pipeline. Nat Commun 13(1): 1-7. https://doi.org/10.1038/s41467-022-34681-z + +**Output directory: `results/universc`** + +- Contains the mapped BAM files, filtered and unfiltered HDF5 matrices and output metrics created by the open-source implementation of Cell Ranger run via UniverSC + +## Other output data + +**Output directory: `results/reference_genome`** + +- `barcodes` + - Contains the utilized cell barcode whitelists (if applicable) +- `extract_transcriptome` + - When supplied with a `--fasta` genome fasta, this contains the extracted transcriptome + - The GTF file supplied with `--gtf` is used to extract the transcriptome positions appropriately + +**Output directory: `results/${params.aligner}/mtx_conversions`** + +- `*_matrix.h5ad` + - `.mtx` files converted to [AnnData](https://anndata.readthedocs.io/en/latest/) in `.h5ad` format, using [scanpy package](https://scanpy.readthedocs.io/en/stable/). + - One per sample and a single one with all samples concatenated together `combined_matrix.h5ad` -### MultiQC +## MultiQC
Output files diff --git a/docs/usage.md b/docs/usage.md index e9b83795..ee5d9812 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -1,16 +1,10 @@ # nf-core/scrnaseq: Usage -## :warning: Please read this documentation on the nf-core website: [https://nf-co.re/scrnaseq/usage](https://nf-co.re/scrnaseq/usage) - > _Documentation of pipeline parameters is generated automatically from the pipeline schema and can no longer be found in markdown files._ -## Introduction - - - ## Samplesheet input -You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row as shown in the examples below. +You will need to create a samplesheet with information about the samples you would like to analyse before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with at least 3 columns, and a header row as shown in the examples below. ```bash --input '[path to samplesheet file]' @@ -29,38 +23,150 @@ CONTROL_REP1,AEG588A1_S1_L004_R1_001.fastq.gz,AEG588A1_S1_L004_R2_001.fastq.gz ### Full samplesheet -The pipeline will auto-detect whether a sample is single- or paired-end using the information provided in the samplesheet. The samplesheet can have as many columns as you desire, however, there is a strict requirement for the first 3 columns to match those defined in the table below. +There is a strict requirement for the first 3 columns to match those defined in the table below. -A final samplesheet file consisting of both single- and paired-end data may look something like the one below. This is for 6 samples, where `TREATMENT_REP3` has been sequenced twice. +| Column | Description | +| ---------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `sample` | Required. Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | +| `fastq_1` | Required. Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `fastq_2` | Required. Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +| `expected_cells` | Optional. Number of cells expected for a sample. Must be an integer. If multiple rows are provided for the same sample, this must be the same number for all rows, i.e. the total number of expected cells for the sample. | +| `seq_center` | Optional. Sequencing center for the sample. If multiple rows are provided for the same sample, this must be the same string for all rows. Samples sequenced at different centers are considered different samples and must have different identifiers. Used for STARsolo BAM outputs only. Overrides `params.seq_center`. | + +An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. + +### Expected cells + +This parameter is currently supported by + +- [Salmon Alevin](https://salmon.readthedocs.io/en/latest/alevin.html#expectcells) +- [STARsolo](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md) +- [Cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) + +Note that since cellranger v7, it is **not recommended** anymore to supply the `--expected-cells` parameter. + +## Aligning options + +By default, the pipeline uses [Salmon Alevin](https://salmon.readthedocs.io/en/latest/alevin.html) (i.e. --aligner alevin) to perform pseudo-alignment of reads to the reference genome and to perform the downstream BAM-level quantification. Then QC reports are generated with AlevinQC. + +Other aligner options for running the pipeline are: + +- [Kallisto](https://pachterlab.github.io/kallisto/about) & [Bustools](https://bustools.github.io/), where kallisto is used for alignment and bustools is used for downstream analysis + - `--aligner kallisto` +- [STARsolo](https://github.com/alexdobin/STAR/blob/master/docs/STARsolo.md) to perform both alignment and downstream analysis. + - `--aligner star` +- [Cellranger](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger) to perform both alignment and downstream analysis. + - `--aligner cellranger` +- [UniverSC](https://github.com/minoda-lab/universc) to run an open-source version of Cell Ranger on any technology + - '--aligner universc` + +### If using cellranger or universc + +This pipeline automatically renames input FASTQ files to follow the +[naming convention by 10x](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input): -```csv title="samplesheet.csv" -sample,fastq_1,fastq_2 -CONTROL_REP1,AEG588A1_S1_L002_R1_001.fastq.gz,AEG588A1_S1_L002_R2_001.fastq.gz -CONTROL_REP2,AEG588A2_S2_L002_R1_001.fastq.gz,AEG588A2_S2_L002_R2_001.fastq.gz -CONTROL_REP3,AEG588A3_S3_L002_R1_001.fastq.gz,AEG588A3_S3_L002_R2_001.fastq.gz -TREATMENT_REP1,AEG588A4_S4_L003_R1_001.fastq.gz, -TREATMENT_REP2,AEG588A5_S5_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L003_R1_001.fastq.gz, -TREATMENT_REP3,AEG588A6_S6_L004_R1_001.fastq.gz, +``` +[Sample Name]_S1_L00[Lane Number]_[Read Type]_001.fastq.gz ``` -| Column | Description | -| --------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `sample` | Custom sample name. This entry will be identical for multiple sequencing libraries/runs from the same sample. Spaces in sample names are automatically converted to underscores (`_`). | -| `fastq_1` | Full path to FastQ file for Illumina short reads 1. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | -| `fastq_2` | Full path to FastQ file for Illumina short reads 2. File has to be gzipped and have the extension ".fastq.gz" or ".fq.gz". | +For more details, see -An [example samplesheet](../assets/samplesheet.csv) has been provided with the pipeline. +- [this issue](https://github.com/nf-core/scrnaseq/issues/241), discussing various mechanisms to deal with non-conformant filenames +- [the README of the cellranger/count module](https://github.com/nf-core/modules/blob/master/modules/nf-core/cellranger/count/README.md) which demonstrates that renaming files does not affect the results. +- [the code for renaming files in the cellranger/count module](https://github.com/nf-core/modules/blob/master/modules/nf-core/cellranger/count/templates/cellranger_count.py) +- [the code for renaming files in UniverSC](https://github.com/minoda-lab/universc/blob/99a20652430c1dc9f962536a2793536f643810b7/launch_universc.sh#L1411-L1609) + +As a sanity check, we verify that filenames of a pair of FASTQ files only differ by `R1`/`R2`. + +### Support for different scRNA-seq protocols + +The single-cell protocol used in the experiment can be specified using the `--protocol` flag. +For cellranger, it is recommended to stick with the default value `'auto'` for automatic detection of the protocol. +For all other aligner, you need to specify the protocol manually. + +The three 10x Genomics protocols 3' v1 (`10XV1`), 3' v2 (`10XV2`) and 3' v3 (`10XV3`) are universally supported +by all aligners in the pipeline and mapped to the correct options automatically. If the protocol is unknown to the +nf-core pipeline, the value specified to `--protocol` is passed to the aligner _in verbatim_ to support additional protocols. + +Here are some hints on running the various aligners with different protocols + +#### Kallisto/bustools + +The command `kb --list` shows all supported, preconfigured protocols. Additionally, a custom technology string such as +`0,0,16:0,16,26:1,0,0` can be speficied: + +> Additionally kallisto bus will accept a string specifying a new technology in the format of bc:umi:seq where each of bc,umi and seq are a triplet of integers separated by a comma, denoting the file index, start and stop of the sequence used. For example to specify the 10xV2 technology we would use 0,0,16:0,16,26:1,0,0 + +For more details, please refer to the [Kallisto/bustools documentation](https://pachterlab.github.io/kallisto/manual#bus). + +#### Alevin/fry + +Alevin/fry also supports custom chemistries in a slighly different format, e.g. `1{b[16]u[12]x:}2{r:}`. + +For more details, see the [simpleaf documentation](https://simpleaf.readthedocs.io/en/latest/quant-command.html#a-note-on-the-chemistry-flag) + +#### UniverSC + +See the [UniverSC GitHub page](https://github.com/minoda-lab/universc#pre-set-configurations) for all supported protocols. + +Currently only 3\' scRNA-Seq parameters are supported in nextflow, although chemistry parameters for 5\' scRNA-Seq and full-length scRNA-Seq libraries are supported by teh container. + +### If using cellranger-arc + +#### Automatic file name detection + +This pipeline currently **does not** automatically renames input FASTQ files to follow the +[naming convention by 10x](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input): + +``` +[Sample Name]_S1_L00[Lane Number]_[Read Type]_001.fastq.gz +``` + +Thus please make sure your files follow this naming convention. + +#### Sample sheet definition + +If you are using cellranger-arc you have to add the column _sample_type_ (atac for scATAC or gex for scRNA) and _fastq_barcode_ (part of the scATAC data) to your samplesheet as an input. + +**Beware of the following points:** + +- It is important that you give your scRNA and scATAC different [Sample Name]s. +- Check first which file is your barcode fastq file for your scATAC data ([see](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/using/fastq-input)). +- If you have more than one sequencing run then you have to give them another suffix (e.g., rep\*) to your [Sample Name] ([see](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/using/fastq-input#atac_quick_start)). + +An example samplesheet for a dataset called test_scARC that has two sequencing runs for the scATAC and one seqeuncing run +from two lanes for the scRNA could look like this: + +```csv +sample,fastq_1,fastq_2,fastq_barcode,sample_type +test_scARC,path/test_scARC_atac_rep1_S1_L001_R1_001.fastq.gz,path/test_scARC_atac_rep1_S1_L001_R2_001.fastq.gz,path/test_scARC_atac_rep1_S1_L001_I2_001.fastq.gz,atac +test_scARC,path/test_scARC_atac_rep2_S2_L001_R1_001.fastq.gz,path/test_scARC_atac_rep2_S2_L001_R2_001.fastq.gz,path/test_scARC_atac_rep2_S2_L001_I2_001.fastq.gz,atac +test_scARC,path/test_scARC_gex_S1_L001_R1_001.fastq.gz,path/test_scARC_gex_S1_L001_R2_001.fastq.gz,,gex +test_scARC,path/test_scARC_gex_S1_L002_R1_001.fastq.gz,path/test_scARC_gex_S1_L002_R2_001.fastq.gz,,gex +``` + +#### Config file and index + +Cellranger-arc needs a reference index directory that you can provide with `--cellranger_index`. Be aware, you can use +for cellranger-arc the same index you use for cellranger ([see](https://kb.10xgenomics.com/hc/en-us/articles/4408281606797-Are-the-references-interchangeable-between-pipelines)). +Yet, a cellranger-arc index might include additional data (e.g., TF binding motifs). Therefore, please first check if +you have to create a new cellranger-arc index ([see here](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/advanced/references) for +more information) + +If you decide to create a cellranger-arc index, then you need to create a config file to generate the index. The pipeline +can do this autmatically for you if you provide a `--fasta`, `--gtf`, and an optional `--motif` file. However, you can +also decide to provide your own config file with `--cellrangerarc_config`, then you also have to specify with `--cellrangerarc_reference` +the reference genome name that you have used and stated as _genome:_ in your config file. ## Running the pipeline -The typical command for running the pipeline is as follows: +The minimum typical command for running the pipeline is as follows: ```bash -nextflow run nf-core/scrnaseq --input ./samplesheet.csv --outdir ./results --genome GRCh37 -profile docker +nextflow run nf-core/scrnaseq --input ./samplesheet.csv --outdir ./results --genome GRCh38 -profile docker ``` -This will launch the pipeline with the `docker` configuration profile. See below for more information about profiles. +This will launch the pipeline with the `docker` configuration profile and default `--type` and `--barcode_whitelist`. See below for more information about profiles and these options. Note that the pipeline will create the following files in your working directory: @@ -108,7 +214,7 @@ nextflow pull nf-core/scrnaseq It is a good idea to specify a pipeline version when running the pipeline on your data. This ensures that a specific version of the pipeline code and software are used when you run your pipeline. If you keep using the same tag, you'll be running the same version of the pipeline, even if there have been changes to the code since. -First, go to the [nf-core/scrnaseq releases page](https://github.com/nf-core/scrnaseq/releases) and find the latest pipeline version - numeric only (eg. `1.3.1`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.3.1`. Of course, you can switch to another version by changing the number after the `-r` flag. +First, go to the [nf-core/scrnaseq releases page](https://github.com/nf-core/scrnaseq/releases) and find the latest version number - numeric only (eg. `1.0.0`). Then specify this when running the pipeline with `-r` (one hyphen) - eg. `-r 1.0.0`. This version number will be logged in reports when you run the pipeline, so that you'll know what you used when you look back in the future. For example, at the bottom of the MultiQC reports. @@ -126,6 +232,8 @@ These options are part of Nextflow and use a _single_ hyphen (pipeline parameter ### `-profile` +Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. Note that multiple profiles can be loaded, for example: `-profile docker` - the order of arguments is important! + Use this parameter to choose a configuration profile. Profiles can give configuration presets for different compute environments. Several generic profiles are bundled with the pipeline which instruct the pipeline to use software packaged using different methods (Docker, Singularity, Podman, Shifter, Charliecloud, Apptainer, Conda) - see below. diff --git a/main.nf b/main.nf index fe1b3bad..3471e5b0 100644 --- a/main.nf +++ b/main.nf @@ -29,10 +29,8 @@ include { getGenomeAttribute } from './subworkflows/local/utils_nfcore_scrn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ -// TODO nf-core: Remove this line if you don't need a FASTA file -// This is an example of how to use getGenomeAttribute() to fetch parameters -// from igenomes.config using `--genome` params.fasta = getGenomeAttribute('fasta') +params.gtf = getGenomeAttribute('gtf') /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/modules.json b/modules.json index 01d274c2..d81f322c 100644 --- a/modules.json +++ b/modules.json @@ -5,15 +5,117 @@ "https://github.com/nf-core/modules.git": { "modules": { "nf-core": { + "cellranger/count": { + "branch": "master", + "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", + "installed_by": [ + "modules" + ] + }, + "cellranger/mkgtf": { + "branch": "master", + "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", + "installed_by": [ + "modules" + ] + }, + "cellranger/mkref": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "cellrangerarc/count": { + "branch": "master", + "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", + "installed_by": [ + "modules" + ] + }, + "cellrangerarc/mkgtf": { + "branch": "master", + "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", + "installed_by": [ + "modules" + ] + }, + "cellrangerarc/mkref": { + "branch": "master", + "git_sha": "4196b1b2e7ce265892f3979eabf7a9ddc030702f", + "installed_by": [ + "modules" + ] + }, + "custom/dumpsoftwareversions": { + "branch": "master", + "git_sha": "8ec825f465b9c17f9d83000022995b4f7de6fe93", + "installed_by": [ + "modules" + ] + }, "fastqc": { "branch": "master", "git_sha": "f4ae1d942bd50c5c0b9bd2de1393ce38315ba57c", - "installed_by": ["modules"] + "installed_by": [ + "modules" + ] + }, + "gffread": { + "branch": "master", + "git_sha": "575e1bc54b083fb15e7dd8b5fcc40bea60e8ce83", + "installed_by": [ + "modules" + ] + }, + "gunzip": { + "branch": "master", + "git_sha": "3a5fef109d113b4997c9822198664ca5f2716208", + "installed_by": [ + "modules" + ] + }, + "kallistobustools/count": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "kallistobustools/ref": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] }, "multiqc": { "branch": "master", - "git_sha": "b7ebe95761cd389603f9cc0e0dc384c0f663815a", - "installed_by": ["modules"] + "git_sha": "9e71d8519dfbfc328c078bba14d4bd4c99e39a94", + "installed_by": [ + "modules" + ] + }, + "star/genomegenerate": { + "branch": "master", + "git_sha": "a21faa6a3481af92a343a10926f59c189a2c16c9", + "installed_by": [ + "modules" + ] + }, + "universc": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] + }, + "unzip": { + "branch": "master", + "git_sha": "3f5420aa22e00bd030a2556dfdffc9e164ec0ec5", + "installed_by": [ + "modules" + ] } } }, @@ -22,17 +124,23 @@ "utils_nextflow_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfcore_pipeline": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] }, "utils_nfvalidation_plugin": { "branch": "master", "git_sha": "5caf7640a9ef1d18d765d55339be751bb0969dfa", - "installed_by": ["subworkflows"] + "installed_by": [ + "subworkflows" + ] } } } diff --git a/modules/local/alevinqc.nf b/modules/local/alevinqc.nf new file mode 100644 index 00000000..9000d79e --- /dev/null +++ b/modules/local/alevinqc.nf @@ -0,0 +1,46 @@ +process ALEVINQC { + tag "$meta.id" + label 'process_low' + + //The alevinqc 1.14.0 container is broken, missing some libraries - thus reverting this to previous 1.12.1 version + conda "bioconda::bioconductor-alevinqc=1.12.1" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/bioconductor-alevinqc:1.12.1--r41h9f5acd7_0' : + 'biocontainers/bioconductor-alevinqc:1.12.1--r41h9f5acd7_0' }" + + input: + tuple val(meta), path(alevin_results) + + output: + tuple val(meta), path("alevin_report_${meta.id}.html"), emit: report + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + #!/usr/bin/env Rscript + require(alevinQC) + alevinFryQCReport( + mapDir = "${alevin_results}/af_map", + quantDir = "${alevin_results}/af_quant", + permitDir= "${alevin_results}/af_quant", + sampleId = "${prefix}", + outputFile = "alevin_report_${meta.id}.html", + outputFormat = "html_document", + outputDir = "./", + forceOverwrite = TRUE + ) + + yaml::write_yaml( + list( + '${task.process}'=list( + 'alevinqc' = paste(packageVersion('alevinQC'), collapse='.') + ) + ), + "versions.yml" + ) + """ +} diff --git a/modules/local/concat_h5ad.nf b/modules/local/concat_h5ad.nf new file mode 100644 index 00000000..96920f9e --- /dev/null +++ b/modules/local/concat_h5ad.nf @@ -0,0 +1,31 @@ +process CONCAT_H5AD { + label 'process_medium' + + conda "conda-forge::scanpy conda-forge::python-igraph conda-forge::leidenalg" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/scanpy:1.7.2--pyhdfd78af_0' : + 'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }" + + input: + path h5ad + path samplesheet + + output: + path "*.h5ad", emit: h5ad + + when: + task.ext.when == null || task.ext.when + + script: + """ + concat_h5ad.py \\ + --input $samplesheet \\ + --out combined_matrix.h5ad \\ + --suffix "_matrix.h5ad" + """ + + stub: + """ + touch combined_matrix.h5ad + """ +} diff --git a/modules/local/gene_map.nf b/modules/local/gene_map.nf new file mode 100644 index 00000000..9fd29e0a --- /dev/null +++ b/modules/local/gene_map.nf @@ -0,0 +1,34 @@ +/* + * Reformat design file and check validity + */ +process GENE_MAP { + tag "$gtf" + label 'process_low' + + conda "conda-forge::python=3.8.3" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.8.3' : + 'biocontainers/python:3.8.3' }" + + input: + path gtf + + output: + path "transcripts_to_genes.txt" , emit: gene_map + + when: + task.ext.when == null || task.ext.when + + script: + if("${gtf}".endsWith('.gz')){ + name = "${gtf.baseName}" + unzip = "gunzip -f ${gtf}" + } else { + unzip = "" + name = "${gtf}" + } + """ + $unzip + cat $name | t2g.py --use_version > transcripts_to_genes.txt + """ +} diff --git a/modules/local/gffread_transcriptome.nf b/modules/local/gffread_transcriptome.nf new file mode 100644 index 00000000..ab573b07 --- /dev/null +++ b/modules/local/gffread_transcriptome.nf @@ -0,0 +1,30 @@ +process GFFREAD_TRANSCRIPTOME { + tag "${genome_fasta}" + label 'process_low' + + conda "bioconda::gffread=0.12.7" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gffread:0.12.7--hd03093a_1' : + 'biocontainers/gffread:0.12.7--hd03093a_1' }" + + input: + path genome_fasta + path gtf + + output: + path "${genome_fasta}.transcriptome.fa", emit: transcriptome_extracted + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + """ + gffread -F $gtf -w "${genome_fasta}.transcriptome.fa" -g $genome_fasta + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffread: \$(gffread --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/local/gtf_gene_filter.nf b/modules/local/gtf_gene_filter.nf new file mode 100644 index 00000000..063bd228 --- /dev/null +++ b/modules/local/gtf_gene_filter.nf @@ -0,0 +1,32 @@ +process GTF_GENE_FILTER { + tag "$fasta" + label 'process_low' + + conda "conda-forge::python=3.9.5" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.9--1' : + 'biocontainers/python:3.9--1' }" + + input: + path fasta + path gtf + + output: + path "*.gtf" , emit: gtf + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: // filter_gtf_for_genes_in_genome.py is bundled with the pipeline, in nf-core/rnaseq/bin/ + """ + filter_gtf_for_genes_in_genome.py \\ + --gtf $gtf \\ + --fasta $fasta \\ + -o ${fasta.baseName}_genes.gtf + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //g') + END_VERSIONS + """ +} diff --git a/modules/local/mtx_to_h5ad.nf b/modules/local/mtx_to_h5ad.nf new file mode 100644 index 00000000..84d98608 --- /dev/null +++ b/modules/local/mtx_to_h5ad.nf @@ -0,0 +1,91 @@ +process MTX_TO_H5AD { + tag "$meta.id" + label 'process_medium' + + conda "conda-forge::scanpy conda-forge::python-igraph conda-forge::leidenalg" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/scanpy:1.7.2--pyhdfd78af_0' : + 'biocontainers/scanpy:1.7.2--pyhdfd78af_0' }" + + input: + // inputs from cellranger nf-core module does not come in a single sample dir + // for each sample, the sub-folders and files come directly in array. + tuple val(meta), path(inputs) + path txp2gene + path star_index + + output: + path "${meta.id}/*h5ad", emit: h5ad + path "${meta.id}/*", emit: counts + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // def file paths for aligners (except cellranger) + if (params.aligner == 'kallisto') { + mtx_matrix = "*count/counts_unfiltered/*.mtx" + barcodes_tsv = "*count/counts_unfiltered/*.barcodes.txt" + features_tsv = "*count/counts_unfiltered/*.genes.txt" + } else if (params.aligner == 'alevin') { + mtx_matrix = "*_alevin_results/af_quant/alevin/quants_mat.mtx" + barcodes_tsv = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt" + features_tsv = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt" + } else if (params.aligner == 'star') { + mtx_matrix = "*.Solo.out/Gene*/filtered/matrix.mtx.gz" + barcodes_tsv = "*.Solo.out/Gene*/filtered/barcodes.tsv.gz" + features_tsv = "*.Solo.out/Gene*/filtered/features.tsv.gz" + } + + // + // run script + // + if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) + """ + # convert file types + mtx_to_h5ad.py \\ + --aligner cellranger \\ + --input filtered_feature_bc_matrix.h5 \\ + --sample ${meta.id} \\ + --out ${meta.id}/${meta.id}_matrix.h5ad + """ + + else if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') + """ + # convert file types + for input_type in spliced unspliced ; do + mtx_to_h5ad.py \\ + --aligner ${params.aligner} \\ + --sample ${meta.id} \\ + --input *count/counts_unfiltered/\${input_type}.mtx \\ + --barcode *count/counts_unfiltered/\${input_type}.barcodes.txt \\ + --feature *count/counts_unfiltered/\${input_type}.genes.txt \\ + --txp2gene ${txp2gene} \\ + --star_index ${star_index} \\ + --out ${meta.id}/${meta.id}_\${input_type}_matrix.h5ad ; + done + """ + + else + """ + # convert file types + mtx_to_h5ad.py \\ + --task_process ${task.process} \\ + --aligner ${params.aligner} \\ + --sample ${meta.id} \\ + --input $mtx_matrix \\ + --barcode $barcodes_tsv \\ + --feature $features_tsv \\ + --txp2gene ${txp2gene} \\ + --star_index ${star_index} \\ + --out ${meta.id}/${meta.id}_matrix.h5ad + """ + + stub: + """ + mkdir ${meta.id} + touch ${meta.id}/${meta.id}_matrix.h5ad + touch versions.yml + """ +} diff --git a/modules/local/mtx_to_seurat.nf b/modules/local/mtx_to_seurat.nf new file mode 100644 index 00000000..d83575a4 --- /dev/null +++ b/modules/local/mtx_to_seurat.nf @@ -0,0 +1,72 @@ +process MTX_TO_SEURAT { + tag "$meta.id" + label 'process_medium' + + conda "r-seurat" + container "nf-core/seurat:4.3.0" + + input: + // inputs from cellranger nf-core module does not come in a single sample dir + // for each sample, the sub-folders and files come directly in array. + tuple val(meta), path(inputs) + + output: + path "${meta.id}/*.rds", emit: seuratObjects + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def aligner = params.aligner + if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) { + matrix = "matrix.mtx.gz" + barcodes = "barcodes.tsv.gz" + features = "features.tsv.gz" + } else if (params.aligner == "kallisto") { + matrix = "*count/counts_unfiltered/*.mtx" + barcodes = "*count/counts_unfiltered/*.barcodes.txt" + features = "*count/counts_unfiltered/*.genes.txt" + } else if (params.aligner == "alevin") { + matrix = "*_alevin_results/af_quant/alevin/quants_mat.mtx" + barcodes = "*_alevin_results/af_quant/alevin/quants_mat_rows.txt" + features = "*_alevin_results/af_quant/alevin/quants_mat_cols.txt" + } else if (params.aligner == 'star') { + matrix = "*.Solo.out/Gene*/filtered/matrix.mtx.gz" + barcodes = "*.Solo.out/Gene*/filtered/barcodes.tsv.gz" + features = "*.Solo.out/Gene*/filtered/features.tsv.gz" + } + """ + mkdir ${meta.id} + """ + + if (params.aligner == 'kallisto' && params.kb_workflow != 'standard') + """ + # convert file types + for input_type in spliced unspliced ; do + mtx_to_seurat.R \\ + *count/counts_unfiltered/\${input_type}.mtx \\ + *count/counts_unfiltered/\${input_type}.barcodes.txt \\ + *count/counts_unfiltered/\${input_type}.genes.txt \\ + ${meta.id}/${meta.id}_\${input_type}_matrix.rds \\ + ${aligner} + done + """ + + else + """ + mtx_to_seurat.R \\ + $matrix \\ + $barcodes \\ + $features \\ + ${meta.id}/${meta.id}_matrix.rds \\ + ${aligner} + """ + + stub: + """ + mkdir ${meta.id} + touch ${meta.id}/${meta.id}_matrix.rds + touch versions.yml + """ +} diff --git a/modules/local/simpleaf_index.nf b/modules/local/simpleaf_index.nf new file mode 100644 index 00000000..8e8bd519 --- /dev/null +++ b/modules/local/simpleaf_index.nf @@ -0,0 +1,50 @@ +process SIMPLEAF_INDEX { + tag "$transcript_gtf" + label "process_medium" + + conda 'bioconda::simpleaf=0.10.0-1' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/simpleaf:0.10.0--h9f5acd7_1' : + 'biocontainers/simpleaf:0.10.0--h9f5acd7_1' }" + + input: + path genome_fasta + path transcript_fasta + path transcript_gtf + + output: + path "salmon/index" , emit: index + path "salmon/ref/*_t2g_3col.tsv" , emit: transcript_tsv + path "versions.yml" , emit: versions + path "salmon" , emit: salmon + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def seq_inputs = (params.transcript_fasta) ? "--refseq $transcript_fasta" : "--gtf $transcript_gtf" + """ + # export required var + export ALEVIN_FRY_HOME=. + export NUMBA_CACHE_DIR=. + + # prep simpleaf + simpleaf set-paths + + # run simpleaf index + simpleaf \\ + index \\ + --threads $task.cpus \\ + --fasta $genome_fasta \\ + $seq_inputs \\ + $args \\ + -o salmon + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2) + salmon: \$(salmon --version | sed -e "s/salmon //g") + END_VERSIONS + """ +} diff --git a/modules/local/simpleaf_quant.nf b/modules/local/simpleaf_quant.nf new file mode 100644 index 00000000..abb58404 --- /dev/null +++ b/modules/local/simpleaf_quant.nf @@ -0,0 +1,84 @@ +process SIMPLEAF_QUANT { + tag "$meta.id" + label 'process_high' + + conda 'bioconda::simpleaf=0.10.0-1' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/simpleaf:0.10.0--h9f5acd7_1' : + 'biocontainers/simpleaf:0.10.0--h9f5acd7_1' }" + + input: + // + // Input reads are expected to come as: [ meta, [ pair1_read1, pair1_read2, pair2_read1, pair2_read2 ] ] + // Input array for a sample is created in the same order reads appear in samplesheet as pairs from replicates are appended to array. + // + tuple val(meta), path(reads) + path index + path txp2gene + val protocol + path whitelist + + output: + tuple val(meta), path("*_alevin_results"), emit: alevin_results + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args_list = args.tokenize() + def prefix = task.ext.prefix ?: "${meta.id}" + + // + // check if users are using one of the mutually excludable parameters: + // e.g -k,--knee | -e,--expect-cells | -f, --forced-cells + // + unzip_whitelist = "" + unfiltered_command = "" + save_whitelist = "" + if (!(args_list.any { it in ['-k', '--knee', '-e', '--expect-cells', '-f', '--forced-cells']} || meta.expected_cells)) { + if (whitelist) { + unzip_whitelist = "gzip -dcf $whitelist > whitelist.uncompressed.txt" + unfiltered_command = "-u whitelist.uncompressed.txt" + save_whitelist = "mv whitelist.uncompressed.txt ${prefix}_alevin_results/" + } + } + + // expected cells + def expect_cells = meta.expected_cells ? "--expect-cells $meta.expected_cells" : '' + + // separate forward from reverse pairs + def (forward, reverse) = reads.collate(2).transpose() + """ + # export required var + export ALEVIN_FRY_HOME=. + export NUMBA_CACHE_DIR=. + + # prep simpleaf + simpleaf set-paths + + # run simpleaf quant + $unzip_whitelist + simpleaf quant \\ + -1 ${forward.join( "," )} \\ + -2 ${reverse.join( "," )} \\ + -i ${index} \\ + -o ${prefix}_alevin_results \\ + -m $txp2gene \\ + -t $task.cpus \\ + -c "$protocol" \\ + $expect_cells \\ + $unfiltered_command \\ + $args + + $save_whitelist + [[ ! -f ${prefix}_alevin_results/af_quant/all_freq.bin ]] && cp ${prefix}_alevin_results/af_quant/permit_freq.bin ${prefix}_alevin_results/af_quant/all_freq.bin + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + simpleaf: \$(simpleaf -V | tr -d '\\n' | cut -d ' ' -f 2) + salmon: \$(salmon --version | sed -e "s/salmon //g") + END_VERSIONS + """ +} diff --git a/modules/local/star_align.nf b/modules/local/star_align.nf new file mode 100644 index 00000000..a7dfab3d --- /dev/null +++ b/modules/local/star_align.nf @@ -0,0 +1,91 @@ +process STAR_ALIGN { + tag "$meta.id" + label 'process_high' + + conda 'bioconda::star=2.7.10b' + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/star:2.7.10b--h9ee0642_0' : + 'biocontainers/star:2.7.10b--h9ee0642_0' }" + + input: + // + // Input reads are expected to come as: [ meta, [ pair1_read1, pair1_read2, pair2_read1, pair2_read2 ] ] + // Input array for a sample is created in the same order reads appear in samplesheet as pairs from replicates are appended to array. + // + tuple val(meta), path(reads) + tuple val(meta2), path(index) + path gtf + path whitelist + val protocol + val star_feature + val other_10x_parameters + + output: + tuple val(meta), path('*d.out.bam') , emit: bam + tuple val(meta), path('*.Solo.out') , emit: counts + tuple val(meta), path('*Log.final.out') , emit: log_final + tuple val(meta), path('*Log.out') , emit: log_out + tuple val(meta), path('*Log.progress.out'), emit: log_progress + path "versions.yml" , emit: versions + + tuple val(meta), path('*sortedByCoord.out.bam') , optional:true, emit: bam_sorted + tuple val(meta), path('*toTranscriptome.out.bam'), optional:true, emit: bam_transcript + tuple val(meta), path('*Aligned.unsort.out.bam') , optional:true, emit: bam_unsorted + tuple val(meta), path('*fastq.gz') , optional:true, emit: fastq + tuple val(meta), path('*.tab') , optional:true, emit: tab + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def ignore_gtf = params.star_ignore_sjdbgtf ? '' : "--sjdbGTFfile $gtf" + def seq_center = meta.seq_center ? "--outSAMattrRGline ID:$prefix 'CN:$meta.seq_center' 'SM:$prefix'" : "--outSAMattrRGline ID:$prefix 'SM:$prefix'" + def out_sam_type = (args.contains('--outSAMtype')) ? '' : '--outSAMtype BAM Unsorted' + def mv_unsorted_bam = (args.contains('--outSAMtype BAM Unsorted SortedByCoordinate')) ? "mv ${prefix}.Aligned.out.bam ${prefix}.Aligned.unsort.out.bam" : '' + // def read_pair = params.protocol.contains("chromium") ? "${reads[1]} ${reads[0]}" : "${reads[0]} ${reads[1]}" -- commented out to be removed is it is not being used + + // default values max percentile for UMI count 0.99 and max to min ratio for UMI count 10 taken from STARsolo usage + def cell_filter = meta.expected_cells ? "--soloCellFilter CellRanger2.2 $meta.expected_cells 0.99 10" : '' + + // separate forward from reverse pairs + def (forward, reverse) = reads.collate(2).transpose() + """ + STAR \\ + --genomeDir $index \\ + --readFilesIn ${reverse.join( "," )} ${forward.join( "," )} \\ + --runThreadN $task.cpus \\ + --outFileNamePrefix $prefix. \\ + --soloCBwhitelist <(gzip -cdf $whitelist) \\ + --soloType $protocol \\ + --soloFeatures $star_feature \\ + $other_10x_parameters \\ + $out_sam_type \\ + $ignore_gtf \\ + $seq_center \\ + $cell_filter \\ + $args \\ + + $mv_unsorted_bam + + if [ -f ${prefix}.Unmapped.out.mate1 ]; then + mv ${prefix}.Unmapped.out.mate1 ${prefix}.unmapped_1.fastq + gzip ${prefix}.unmapped_1.fastq + fi + if [ -f ${prefix}.Unmapped.out.mate2 ]; then + mv ${prefix}.Unmapped.out.mate2 ${prefix}.unmapped_2.fastq + gzip ${prefix}.unmapped_2.fastq + fi + + if [ -d ${prefix}.Solo.out ]; then + # Backslashes still need to be escaped (https://github.com/nextflow-io/nextflow/issues/67) + find ${prefix}.Solo.out \\( -name "*.tsv" -o -name "*.mtx" \\) -exec gzip {} \\; + fi + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellranger/count/README.md b/modules/nf-core/cellranger/count/README.md new file mode 100644 index 00000000..feafbfa9 --- /dev/null +++ b/modules/nf-core/cellranger/count/README.md @@ -0,0 +1,99 @@ +# Automatically renaming files for Cellranger + +See also https://github.com/nf-core/scrnaseq/issues/241 and the [Cellranger documentation](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/fastq-input). + +## Motivation + +Cellranger (and also Spaceranger, and probably other 10x pipelines) rely on the following input + +- `--fastqs`, to point to a directory with fastq files that are named according to `{sample_name}_S{i}_L00{j}_{R1,R2}_001.fastq.gz`. +- `--sample`, with a sample name (that is the prefix of all associated fastq files) + +In the easiest case, `--fastqs` points to a directory that contains all fastqs of a single sample that already follow the naming conventions. However, it's not always easy: + +1. the directory may contain fastqs for multiple samples + - this is not a problem for cellranger, it will automatically choose the correct fastq files via the `--sample` flag as long as they follow the naming convention, **but** + - if we stage the whole folder (or all files in that folder) using nextflow, it breaks caching in that if an additional sample (or any other file) gets added to the folder, the cache gets invalidated for _all_ samples. +2. the sample has been sequenced across multiple flow cells + - In this case, we need to specify multiple input folders. Cellranger allows passing a _list of directories_ e.g. `--fastqs=/path/dir1,path/dir2` + - Staging all _files_ in these folders into a single directory using nextflow doesn't do the job, as there may be duplicate file names across the different folders. +3. the raw sequencing data may have been downloaded from a sequence database and doesn't follow the naming convention anymore. + - In that case we need to rename the files to follow the bcl2fastq naming convention. + +## Solution + +Rename files automatically to match the `{sample_name}_S{i}_L00{j}_{R1,R2}_001.fastq.gz` pattern. We assume that +files in the input channel are ordered according to `file1 R1, file1 R2, file2 R1, file2 R2, ...`. +If the files are explicitly listed in a samplesheet, we have this order anyway. If the files follow any sensible naming +convention, this order can be achieved by sorting by filename. + +## Does renaming the files change the result? + +No. Here's how the same dataset was tested in four different scenarios. The result is the same down +to identical md5sums of the output files. + +### Prepare datasets + +1. Download dataset: + +``` +curl https://s3-us-west-2.amazonaws.com/10x.files/samples/cell-exp/6.1.2/10k_PBMC_3p_nextgem_Chromium_X_intron/10k_PBMC_3p_nextgem_Chromium_X_intron_fastqs.tar | tar -xv +INPUT_DIR=10k_PBMC_3p_nextgem_Chromium_X_fastqs +# remove index files +rm $INPUT_DIR/*_I{1,2}_*.fastq.gz +``` + +2. Simulate scenario where files were generated using multiple flow cells: + +``` +MFC=fastq_multiflowcell +mkdir -p $MFC/fc{1,2,3,4} +for f in $INPUT_DIR/*.fastq.gz; do ; ln $f $MFC; done +for i in $(seq 4) ; do ; mv $MFC/*L00$i* $MFC/fc$i ; done +for i in $(seq 4) ; do ; rename L00$i L001 $MFC/fc$i/*; done +``` + +3. Simulate scenario where sample was multiplexed: + +``` +MPX=fastq_multiplexed +mkdir $MPX +for f in $INPUT_DIR/*.fastq.gz; do ; ln $f $MPX; done +for i in $(seq 4) ; do ; rename S1_L00$i S${i}_L001 $MPX/* ; done +``` + +4. Concatenate files + +``` +mkdir fastq_cat +cat $INPUT_DIR/*_R1_*.fastq.gz > fastq_cat/10k_PBMC_3p_nextgem_Chromium_X_S1_L001_R1_001.fastq.gz +cat $INPUT_DIR/*_R2_*.fastq.gz > fastq_cat/10k_PBMC_3p_nextgem_Chromium_X_S1_L001_R2_001.fastq.gz +``` + +### Run cellranger + +``` +singularity pull docker://docker.io/nfcore/cellranger:7.1.0 +cd $INPUT_DIR && singularity exec -B $(pwd):$(pwd) -B /cfs:/cfs docker://docker.io/nfcore/cellranger:7.1.0 cellranger count --localcores=8 --id=10k_PBMC_3p_nextgem_Chromium_X --fastqs=. --transcriptome=/cfs/sturmgre/tmp/cellranger-ref/refdata-gex-GRCh38-2020-A --localmem=64 +cd $MFC && singularity exec -B $(pwd):$(pwd) -B /cfs:/cfs docker://docker.io/nfcore/cellranger:7.1.0 cellranger count --localcores=8 --id=10k_PBMC_3p_nextgem_Chromium_X --fastqs=fc1,fc2,fc3,fc4 --transcriptome=/cfs/sturmgre/tmp/cellranger-ref/refdata-gex-GRCh38-2020-A --localmem=64 +cd $MPX && singularity exec -B $(pwd):$(pwd) -B /cfs:/cfs docker://docker.io/nfcore/cellranger:7.1.0 cellranger count --localcores=8 --id=10k_PBMC_3p_nextgem_Chromium_X --fastqs=. --transcriptome=/cfs/sturmgre/tmp/cellranger-ref/refdata-gex-GRCh38-2020-A --localmem=64 +cd fastq_cat && singularity exec -B $(pwd):$(pwd) -B /cfs:/cfs docker://docker.io/nfcore/cellranger:7.1.0 cellranger count --localcores=8 --id=10k_PBMC_3p_nextgem_Chromium_X --fastqs=. --transcriptome=/cfs/sturmgre/tmp/cellranger-ref/refdata-gex-GRCh38-2020-A --localmem=64 +``` + +### Check results + +``` +> md5sum **/outs/*.h5 | sort +2a7a5a022f01cb8d95965980f0d95ca5 10k_PBMC_3p_nextgem_Chromium_X_fastqs/10k_PBMC_3p_nextgem_Chromium_X/outs/raw_feature_bc_matrix.h5 +2a7a5a022f01cb8d95965980f0d95ca5 fastq_cat/10k_PBMC_3p_nextgem_Chromium_X/outs/raw_feature_bc_matrix.h5 +2a7a5a022f01cb8d95965980f0d95ca5 fastq_multiflowcell/10k_PBMC_3p_nextgem_Chromium_X/outs/raw_feature_bc_matrix.h5 +2a7a5a022f01cb8d95965980f0d95ca5 fastq_multiplexed/10k_PBMC_3p_nextgem_Chromium_X/outs/raw_feature_bc_matrix.h5 +533f4156f2d90152594ebc587b7fde0a 10k_PBMC_3p_nextgem_Chromium_X_fastqs/10k_PBMC_3p_nextgem_Chromium_X/outs/filtered_feature_bc_matrix.h5 +533f4156f2d90152594ebc587b7fde0a fastq_cat/10k_PBMC_3p_nextgem_Chromium_X/outs/filtered_feature_bc_matrix.h5 +533f4156f2d90152594ebc587b7fde0a fastq_multiflowcell/10k_PBMC_3p_nextgem_Chromium_X/outs/filtered_feature_bc_matrix.h5 +533f4156f2d90152594ebc587b7fde0a fastq_multiplexed/10k_PBMC_3p_nextgem_Chromium_X/outs/filtered_feature_bc_matrix.h5 +cd33d3aa95c0d5cda7cf50908c5399c1 10k_PBMC_3p_nextgem_Chromium_X_fastqs/10k_PBMC_3p_nextgem_Chromium_X/outs/molecule_info.h5 +cd33d3aa95c0d5cda7cf50908c5399c1 fastq_cat/10k_PBMC_3p_nextgem_Chromium_X/outs/molecule_info.h5 +cd33d3aa95c0d5cda7cf50908c5399c1 fastq_multiflowcell/10k_PBMC_3p_nextgem_Chromium_X/outs/molecule_info.h5 +cd33d3aa95c0d5cda7cf50908c5399c1 fastq_multiplexed/10k_PBMC_3p_nextgem_Chromium_X/outs/molecule_info.h5 +``` diff --git a/modules/nf-core/cellranger/count/environment.yml b/modules/nf-core/cellranger/count/environment.yml new file mode 100644 index 00000000..662f747d --- /dev/null +++ b/modules/nf-core/cellranger/count/environment.yml @@ -0,0 +1,5 @@ +name: cellranger_count +channels: + - conda-forge + - bioconda + - defaults diff --git a/modules/nf-core/cellranger/count/main.nf b/modules/nf-core/cellranger/count/main.nf new file mode 100644 index 00000000..d7a191fc --- /dev/null +++ b/modules/nf-core/cellranger/count/main.nf @@ -0,0 +1,42 @@ +process CELLRANGER_COUNT { + tag "$meta.id" + label 'process_high' + + container "nf-core/cellranger:7.1.0" + + input: + tuple val(meta), path(reads, stageAs: "fastq_???/*") + path reference + + output: + tuple val(meta), path("**/outs/**"), emit: outs + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "CELLRANGER_COUNT module does not support Conda. Please use Docker / Singularity / Podman instead." + } + args = task.ext.args ?: '' + prefix = task.ext.prefix ?: "${meta.id}" + template "cellranger_count.py" + + stub: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "CELLRANGER_COUNT module does not support Conda. Please use Docker / Singularity / Podman instead." + } + def prefix = task.ext.prefix ?: "${meta.id}" + """ + mkdir -p "${prefix}/outs/" + touch ${prefix}/outs/fake_file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellranger: \$(echo \$( cellranger --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellranger/count/meta.yml b/modules/nf-core/cellranger/count/meta.yml new file mode 100644 index 00000000..c1f4259a --- /dev/null +++ b/modules/nf-core/cellranger/count/meta.yml @@ -0,0 +1,51 @@ +name: cellranger_count +description: Module to use Cell Ranger's pipelines analyze sequencing data produced from Chromium Single Cell Gene Expression. +keywords: + - align + - count + - reference +tools: + - cellranger: + description: Cell Ranger by 10x Genomics is a set of analysis pipelines that process Chromium single-cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis, and more. + homepage: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger + documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov + tool_dev_url: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov + licence: 10x Genomics EULA +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files. The order of the input files MUST be ["sample1 R1", "sample1 R2", "sample2, R1", + "sample2, R2", ...]. This can usually be achieved by sorting the input files by file name. + + Background: 10x data is always paired-end with R1 containing cell barcode and UMI + and R2 containing the actual read sequence. Cell Ranger requires files to adhere to the following file-name + convention: `${Sample_Name}_S1_L00${Lane_Number}_${R1,R2}_001.fastq.gz`. This module automatically + renames files to match this convention based on the order of input files to avoid various + issues (see https://github.com/nf-core/scrnaseq/issues/241). To avoid mistakes, the module + throws an error if a pair of R1 and R2 fastq files does not have the same filename except for the "_R1"/"_R2" part. + Renaming the files does not affect the results (see README.md for detailed tests). + pattern: "*{R1,R2}*.fastq.gz" + - reference: + type: directory + description: Folder containing all the reference indices needed by Cell Ranger +output: + - outs: + type: file + description: Files containing the outputs of Cell Ranger, see official 10X Genomics documentation for a complete list + pattern: "${meta.id}/outs/*" + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@ggabernet" + - "@edmundmiller" +maintainers: + - "@ggabernet" + - "@edmundmiller" diff --git a/modules/nf-core/cellranger/count/templates/cellranger_count.py b/modules/nf-core/cellranger/count/templates/cellranger_count.py new file mode 100644 index 00000000..1527ba76 --- /dev/null +++ b/modules/nf-core/cellranger/count/templates/cellranger_count.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python3 +""" +Automatically rename staged files for input into cellranger count. + +Copyright (c) Gregor Sturm 2023 - MIT License +""" +from subprocess import run +from pathlib import Path +from textwrap import dedent +import shlex +import re + + +def chunk_iter(seq, size): + """iterate over `seq` in chunks of `size`""" + return (seq[pos : pos + size] for pos in range(0, len(seq), size)) + + +sample_id = "${meta.id}" + +# get fastqs, ordered by path. Files are staged into +# - "fastq_001/{original_name.fastq.gz}" +# - "fastq_002/{oritinal_name.fastq.gz}" +# - ... +# Since we require fastq files in the input channel to be ordered such that a R1/R2 pair +# of files follows each other, ordering will get us a sequence of [R1, R2, R1, R2, ...] +fastqs = sorted(Path(".").glob("fastq_*/*")) +assert len(fastqs) % 2 == 0 + +# target directory in which the renamed fastqs will be placed +fastq_all = Path("./fastq_all") +fastq_all.mkdir(exist_ok=True) + +# Match R1 in the filename, but only if it is followed by a non-digit or non-character +# match "file_R1.fastq.gz", "file.R1_000.fastq.gz", etc. but +# do not match "SRR12345", "file_INFIXR12", etc +filename_pattern = r'([^a-zA-Z0-9])R1([^a-zA-Z0-9])' + +for i, (r1, r2) in enumerate(chunk_iter(fastqs, 2)): + # double escapes are required because nextflow processes this python 'template' + if re.sub(filename_pattern, r'\\1R2\\2', r1.name) != r2.name: + raise AssertionError( + dedent( + f"""\ + We expect R1 and R2 of the same sample to have the same filename except for R1/R2. + This has been checked by replacing "R1" with "R2" in the first filename and comparing it to the second filename. + If you believe this check shouldn't have failed on your filenames, please report an issue on GitHub! + + Files involved: + - {r1} + - {r2} + """ + ) + ) + r1.rename(fastq_all / f"{sample_id}_S1_L{i:03d}_R1_001.fastq.gz") + r2.rename(fastq_all / f"{sample_id}_S1_L{i:03d}_R2_001.fastq.gz") + +run( + # fmt: off + [ + "cellranger", "count", + "--id", "${prefix}", + "--fastqs", str(fastq_all), + "--transcriptome", "${reference.name}", + "--localcores", "${task.cpus}", + "--localmem", "${task.memory.toGiga()}", + *shlex.split("""${args}""") + ], + # fmt: on + check=True, +) + +# Output version information +version = run( + ["cellranger", "-V"], + text=True, + check=True, + capture_output=True, +).stdout.replace("cellranger cellranger-", "") + +# alas, no `pyyaml` pre-installed in the cellranger container +with open("versions.yml", "w") as f: + f.write('"${task.process}":\\n') + f.write(f' cellranger: "{version}"\\n') diff --git a/modules/nf-core/cellranger/mkgtf/environment.yml b/modules/nf-core/cellranger/mkgtf/environment.yml new file mode 100644 index 00000000..c81ef3e4 --- /dev/null +++ b/modules/nf-core/cellranger/mkgtf/environment.yml @@ -0,0 +1,5 @@ +name: cellranger_mkgtf +channels: + - conda-forge + - bioconda + - defaults diff --git a/modules/nf-core/cellranger/mkgtf/main.nf b/modules/nf-core/cellranger/mkgtf/main.nf new file mode 100644 index 00000000..e0b0dd67 --- /dev/null +++ b/modules/nf-core/cellranger/mkgtf/main.nf @@ -0,0 +1,36 @@ +process CELLRANGER_MKGTF { + tag "$gtf" + label 'process_low' + + container "nf-core/cellranger:7.1.0" + + input: + path gtf + + output: + path "*.gtf" , emit: gtf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "CELLRANGER_MKGTF module does not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${gtf.baseName}.filtered" + """ + cellranger \\ + mkgtf \\ + $gtf \\ + ${prefix}.gtf \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellranger: \$(echo \$( cellranger --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellranger/mkgtf/meta.yml b/modules/nf-core/cellranger/mkgtf/meta.yml new file mode 100644 index 00000000..a01e2830 --- /dev/null +++ b/modules/nf-core/cellranger/mkgtf/meta.yml @@ -0,0 +1,33 @@ +name: cellranger_mkgtf +description: Module to build a filtered GTF needed by the 10x Genomics Cell Ranger tool. Uses the cellranger mkgtf command. +keywords: + - reference + - mkref + - index +tools: + - cellranger: + description: Cell Ranger by 10x Genomics is a set of analysis pipelines that process Chromium single-cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis, and more. + homepage: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger + documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov + tool_dev_url: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov + licence: 10x Genomics EULA +input: + - gtf: + type: file + description: The reference GTF transcriptome file + pattern: "*.gtf" +output: + - gtf: + type: directory + description: The filtered GTF transcriptome file + pattern: "*.filtered.gtf" + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@ggabernet" + - "@edmundmiller" +maintainers: + - "@ggabernet" + - "@edmundmiller" diff --git a/modules/nf-core/cellranger/mkref/environment.yml b/modules/nf-core/cellranger/mkref/environment.yml new file mode 100644 index 00000000..9ca3e88c --- /dev/null +++ b/modules/nf-core/cellranger/mkref/environment.yml @@ -0,0 +1,5 @@ +name: cellranger_mkref +channels: + - conda-forge + - bioconda + - defaults diff --git a/modules/nf-core/cellranger/mkref/main.nf b/modules/nf-core/cellranger/mkref/main.nf new file mode 100644 index 00000000..986891b8 --- /dev/null +++ b/modules/nf-core/cellranger/mkref/main.nf @@ -0,0 +1,38 @@ +process CELLRANGER_MKREF { + tag "$fasta" + label 'process_high' + + container "nf-core/cellranger:7.1.0" + + input: + path fasta + path gtf + val reference_name + + output: + path "${reference_name}", emit: reference + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "CELLRANGER_MKREF module does not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + """ + cellranger \\ + mkref \\ + --genome=$reference_name \\ + --fasta=$fasta \\ + --genes=$gtf \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellranger: \$(echo \$( cellranger --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellranger/mkref/meta.yml b/modules/nf-core/cellranger/mkref/meta.yml new file mode 100644 index 00000000..4cd9091c --- /dev/null +++ b/modules/nf-core/cellranger/mkref/meta.yml @@ -0,0 +1,38 @@ +name: cellranger_mkref +description: Module to build the reference needed by the 10x Genomics Cell Ranger tool. Uses the cellranger mkref command. +keywords: + - reference + - mkref + - index +tools: + - cellranger: + description: Cell Ranger by 10x Genomics is a set of analysis pipelines that process Chromium single-cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis, and more. + homepage: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/what-is-cell-ranger + documentation: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov + tool_dev_url: https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/using/tutorial_ov + licence: 10x Genomics EULA +input: + - fasta: + type: file + description: Reference genome FASTA file + pattern: "*.{fasta,fa}" + - gtf: + type: file + description: Reference transcriptome GTF file + pattern: "*.gtf" + - reference_name: + type: string + description: The name to give the new reference folder + pattern: str +output: + - reference: + type: directory + description: Folder containing all the reference indices needed by Cell Ranger + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@ggabernet" +maintainers: + - "@ggabernet" diff --git a/modules/nf-core/cellrangerarc/Dockerfile b/modules/nf-core/cellrangerarc/Dockerfile new file mode 100644 index 00000000..812b64ba --- /dev/null +++ b/modules/nf-core/cellrangerarc/Dockerfile @@ -0,0 +1,28 @@ +# Dockerfile to create container with Cell Ranger v2.0.2 +# Push to quay.io/nf-core/cellranger-arc: + +FROM continuumio/miniconda3:4.8.2 +LABEL authors="Gisela Gabernet , Florian Heyl" \ + description="Docker image containing Cell Ranger Arc" +# Disclaimer: this container is not provided nor supported by Illumina or 10x Genomics. + +# Install procps and clean apt cache +RUN apt-get update --allow-releaseinfo-change \ + && apt-get install -y \ + cpio \ + procps \ + rpm2cpio \ + unzip \ + && apt-get clean -y && rm -rf /var/lib/apt/lists/* + +# Copy pre-downloaded cellranger-arc file +ENV CELLRANGER_ARC_VER=2.0.2 +COPY cellranger-arc-$CELLRANGER_ARC_VER.tar.gz /opt/cellranger-arc-$CELLRANGER_ARC_VER.tar.gz + +# Install cellranger-arc +RUN \ + cd /opt && \ + tar -xzvf cellranger-arc-$CELLRANGER_ARC_VER.tar.gz && \ + export PATH=/opt/cellranger-arc-$CELLRANGER_ARC_VER:$PATH && \ + ln -s /opt/cellranger-arc-$CELLRANGER_ARC_VER/cellranger-arc /usr/bin/cellranger-arc && \ + rm -rf /opt/cellranger-arc-$CELLRANGER_ARC_VER.tar.gz diff --git a/modules/nf-core/cellrangerarc/README.md b/modules/nf-core/cellrangerarc/README.md new file mode 100644 index 00000000..6089d994 --- /dev/null +++ b/modules/nf-core/cellrangerarc/README.md @@ -0,0 +1,23 @@ +# Updating the docker container and making a new module release + +Cell Ranger Arc is a commercial tool from 10X Genomics. The container provided for the cellranger-arc nf-core module is not provided nor supported by 10x Genomics. Updating the Cell Ranger Arc versions in the container and pushing the update to Dockerhub needs to be done manually. + +1. Navigate to the appropriate download page. - [Cell Ranger Arc](https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/installation): download the tar ball of the desired Cell Ranger Arc version with `curl` or `wget`. Place this file in the same folder where the Dockerfile lies. + +2. Edit the Dockerfile. Update the Cell Ranger Arc versions in this line: + +```bash +ENV CELLRANGER_ARC_VER= +``` + +3. Create and test the container: + +```bash +docker build . -t quay.io/nf-core/cellranger-arc: +``` + +4. Access rights are needed to push the container to the Dockerhub nfcore organization, please ask a core team member to do so. + +```bash +docker push quay.io/nf-core/cellranger-arc: +``` diff --git a/modules/nf-core/cellrangerarc/count/main.nf b/modules/nf-core/cellrangerarc/count/main.nf new file mode 100644 index 00000000..2bf0193a --- /dev/null +++ b/modules/nf-core/cellrangerarc/count/main.nf @@ -0,0 +1,84 @@ +process CELLRANGERARC_COUNT { + tag "$meta.id" + label 'process_high' + + container "nf-core/cellranger-arc:2.0.2" + + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "CELLRANGERARC_COUNT module does not support Conda. Please use Docker / Singularity / Podman instead." + } + + input: + tuple val(meta), val(sample_type), val(sub_sample), path(reads, stageAs: "fastqs/*") + path reference + + output: + tuple val(meta), path("${meta.id}/outs/*"), emit: outs + path("${meta.id}_lib.csv") , emit: lib + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def reference_name = reference.name + def sample_types = sample_type.join(",") + def sample_names = sub_sample.join(",") + def lib_csv = meta.id + "_lib.csv" + + """ + fastq_folder=\$(readlink -f fastqs) + + python3 < versions.yml + "${task.process}": + cellrangerarc: \$(echo \$( cellranger-arc --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ + + stub: + """ + mkdir -p "${meta.id}/outs/" + touch ${meta.id}/outs/fake_file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellrangerarc: \$(echo \$( cellranger-arc --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellrangerarc/count/meta.yml b/modules/nf-core/cellrangerarc/count/meta.yml new file mode 100644 index 00000000..8bc73c6f --- /dev/null +++ b/modules/nf-core/cellrangerarc/count/meta.yml @@ -0,0 +1,40 @@ +name: cellrangerarc_count +description: Module to use Cell Ranger's ARC pipelines analyze sequencing data produced from Chromium Single Cell ARC. Uses the cellranger-arc count command. +keywords: + - align + - count + - reference +tools: + - cellrangerarc: + description: Cell Ranger ARC is a set of analysis pipelines that process Chromium Single Cell ARC data. + homepage: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + documentation: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + tool_dev_url: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + licence: + - 10x Genomics EULA +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - lib_csv: + type: file + description: | + Path to a 3-column CSV file declaring FASTQ paths, sample names and library types of input ATAC and GEX FASTQs. + - reference: + type: directory + description: Directory containing all the reference indices needed by Cell Ranger ARC +output: + - outs: + type: file + description: Files containing the outputs of Cell Ranger ARC + pattern: "${meta.id}/outs/*" + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@ggabernet" + - "@edmundmiller" + - "@heylf" diff --git a/modules/nf-core/cellrangerarc/mkgtf/main.nf b/modules/nf-core/cellrangerarc/mkgtf/main.nf new file mode 100644 index 00000000..f304c6bc --- /dev/null +++ b/modules/nf-core/cellrangerarc/mkgtf/main.nf @@ -0,0 +1,36 @@ +process CELLRANGERARC_MKGTF { + tag "$gtf" + label 'process_low' + + container "nf-core/cellranger-arc:2.0.2" + + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "CELLRANGERARC_COUNT module does not support Conda. Please use Docker / Singularity / Podman instead." + } + + input: + path gtf + + output: + path "*.filtered.gtf", emit: gtf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + """ + cellranger-arc \\ + mkgtf \\ + $gtf \\ + ${gtf.baseName}.filtered.gtf \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellrangerarc: \$(echo \$( cellranger-arc --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellrangerarc/mkgtf/meta.yml b/modules/nf-core/cellrangerarc/mkgtf/meta.yml new file mode 100644 index 00000000..5fadc3ef --- /dev/null +++ b/modules/nf-core/cellrangerarc/mkgtf/meta.yml @@ -0,0 +1,32 @@ +name: cellrangerarc_mkgtf +description: Module to build a filtered gtf needed by the 10x Genomics Cell Ranger Arc tool. Uses the cellranger-arc mkgtf command. +keywords: + - reference + - mkref + - index +tools: + - cellrangerarc: + description: Cell Ranger Arc by 10x Genomics is a set of analysis pipelines that process Chromium single-cell data to align reads, generate feature-barcode matrices, perform clustering and other secondary analysis, and more. + homepage: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + documentation: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + tool_dev_url: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + licence: + - 10x Genomics EULA +input: + - gtf: + type: file + description: The reference GTF transcriptome file + pattern: "*.gtf" +output: + - gtf: + type: directory + description: The filtered GTF transcriptome file + pattern: "*.filtered.gtf" + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@ggabernet" + - "@edmundmiller" + - "@heylf" diff --git a/modules/nf-core/cellrangerarc/mkref/main.nf b/modules/nf-core/cellrangerarc/mkref/main.nf new file mode 100644 index 00000000..079776ba --- /dev/null +++ b/modules/nf-core/cellrangerarc/mkref/main.nf @@ -0,0 +1,83 @@ +process CELLRANGERARC_MKREF { + tag "$reference_name" + label 'process_medium' + + container "nf-core/cellranger-arc:2.0.2" + + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + exit 1, "CELLRANGERARC_COUNT module does not support Conda. Please use Docker / Singularity / Podman instead." + } + + input: + path fasta + path gtf + path motifs + path reference_config + val reference_name + + output: + path "${reference_name}", emit: reference + path "config" , emit: config + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def fast_name = fasta.name + def gtf_name = gtf.name + def motifs_name = motifs.name + def reference_config = reference_config.name + def args = task.ext.args ?: '' + + if ( !reference_name ){ + reference_name = "cellrangerarc_reference" + } + + """ + + python3 < versions.yml + "${task.process}": + cellrangerarc: \$(echo \$( cellranger-arc --version 2>&1) | sed 's/^.*[^0-9]\\([0-9]*\\.[0-9]*\\.[0-9]*\\).*\$/\\1/' ) + END_VERSIONS + """ +} diff --git a/modules/nf-core/cellrangerarc/mkref/meta.yml b/modules/nf-core/cellrangerarc/mkref/meta.yml new file mode 100644 index 00000000..cf98e60c --- /dev/null +++ b/modules/nf-core/cellrangerarc/mkref/meta.yml @@ -0,0 +1,46 @@ +name: cellrangerarc_mkref +description: Module to build the reference needed by the 10x Genomics Cell Ranger Arc tool. Uses the cellranger-arc mkref command. +keywords: + - reference + - mkref + - index +tools: + - cellrangerarc: + description: Cell Ranger Arc is a set of analysis pipelines that process Chromium Single Cell Arc data. + homepage: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + documentation: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + tool_dev_url: https://support.10xgenomics.com/single-cell-multiome-atac-gex/software/pipelines/latest/what-is-cell-ranger-arc + licence: + - 10x Genomics EULA +input: + - fasta: + type: file + description: Reference genome FASTA file + pattern: "*.{fasta,fa}" + - gtf: + type: file + description: Reference transcriptome GTF file + pattern: "*.gtf" + - motifs: + type: file + description: Sequence motif file (e.g., from transcription factors) + pattern: "*.txt" + - reference_config: + type: file + description: JSON-like file holding organism, genome, reference fasta path, reference annotation gtf path, contigs that should be excluded and sequence format motif file path + pattern: config + - reference_name: + type: string + description: The name to give the new reference folder + pattern: str +output: + - reference: + type: directory + description: Folder called like the reference_name containing all the reference indices needed by Cell Ranger Arc + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@ggabernet" + - "@heylf" diff --git a/modules/nf-core/gffread/environment.yml b/modules/nf-core/gffread/environment.yml new file mode 100644 index 00000000..5398f71c --- /dev/null +++ b/modules/nf-core/gffread/environment.yml @@ -0,0 +1,7 @@ +name: gffread +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::gffread=0.12.1 diff --git a/modules/nf-core/gffread/main.nf b/modules/nf-core/gffread/main.nf new file mode 100644 index 00000000..d8a473e0 --- /dev/null +++ b/modules/nf-core/gffread/main.nf @@ -0,0 +1,35 @@ +process GFFREAD { + tag "$gff" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/gffread:0.12.1--h8b12597_0' : + 'biocontainers/gffread:0.12.1--h8b12597_0' }" + + input: + path gff + + output: + path "*.gtf" , emit: gtf , optional: true + path "*.gff3" , emit: gffread_gff , optional: true + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${gff.baseName}" + def extension = args.contains("-T") ? 'gtf' : 'gffread.gff3' + """ + gffread \\ + $gff \\ + $args \\ + -o ${prefix}.${extension} + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gffread: \$(gffread --version 2>&1) + END_VERSIONS + """ +} diff --git a/modules/nf-core/gffread/meta.yml b/modules/nf-core/gffread/meta.yml new file mode 100644 index 00000000..d38cbcda --- /dev/null +++ b/modules/nf-core/gffread/meta.yml @@ -0,0 +1,36 @@ +name: gffread +description: Validate, filter, convert and perform various other operations on GFF files +keywords: + - gff + - conversion + - validation +tools: + - gffread: + description: GFF/GTF utility providing format conversions, region filtering, FASTA sequence extraction and more. + homepage: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + documentation: http://ccb.jhu.edu/software/stringtie/gff.shtml#gffread + tool_dev_url: https://github.com/gpertea/gffread + doi: 10.12688/f1000research.23297.1 + licence: ["MIT"] +input: + - gff: + type: file + description: A reference file in either the GFF3, GFF2 or GTF format. + pattern: "*.{gff, gtf}" +output: + - gtf: + type: file + description: GTF file resulting from the conversion of the GFF input file if '-T' argument is present + pattern: "*.{gtf}" + - gffread_gff: + type: file + description: GFF3 file resulting from the conversion of the GFF input file if '-T' argument is absent + pattern: "*.{gff3}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@edmundmiller" +maintainers: + - "@edmundmiller" diff --git a/modules/nf-core/gffread/tests/main.nf.test b/modules/nf-core/gffread/tests/main.nf.test new file mode 100644 index 00000000..bdbc96ae --- /dev/null +++ b/modules/nf-core/gffread/tests/main.nf.test @@ -0,0 +1,65 @@ +nextflow_process { + + name "Test Process GFFREAD" + script "../main.nf" + process "GFFREAD" + + tag "gffread" + tag "modules_nfcore" + tag "modules" + + test("sarscov2-gff3-gtf") { + + config "./nextflow.config" + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.gtf, + process.out.versions + ).match() }, + { assert process.out.gffread_gff == [] } + ) + } + + } + + test("sarscov2-gff3-gff3") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = file(params.modules_testdata_base_path + "genomics/sarscov2/genome/genome.gff3", checkIfExists: true) + """ + } + } + + then { + assertAll ( + { assert process.success }, + { assert snapshot( + process.out.gffread_gff, + process.out.versions + ).match() }, + { assert process.out.gtf == [] }, + ) + } + + } + +} diff --git a/modules/nf-core/gffread/tests/main.nf.test.snap b/modules/nf-core/gffread/tests/main.nf.test.snap new file mode 100644 index 00000000..00a11a40 --- /dev/null +++ b/modules/nf-core/gffread/tests/main.nf.test.snap @@ -0,0 +1,24 @@ +{ + "sarscov2-gff3-gtf": { + "content": [ + [ + "genome.gtf:md5,2394072d7d31530dfd590c4a117bf6e3" + ], + [ + "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + ] + ], + "timestamp": "2024-01-23T20:00:32.688779117" + }, + "sarscov2-gff3-gff3": { + "content": [ + [ + "genome.gffread.gff3:md5,a7d40d99dcddac23ac673c473279ea2d" + ], + [ + "versions.yml:md5,a71b6cdfa528dd206a238ec64bae13d6" + ] + ], + "timestamp": "2024-01-23T20:07:11.457356625" + } +} \ No newline at end of file diff --git a/modules/nf-core/gffread/tests/nextflow.config b/modules/nf-core/gffread/tests/nextflow.config new file mode 100644 index 00000000..74b25094 --- /dev/null +++ b/modules/nf-core/gffread/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: GFFREAD { + ext.args = '-T' + } +} diff --git a/modules/nf-core/gffread/tests/tags.yml b/modules/nf-core/gffread/tests/tags.yml new file mode 100644 index 00000000..05576065 --- /dev/null +++ b/modules/nf-core/gffread/tests/tags.yml @@ -0,0 +1,2 @@ +gffread: + - modules/nf-core/gffread/** diff --git a/modules/nf-core/gunzip/environment.yml b/modules/nf-core/gunzip/environment.yml new file mode 100644 index 00000000..25910b34 --- /dev/null +++ b/modules/nf-core/gunzip/environment.yml @@ -0,0 +1,7 @@ +name: gunzip +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::sed=4.7 diff --git a/modules/nf-core/gunzip/main.nf b/modules/nf-core/gunzip/main.nf new file mode 100644 index 00000000..468a6f28 --- /dev/null +++ b/modules/nf-core/gunzip/main.nf @@ -0,0 +1,48 @@ +process GUNZIP { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/ubuntu:20.04' : + 'nf-core/ubuntu:20.04' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("$gunzip"), emit: gunzip + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + gunzip = archive.toString() - '.gz' + """ + # Not calling gunzip itself because it creates files + # with the original group ownership rather than the + # default one for that user / the work directory + gzip \\ + -cd \\ + $args \\ + $archive \\ + > $gunzip + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ + + stub: + gunzip = archive.toString() - '.gz' + """ + touch $gunzip + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gunzip: \$(echo \$(gunzip --version 2>&1) | sed 's/^.*(gzip) //; s/ Copyright.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gunzip/meta.yml b/modules/nf-core/gunzip/meta.yml new file mode 100644 index 00000000..231034f2 --- /dev/null +++ b/modules/nf-core/gunzip/meta.yml @@ -0,0 +1,39 @@ +name: gunzip +description: Compresses and decompresses files. +keywords: + - gunzip + - compression + - decompression +tools: + - gunzip: + description: | + gzip is a file format and a software application used for file compression and decompression. + documentation: https://www.gnu.org/software/gzip/manual/gzip.html + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Optional groovy Map containing meta information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: File to be compressed/uncompressed + pattern: "*.*" +output: + - gunzip: + type: file + description: Compressed/uncompressed file + pattern: "*.*" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" +maintainers: + - "@joseespinosa" + - "@drpatelh" + - "@jfy133" diff --git a/modules/nf-core/gunzip/tests/main.nf.test b/modules/nf-core/gunzip/tests/main.nf.test new file mode 100644 index 00000000..6406008e --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test @@ -0,0 +1,36 @@ +nextflow_process { + + name "Test Process GUNZIP" + script "../main.nf" + process "GUNZIP" + tag "gunzip" + tag "modules_nfcore" + tag "modules" + + test("Should run without failures") { + + when { + params { + outdir = "$outputDir" + } + process { + """ + input[0] = Channel.of([ + [], + file(params.modules_testdata_base_path + 'genomics/sarscov2/illumina/fastq/test_1.fastq.gz', checkIfExists: true) + ] + ) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gunzip/tests/main.nf.test.snap b/modules/nf-core/gunzip/tests/main.nf.test.snap new file mode 100644 index 00000000..720fd9ff --- /dev/null +++ b/modules/nf-core/gunzip/tests/main.nf.test.snap @@ -0,0 +1,31 @@ +{ + "Should run without failures": { + "content": [ + { + "0": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "1": [ + "versions.yml:md5,54376d32aca20e937a4ec26dac228e84" + ], + "gunzip": [ + [ + [ + + ], + "test_1.fastq:md5,4161df271f9bfcd25d5845a1e220dbec" + ] + ], + "versions": [ + "versions.yml:md5,54376d32aca20e937a4ec26dac228e84" + ] + } + ], + "timestamp": "2023-10-17T15:35:37.690477896" + } +} \ No newline at end of file diff --git a/modules/nf-core/gunzip/tests/tags.yml b/modules/nf-core/gunzip/tests/tags.yml new file mode 100644 index 00000000..fd3f6915 --- /dev/null +++ b/modules/nf-core/gunzip/tests/tags.yml @@ -0,0 +1,2 @@ +gunzip: + - modules/nf-core/gunzip/** diff --git a/modules/nf-core/kallistobustools/count/environment.yml b/modules/nf-core/kallistobustools/count/environment.yml new file mode 100644 index 00000000..7ff8a2da --- /dev/null +++ b/modules/nf-core/kallistobustools/count/environment.yml @@ -0,0 +1,7 @@ +name: kallistobustools_count +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kb-python=0.27.2 diff --git a/modules/nf-core/kallistobustools/count/main.nf b/modules/nf-core/kallistobustools/count/main.nf new file mode 100644 index 00000000..036bb35d --- /dev/null +++ b/modules/nf-core/kallistobustools/count/main.nf @@ -0,0 +1,51 @@ +process KALLISTOBUSTOOLS_COUNT { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kb-python:0.27.2--pyhdfd78af_0' : + 'biocontainers/kb-python:0.27.2--pyhdfd78af_0' }" + + input: + tuple val(meta), path(reads) + path index + path t2g + path t1c + path t2c + val technology + + output: + tuple val(meta), path ("*.count"), emit: count + path "versions.yml" , emit: versions + path "*.count/*/*.mtx" , emit: matrix //Ensure that kallisto finished and produced outputs + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def cdna = t1c ? "-c1 $t1c" : '' + def introns = t2c ? "-c2 $t2c" : '' + def memory = task.memory.toGiga() - 1 + """ + kb \\ + count \\ + -t $task.cpus \\ + -i $index \\ + -g $t2g \\ + $cdna \\ + $introns \\ + -x $technology \\ + $args \\ + -o ${prefix}.count \\ + ${reads.join( " " )} \\ + -m ${memory}G + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallistobustools: \$(echo \$(kb --version 2>&1) | sed 's/^.*kb_python //;s/positional arguments.*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/kallistobustools/count/meta.yml b/modules/nf-core/kallistobustools/count/meta.yml new file mode 100644 index 00000000..7491248c --- /dev/null +++ b/modules/nf-core/kallistobustools/count/meta.yml @@ -0,0 +1,68 @@ +name: kallistobustools_count +description: quantifies scRNA-seq data from fastq files using kb-python. +keywords: + - scRNA-seq + - count + - single-cell + - kallisto + - bustools +tools: + - kb: + description: kallisto and bustools are wrapped in an easy-to-use program called kb + homepage: https://www.kallistobus.tools/ + documentation: https://kb-python.readthedocs.io/en/latest/index.html + tool_dev_url: https://github.com/pachterlab/kb_python + licence: MIT License +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: | + List of input FastQ files of size 1 and 2 for single-end and paired-end data, + respectively. + - index: + type: file + description: kb-ref index file (.idx) + pattern: "*.{idx}" + - t2g: + type: file + description: t2g file from kallisto + pattern: "*t2g.txt" + - t1c: + type: file + description: kb ref's c1 spliced_t2c file + pattern: "*.{cdna_t2c.txt}" + - t2c: + type: file + description: kb ref's c2 unspliced_t2c file + pattern: "*.{introns_t2c.txt}" + - workflow_mode: + type: string + description: String value defining workflow to use, can be one of "standard", "lamanno", "nucleus" + pattern: "{standard,lamanno,nucleus,kite}" + - technology: + type: string + description: String value defining the sequencing technology used. + pattern: "{10XV1,10XV2,10XV3,CELSEQ,CELSEQ2,DROPSEQ,INDROPSV1,INDROPSV2,INDROPSV3,SCRUBSEQ,SURECELL,SMARTSEQ}" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test'] + - count: + type: file + description: kb count output folder + pattern: "*.{count}" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@flowuenne" +maintainers: + - "@flowuenne" diff --git a/modules/nf-core/kallistobustools/ref/environment.yml b/modules/nf-core/kallistobustools/ref/environment.yml new file mode 100644 index 00000000..acbd0e0a --- /dev/null +++ b/modules/nf-core/kallistobustools/ref/environment.yml @@ -0,0 +1,8 @@ +name: kallistobustools_ref +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::kb-python=0.27.2 + - conda-forge::requests>=2.23.0 diff --git a/modules/nf-core/kallistobustools/ref/main.nf b/modules/nf-core/kallistobustools/ref/main.nf new file mode 100644 index 00000000..68d72ca9 --- /dev/null +++ b/modules/nf-core/kallistobustools/ref/main.nf @@ -0,0 +1,65 @@ +process KALLISTOBUSTOOLS_REF { + tag "$fasta" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/kb-python:0.27.2--pyhdfd78af_0' : + 'biocontainers/kb-python:0.27.2--pyhdfd78af_0' }" + + input: + path fasta + path gtf + val workflow_mode + + output: + path "versions.yml" , emit: versions + path "kb_ref_out.idx" , emit: index + path "t2g.txt" , emit: t2g + path "cdna.fa" , emit: cdna + path "intron.fa" , optional:true, emit: intron + path "cdna_t2c.txt" , optional:true, emit: cdna_t2c + path "intron_t2c.txt" , optional:true, emit: intron_t2c + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + if (workflow_mode == "standard") { + """ + kb \\ + ref \\ + -i kb_ref_out.idx \\ + -g t2g.txt \\ + -f1 cdna.fa \\ + --workflow $workflow_mode \\ + $fasta \\ + $gtf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallistobustools: \$(echo \$(kb --version 2>&1) | sed 's/^.*kb_python //;s/positional arguments.*\$//') + END_VERSIONS + """ + } else { + """ + kb \\ + ref \\ + -i kb_ref_out.idx \\ + -g t2g.txt \\ + -f1 cdna.fa \\ + -f2 intron.fa \\ + -c1 cdna_t2c.txt \\ + -c2 intron_t2c.txt \\ + --workflow $workflow_mode \\ + $fasta \\ + $gtf + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + kallistobustools: \$(echo \$(kb --version 2>&1) | sed 's/^.*kb_python //;s/positional arguments.*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/kallistobustools/ref/meta.yml b/modules/nf-core/kallistobustools/ref/meta.yml new file mode 100644 index 00000000..00be5143 --- /dev/null +++ b/modules/nf-core/kallistobustools/ref/meta.yml @@ -0,0 +1,63 @@ +name: kallistobustools_ref +description: index creation for kb count quantification of single-cell data. +keywords: + - scRNA-seq + - count + - single-cell + - kallisto + - bustools + - index +tools: + - kb: + description: kallisto|bustools (kb) is a tool developed for fast and efficient processing of single-cell OMICS data. + homepage: https://www.kallistobus.tools/ + documentation: https://kb-python.readthedocs.io/en/latest/index.html + tool_dev_url: https://github.com/pachterlab/kb_python + doi: "10.22002/D1.1876" + licence: MIT License +input: + - fasta: + type: file + description: Genomic DNA fasta file + pattern: "*.{fasta,fasta.gz}" + - gtf: + type: file + description: Genomic gtf file + pattern: "*.{gtf,gtf.gz}" + - workflow_mode: + type: string + description: String value defining workflow to use, can be one of "standard", "lamanno", "nucleus" + pattern: "{standard,lamanno,nucleus}" +output: + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" + - kb_ref_idx: + type: file + description: Index file from kb ref. + pattern: "*.{idx}" + - t2g: + type: file + description: Transcript to gene table + pattern: "*t2g.{txt}" + - cdna: + type: file + description: Cdna fasta file + pattern: "*cdna.{fa}" + - intron: + type: file + description: intron fasta file + pattern: "*intron.{fa}" + - cdna_t2c: + type: file + description: cdna transcript to capture file + pattern: "*cdna_t2c.{txt}" + - intron_t2c: + type: file + description: intron transcript to capture file + pattern: "*intron_t2c.{txt}" +authors: + - "@flowuenne" +maintainers: + - "@flowuenne" diff --git a/modules/nf-core/multiqc/tests/main.nf.test.snap b/modules/nf-core/multiqc/tests/main.nf.test.snap index bfebd802..549ba79c 100644 --- a/modules/nf-core/multiqc/tests/main.nf.test.snap +++ b/modules/nf-core/multiqc/tests/main.nf.test.snap @@ -2,14 +2,14 @@ "multiqc_versions_single": { "content": [ [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,14e9a2661241abd828f4f06a7b5c222d" ] ], "meta": { "nf-test": "0.8.4", "nextflow": "23.10.1" }, - "timestamp": "2024-02-29T08:48:55.657331" + "timestamp": "2024-01-31T17:43:40.529579" }, "multiqc_stub": { "content": [ @@ -17,25 +17,25 @@ "multiqc_report.html", "multiqc_data", "multiqc_plots", - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,14e9a2661241abd828f4f06a7b5c222d" ] ], "meta": { "nf-test": "0.8.4", "nextflow": "23.10.1" }, - "timestamp": "2024-02-29T08:49:49.071937" + "timestamp": "2024-01-31T17:45:09.605359" }, "multiqc_versions_config": { "content": [ [ - "versions.yml:md5,21f35ee29416b9b3073c28733efe4b7d" + "versions.yml:md5,14e9a2661241abd828f4f06a7b5c222d" ] ], "meta": { "nf-test": "0.8.4", "nextflow": "23.10.1" }, - "timestamp": "2024-02-29T08:49:25.457567" + "timestamp": "2024-01-31T17:44:53.535994" } } \ No newline at end of file diff --git a/modules/nf-core/star/genomegenerate/environment.yml b/modules/nf-core/star/genomegenerate/environment.yml new file mode 100644 index 00000000..791f255e --- /dev/null +++ b/modules/nf-core/star/genomegenerate/environment.yml @@ -0,0 +1,10 @@ +name: star_genomegenerate +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - bioconda::samtools=1.18 + - bioconda::htslib=1.18 + - bioconda::star=2.7.10a + - conda-forge::gawk=5.1.0 diff --git a/modules/nf-core/star/genomegenerate/main.nf b/modules/nf-core/star/genomegenerate/main.nf new file mode 100644 index 00000000..b8855715 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/main.nf @@ -0,0 +1,119 @@ +process STAR_GENOMEGENERATE { + tag "$fasta" + label 'process_high' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' : + 'biocontainers/mulled-v2-1fa26d1ce03c295fe2fdcf85831a92fbcbd7e8c2:ded3841da0194af2701c780e9b3d653a85d27489-0' }" + + input: + tuple val(meta), path(fasta) + tuple val(meta2), path(gtf) + + output: + tuple val(meta), path("star") , emit: index + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def args_list = args.tokenize() + def memory = task.memory ? "--limitGenomeGenerateRAM ${task.memory.toBytes() - 100000000}" : '' + def include_gtf = gtf ? "--sjdbGTFfile $gtf" : '' + if (args_list.contains('--genomeSAindexNbases')) { + """ + mkdir star + STAR \\ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ + $include_gtf \\ + --runThreadN $task.cpus \\ + $memory \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } else { + """ + samtools faidx $fasta + NUM_BASES=`gawk '{sum = sum + \$2}END{if ((log(sum)/log(2))/2 - 1 > 14) {printf "%.0f", 14} else {printf "%.0f", (log(sum)/log(2))/2 - 1}}' ${fasta}.fai` + + mkdir star + STAR \\ + --runMode genomeGenerate \\ + --genomeDir star/ \\ + --genomeFastaFiles $fasta \\ + $include_gtf \\ + --runThreadN $task.cpus \\ + --genomeSAindexNbases \$NUM_BASES \\ + $memory \\ + $args + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } + + stub: + if (gtf) { + """ + mkdir star + touch star/Genome + touch star/Log.out + touch star/SA + touch star/SAindex + touch star/chrLength.txt + touch star/chrName.txt + touch star/chrNameLength.txt + touch star/chrStart.txt + touch star/exonGeTrInfo.tab + touch star/exonInfo.tab + touch star/geneInfo.tab + touch star/genomeParameters.txt + touch star/sjdbInfo.txt + touch star/sjdbList.fromGTF.out.tab + touch star/sjdbList.out.tab + touch star/transcriptInfo.tab + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } else { + """ + mkdir star + touch star/Genome + touch star/Log.out + touch star/SA + touch star/SAindex + touch star/chrLength.txt + touch star/chrName.txt + touch star/chrNameLength.txt + touch star/chrStart.txt + touch star/genomeParameters.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + star: \$(STAR --version | sed -e "s/STAR_//g") + samtools: \$(echo \$(samtools --version 2>&1) | sed 's/^.*samtools //; s/Using.*\$//') + gawk: \$(echo \$(gawk --version 2>&1) | sed 's/^.*GNU Awk //; s/, .*\$//') + END_VERSIONS + """ + } +} diff --git a/modules/nf-core/star/genomegenerate/meta.yml b/modules/nf-core/star/genomegenerate/meta.yml new file mode 100644 index 00000000..1061e1b8 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/meta.yml @@ -0,0 +1,53 @@ +name: star_genomegenerate +description: Create index for STAR +keywords: + - index + - fasta + - genome + - reference +tools: + - star: + description: | + STAR is a software package for mapping DNA sequences against + a large reference genome, such as the human genome. + homepage: https://github.com/alexdobin/STAR + manual: https://github.com/alexdobin/STAR/blob/master/doc/STARmanual.pdf + doi: 10.1093/bioinformatics/bts635 + licence: ["MIT"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - fasta: + type: file + description: Fasta file of the reference genome + - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - gtf: + type: file + description: GTF file of the reference genome +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - index: + type: directory + description: Folder containing the star index files + pattern: "star" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@kevinmenden" + - "@drpatelh" +maintainers: + - "@kevinmenden" + - "@drpatelh" diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test b/modules/nf-core/star/genomegenerate/tests/main.nf.test new file mode 100644 index 00000000..c17c8ba4 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test @@ -0,0 +1,115 @@ +nextflow_process { + + name "Test Process STAR_GENOMEGENERATE" + script "../main.nf" + process "STAR_GENOMEGENERATE" + tag "modules" + tag "modules_nfcore" + tag "star" + tag "star/genomegenerate" + + test("fasta_gtf") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("fasta_gtf_index") }, + { assert snapshot(process.out.versions).match("fasta_gtf_versions") } + ) + } + } + + test("fasta_gtf_stub") { + + options '-stub' + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ + [ id:'test_gtf' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.gtf', checkIfExists: true) ] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("fasta_gtf_stub_index") }, + { assert snapshot(process.out.versions).match("fasta_gtf_stub_versions") } + ) + } + } + + test("fasta") { + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ [], [] ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("fasta_index") }, + { assert snapshot(process.out.versions).match("fasta_versions") } + ) + } + + } + + test("fasta_stub") { + + options '-stub' + + when { + process { + """ + input[0] = Channel.of([ + [ id:'test_fasta' ], + [ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta', checkIfExists: true) ] + ]) + input[1] = Channel.of([ [], [] ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(file(process.out.index[0][1]).listFiles().collect { it.getName() }.sort().toString()).match("fasta_stub_index") }, + { assert snapshot(process.out.versions).match("fasta_stub_versions") } + ) + } + + } + +} diff --git a/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap new file mode 100644 index 00000000..5653d6e6 --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/main.nf.test.snap @@ -0,0 +1,90 @@ +{ + "fasta_gtf_versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:31.798555" + }, + "fasta_stub_versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:55:07.521209" + }, + "fasta_gtf_stub_index": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, exonGeTrInfo.tab, exonInfo.tab, geneInfo.tab, genomeParameters.txt, sjdbInfo.txt, sjdbList.fromGTF.out.tab, sjdbList.out.tab, transcriptInfo.tab]" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:46.478098" + }, + "fasta_gtf_stub_versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:46.491657" + }, + "fasta_index": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, genomeParameters.txt]" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:57.552329" + }, + "fasta_versions": { + "content": [ + [ + "versions.yml:md5,46b8f1f34bb7f23892cd1eb249ed4d7f" + ] + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:57.560541" + }, + "fasta_gtf_index": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, exonGeTrInfo.tab, exonInfo.tab, geneInfo.tab, genomeParameters.txt, sjdbInfo.txt, sjdbList.fromGTF.out.tab, sjdbList.out.tab, transcriptInfo.tab]" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:54:31.786814" + }, + "fasta_stub_index": { + "content": [ + "[Genome, Log.out, SA, SAindex, chrLength.txt, chrName.txt, chrNameLength.txt, chrStart.txt, genomeParameters.txt]" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.04.3" + }, + "timestamp": "2024-02-01T15:55:07.517472" + } +} \ No newline at end of file diff --git a/modules/nf-core/star/genomegenerate/tests/tags.yml b/modules/nf-core/star/genomegenerate/tests/tags.yml new file mode 100644 index 00000000..79f619bf --- /dev/null +++ b/modules/nf-core/star/genomegenerate/tests/tags.yml @@ -0,0 +1,2 @@ +star/genomegenerate: + - modules/nf-core/star/genomegenerate/** diff --git a/modules/nf-core/universc/CITATION.cff b/modules/nf-core/universc/CITATION.cff new file mode 100644 index 00000000..b00957d1 --- /dev/null +++ b/modules/nf-core/universc/CITATION.cff @@ -0,0 +1,51 @@ +cff-version: 1.2.0 +message: "If you use this software, please cite it as below." +authors: + - given-names: "S. Thomas" + family-names: "Kelly" + email: "tom.kelly@riken.jp" + affiliation: "Center for Integrative Medical Sciences, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan" + orcid: "https://orcid.org/0000-0003-3904-6690" + - family-names: "Battenberg" + given-names: "Kai" + email: "kai.battenberg@riken.jp" + affiliation: "Center for Sustainable Resource Science, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan" + orcid: "http://orcid.org/0000-0001-7517-2657" +version: 1.2.5.1 +doi: 10.1101/2021.01.19.427209 +date-released: 2021-02-14 +url: "https://github.com/minoda-lab/universc" +preferred-citation: + type: article + authors: + - given-names: "S. Thomas" + family-names: "Kelly" + email: "tom.kelly@riken.jp" + affiliation: "Center for Integrative Medical Sciences, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan" + orcid: "https://orcid.org/0000-0003-3904-6690" + - family-names: "Battenberg" + given-names: "Kai" + email: "kai.battenberg@riken.jp" + affiliation: "Center for Sustainable Resource Science, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan" + orcid: "https://orcid.org/http://orcid.org/0000-0001-7517-2657" + - family-names: "Hetherington" + given-names: "Nicola A." + affiliation: "Center for Integrative Medical Sciences, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan" + orcid: "http://orcid.org/0000-0001-8802-2906" + - family-names: "Hayashi" + given-names: "Makoto" + affiliation: "Center for Sustainable Resource Science, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan" + orcid: "http://orcid.org/0000-0001-6389-4265" + - given-names: "Aki" + family-names: "Minoda" + email: "akiko.minoda@riken.jp" + affiliation: Center for Integrative Medical Sciences, RIKEN, Suehiro-cho-1-7-22, Tsurumi Ward, Yokohama, Japan" + orcid: "http://orcid.org/0000-0002-2927-5791" + doi: "10.1101/2021.01.19.427209" + title: "UniverSC: a flexible cross-platform single-cell data processing pipeline" + year: "2021" + journal: "bioRxiv" + start: 2021.01.19.427209 + volume: + issue: + month: 1 diff --git a/modules/nf-core/universc/CITATION.md b/modules/nf-core/universc/CITATION.md new file mode 100644 index 00000000..4f420bb8 --- /dev/null +++ b/modules/nf-core/universc/CITATION.md @@ -0,0 +1,37 @@ +### Citation + +A submission to a journal and biorXiv is in progress. Please cite these when +they are available. Currently, the package can be cited +as follows: + +Kelly, S.T., Battenberg, Hetherington, N.A., K., Hayashi, K., and Minoda, A. (2021) +UniverSC: a flexible cross-platform single-cell data processing pipeline. +bioRxiv 2021.01.19.427209; doi: [https://doi.org/10.1101/2021.01.19.427209](https://doi.org/10.1101/2021.01.19.427209) +package version 1.2.5.1. [https://github.com/minoda-lab/universc](https://github.com/minoda-lab/universc) + +``` +@article {Kelly2021.01.19.427209, + author = {Kelly, S. Thomas and Battenberg, Kai and Hetherington, Nicola A. and Hayashi, Makoto and Minoda, Aki}, + title = {{UniverSC}: a flexible cross-platform single-cell data processing pipeline}, + elocation-id = {2021.01.19.427209}, + year = {2021}, + doi = {10.1101/2021.01.19.427209}, + publisher = {Cold Spring Harbor Laboratory}, + abstract = {Single-cell RNA-sequencing analysis to quantify RNA molecules in individual cells has become popular owing to the large amount of information one can obtain from each experiment. We have developed UniverSC (https://github.com/minoda-lab/universc), a universal single-cell processing tool that supports any UMI-based platform. Our command-line tool enables consistent and comprehensive integration, comparison, and evaluation across data generated from a wide range of platforms.Competing Interest StatementThe authors have declared no competing interest.}, + eprint = {https://www.biorxiv.org/content/early/2021/01/19/2021.01.19.427209.full.pdf}, + journal = {{bioRxiv}}, + note = {package version 1.2.5.1}, + URL = {https://github.com/minoda-lab/universc}, +} + +``` + +``` +@Manual{, + title = {{UniverSC}: a flexible cross-platform single-cell data processing pipeline}, + author = {S. Thomas Kelly, Kai Battenberg, Nicola A. Hetherington, Makoto Hayashi, and Aki Minoda}, + year = {2021}, + note = {package version 1.2.5.1}, + url = {https://github.com/minoda-lab/universc}, + } +``` diff --git a/modules/nf-core/universc/README.md b/modules/nf-core/universc/README.md new file mode 100644 index 00000000..793f4f13 --- /dev/null +++ b/modules/nf-core/universc/README.md @@ -0,0 +1,116 @@ +# UniverSC + +## Single-cell processing across technologies + +UniverSC is an open-source single-cell pipeline that runs across platforms on various technologies. + +## Maintainers + +Tom Kelly (RIKEN, IMS) + +Kai Battenberg (RIKEN CSRS/IMS) + +Contact: .[at]riken.jp + +## Implementation + +This container runs Cell Ranger v3.0.2 installed from source on MIT License on GitHub with +modifications for compatibility with updated dependencies. All software is installed from +open-source repositories and available for reuse. + +It is _not_ subject to the 10X Genomics End User License Agreement (EULA). +This version allows running Cell Ranger v3.0.2 on data generated from any experimental platform +without restrictions. However, updating to newer versions on Cell Ranger subject to the +10X EULA is not possible without the agreement of 10X Genomics. + +To comply with licensing and respect 10X Genomics Trademarks, the 10X Genomics logo +has been removed from HTML reports, the tool has been renamed, and proprietary +closed-source tools to build Cloupe files are disabled. + +It is still suffient to generate summary reports and count matrices compatible with +single-cell analysis tools available for 10X Genomics and Cell Ranger output format +in Python and R packages. + +## Usage + +### Generating References + +The Cell Ranger modules can be used to generate reference indexes to run UniverSC. +Note that UniverSC requires the Open Source version v3.0.2 of Cell Ranger included +in the nf-core/universc Docker image. The same module parameters can be run provided +that the container is changed in process configurations (modify nextflow.config). + +``` +process { + +... + withName: CELLRANGER_MKGTF { + container = "nf-core/universc:1.2.5.1" + } + withName: CELLRANGER_MKREF{ + container = "nf-core/universc:1.2.5.1" + } +... +} +``` + +This will generate a compatible index for UniverSC using the same version of the +STAR aligner and a permissive software license without and EULA. + +### Container settings + +The cellranger install directory must have write permissions to run UniverSC. +To run in docker or podman use the `--user root` option in container parameters +and for singularity use the `--writeable` parameter. + +These are set as default in universc/main.nf: + +``` + container "nf-core/universc:1.2.5.1" + if (workflow.containerEngine == 'docker'){ + containerOptions = "--privileged" + } + if (workflow.containerEngine == 'podman'){ + containerOptions = "--runtime /usr/bin/crun --userns=keep-id --user root --systemd=always" + } + if (workflow.containerEngine == 'singularity'){ + containerOptions = "--writable" + } +``` + +Select the container engine with `nextflow --profile "docker"` or set the environment variable +as one of the following before running nextflow. + +``` +export PROFILE="docker" +export PROFILE="podman" +export PROFILE="singularity" +``` + +Note that due to dependencies installed in a docker image, it is not possible to use conda environments. + +## Disclaimer + +We are third party developers not affiliated with 10X Genomics or any other vendor of +single-cell technologies. We are releasing this code on an open-source license which calls Cell Ranger +as an external dependency. + +## Licensing + +This package is provided open-source on a GPL-3 license. This means that you are free to use and +modify this code provided that they also contain this license. + +## Updating the package + +The tomkellygenetics/universc: container is automatically updated with tomkellygenetics/universc:latest. + +A stable release is mirrored at nf-core/universc:1.2.5.1 and will be updated as needed. + +To build an updated container use the Dockerfile provided here: + +[https://github.com/minoda-lab/universc/blob/master/Dockerfile](https://github.com/minoda-lab/universc/blob/master/Dockerfile) + +Note that this uses a custom base image which is built with an open-source implementation of +Cell Ranger v3.0.2 on MIT License and relies of Python 2. The build file can be found here: + +[https://github.com/TomKellyGenetics/cellranger_clean/blob/master/Dockerfile](https://github.com/TomKellyGenetics/cellranger_clean/blob/master/Dockerfile) diff --git a/modules/nf-core/universc/environment.yml b/modules/nf-core/universc/environment.yml new file mode 100644 index 00000000..e9cdf650 --- /dev/null +++ b/modules/nf-core/universc/environment.yml @@ -0,0 +1,5 @@ +name: universc +channels: + - conda-forge + - bioconda + - defaults diff --git a/modules/nf-core/universc/main.nf b/modules/nf-core/universc/main.nf new file mode 100644 index 00000000..de8394a0 --- /dev/null +++ b/modules/nf-core/universc/main.nf @@ -0,0 +1,80 @@ +process UNIVERSC { + tag "$meta.id" + label 'process_medium' + + container "nf-core/universc:1.2.5.1" + if (workflow.containerEngine == 'docker'){ + containerOptions = "--privileged" + } + if ( workflow.containerEngine == 'podman'){ + containerOptions = "--runtime crun --userns=keep-id --systemd=always" + } + if (workflow.containerEngine == 'singularity'){ + containerOptions = "-B /var/tmp --writable-tmpfs" + params.singularity_autoMounts = true + } + + input: + tuple val(meta), path(reads) + path reference + + + output: + tuple val(meta), path("sample-${meta.id}/outs/*"), emit: outs + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "UNIVERSC module does not support Conda. Please use Docker / Singularity / Podman instead." + } + def args = task.ext.args ?: '' + def sample_arg = meta.samples.unique().join(",") + def reference_name = reference.name + def input_reads = meta.single_end ? "--file $reads" : "-R1 ${reads[0]} -R2 ${reads[1]}" + """ + universc \\ + --id 'sample-${meta.id}' \\ + ${input_reads} \\ + --technology '${meta.technology}' \\ + --chemistry '${meta.chemistry}' \\ + --reference ${reference_name} \\ + --description ${sample_arg} \\ + --jobmode "local" \\ + --localcores ${task.cpus} \\ + --localmem ${task.memory.toGiga()} \\ + --per-cell-data \\ + $args 1> _log 2> _err + + # save log files + echo !! > sample-${meta.id}/outs/_invocation + cp _log sample-${meta.id}/outs/_log + cp _err sample-${meta.id}/outs/_err + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellranger: \$(echo \$(cellranger count --version 2>&1 | head -n 2 | tail -n 1 | sed 's/^.* //g' | sed 's/(//g' | sed 's/)//g' )) + universc: \$(echo \$(bash /universc/launch_universc.sh --version | grep version | grep universc | sed 's/^.* //g' )) + END_VERSIONS + """ + + + stub: + // Exit if running this module with -profile conda / -profile mamba + if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) { + error "UNIVERSC module does not support Conda. Please use Docker / Singularity / Podman instead." + } + """ + mkdir -p "sample-${meta.id}/outs/" + touch sample-${meta.id}/outs/fake_file.txt + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cellranger: \$(echo \$(cellranger count --version 2>&1 | head -n 2 | tail -n 1 | sed 's/^.* //g' | sed 's/(//g' | sed 's/)//g' )) + universc: \$(echo \$(bash /universc/launch_universc.sh --version | grep version | grep universc | sed 's/^.* //g' )) + END_VERSIONS + """ +} diff --git a/modules/nf-core/universc/meta.yml b/modules/nf-core/universc/meta.yml new file mode 100644 index 00000000..92a46bc6 --- /dev/null +++ b/modules/nf-core/universc/meta.yml @@ -0,0 +1,42 @@ +name: "universc" +description: Module to run UniverSC an open-source pipeline to demultiplex and process single-cell RNA-Seq data +keywords: + - demultiplex + - align + - single-cell + - scRNA-Seq + - count + - umi +tools: + - "universc": + description: "UniverSC: a flexible cross-platform single-cell data processing pipeline" + homepage: "https://hub.docker.com/r/tomkellygenetics/universc" + documentation: "https://raw.githubusercontent.com/minoda-lab/universc/master/man/launch_universc.sh" + tool_dev_url: "https://github.com/minoda-lab/universc" + doi: "10.1101/2021.01.19.427209" + licence: ["GPL-3.0-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - reads: + type: file + description: FASTQ or FASTQ.GZ file, list of 2 files for paired-end data + pattern: "*.{fastq,fq,fastq.gz,fq.gz}" +output: + - outs: + type: file + description: Files containing the outputs of Cell Ranger + pattern: "sample-${meta.id}/outs/*" + - versions: + type: file + description: File containing software version + pattern: "versions.yml" +authors: + - "@kbattenb" + - "@tomkellygenetics" +maintainers: + - "@kbattenb" + - "@tomkellygenetics" diff --git a/modules/nf-core/unzip/environment.yml b/modules/nf-core/unzip/environment.yml new file mode 100644 index 00000000..d3a535f1 --- /dev/null +++ b/modules/nf-core/unzip/environment.yml @@ -0,0 +1,7 @@ +name: unzip +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - conda-forge::p7zip=16.02 diff --git a/modules/nf-core/unzip/main.nf b/modules/nf-core/unzip/main.nf new file mode 100644 index 00000000..08cfc3c4 --- /dev/null +++ b/modules/nf-core/unzip/main.nf @@ -0,0 +1,37 @@ +process UNZIP { + tag "$archive" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/p7zip:16.02' : + 'biocontainers/p7zip:16.02' }" + + input: + tuple val(meta), path(archive) + + output: + tuple val(meta), path("${prefix}/"), emit: unzipped_archive + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + if ( archive instanceof List && archive.name.size > 1 ) { error "[UNZIP] error: 7za only accepts a single archive as input. Please check module input." } + + prefix = task.ext.prefix ?: ( meta.id ? "${meta.id}" : archive.baseName) + """ + 7za \\ + x \\ + -o"${prefix}"/ \\ + $args \\ + $archive + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + 7za: \$(echo \$(7za --help) | sed 's/.*p7zip Version //; s/(.*//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/unzip/meta.yml b/modules/nf-core/unzip/meta.yml new file mode 100644 index 00000000..e8e377e2 --- /dev/null +++ b/modules/nf-core/unzip/meta.yml @@ -0,0 +1,42 @@ +name: unzip +description: Unzip ZIP archive files +keywords: + - unzip + - decompression + - zip + - archiving +tools: + - unzip: + description: p7zip is a quick port of 7z.exe and 7za.exe (command line version of 7zip, see www.7-zip.org) for Unix. + homepage: https://sourceforge.net/projects/p7zip/ + documentation: https://sourceforge.net/projects/p7zip/ + tool_dev_url: https://sourceforge.net/projects/p7zip" + licence: ["LGPL-2.1-or-later"] +input: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - archive: + type: file + description: ZIP file + pattern: "*.zip" +output: + - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - unzipped_archive: + type: directory + description: Directory contents of the unzipped archive + pattern: "${archive.baseName}/" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "@jfy133" +maintainers: + - "@jfy133" diff --git a/nextflow.config b/nextflow.config index 9508cefa..495a8414 100644 --- a/nextflow.config +++ b/nextflow.config @@ -9,14 +9,55 @@ // Global default params, used in configs params { - // TODO nf-core: Specify your pipeline's command line flags - // Input options - input = null + // generic options + aligner = 'alevin' + outdir = null + input = null + save_reference = false + protocol = 'auto' + + // reference files + genome = null + transcript_fasta = null + + // salmon alevin parameters (simpleaf) + simpleaf_rlen = 91 + barcode_whitelist = null + txp2gene = null + salmon_index = null + + // kallist bustools parameters + kallisto_gene_map = null + kallisto_index = null + kb_workflow = "standard" + + // STARsolo parameters + star_index = null + star_ignore_sjdbgtf = null + seq_center = null + star_feature = "Gene" + + // Cellranger parameters + cellranger_index = null + + // Cellranger ARC parameters + motifs = null + cellrangerarc_config = null + cellrangerarc_reference = null + + // UniverSC paramaters + universc_index = null + + // Template Boilerplate options + skip_multiqc = false + // References genome = null igenomes_base = 's3://ngi-igenomes/igenomes/' igenomes_ignore = false - fasta = null// MultiQC options + + // QC and MultiQC options + skip_fastqc = false multiqc_config = null multiqc_title = null multiqc_logo = null @@ -24,7 +65,6 @@ params { multiqc_methods_description = null // Boilerplate options - outdir = null publish_dir_mode = 'copy' email = null email_on_fail = null @@ -168,6 +208,7 @@ profiles { } test { includeConfig 'conf/test.config' } test_full { includeConfig 'conf/test_full.config' } + test_multiome { includeConfig 'conf/test_multiome.config' } } // Set default registry for Apptainer, Docker, Podman and Singularity independent of -profile @@ -232,7 +273,7 @@ manifest { mainScript = 'main.nf' nextflowVersion = '!>=23.04.0' version = '2.6.0dev' - doi = '' + doi = '10.5281/zenodo.3568187' } // Load modules.config for DSL2 module specific options diff --git a/nextflow_schema.json b/nextflow_schema.json index 233b47d4..65b4b695 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -20,8 +20,7 @@ "mimetype": "text/csv", "pattern": "^\\S+\\.csv$", "description": "Path to comma-separated file containing information about the samples in the experiment.", - "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 3 columns, and a header row. See [usage docs](https://nf-co.re/scrnaseq/usage#samplesheet-input).", - "fa_icon": "fas fa-file-csv" + "help_text": "You will need to create a design file with information about the samples in your experiment before running the pipeline. Use this parameter to specify its location. It has to be a comma-separated file with 4 columns, and a header row. See [usage docs](https://nf-co.re/rnaseq/usage#samplesheet-input)." }, "outdir": { "type": "string", @@ -43,6 +42,52 @@ } } }, + "mandatory_arguments": { + "title": "Mandatory arguments", + "type": "object", + "description": "", + "default": "", + "properties": { + "barcode_whitelist": { + "type": "string", + "description": "If not using the 10X Genomics platform, a custom barcode whitelist can be used with `--barcode_whitelist`.", + "fa_icon": "fas fa-barcode" + }, + "aligner": { + "type": "string", + "description": "Name of the tool to use for scRNA (pseudo-) alignment.", + "default": "alevin", + "help_text": "The workflow can handle three types of methods:\n\n- Kallisto/Bustools\n- Salmon Alevin + AlevinQC\n- STARsolo\n\nTo choose which one to use, please specify either `alevin`, `star` or `kallisto` as a parameter option for `--aligner`. By default, the pipeline runs the `alevin` option. Note that specifying another aligner option also requires choosing appropriate parameters (see below) for the selected option.", + "fa_icon": "fas fa-align-center", + "enum": ["kallisto", "star", "alevin", "cellranger", "cellrangerarc", "universc"] + }, + "protocol": { + "type": "string", + "description": "The protocol that was used to generate the single cell data, e.g. 10x Genomics v2 Chemistry.\n\n Can be 'auto' (cellranger only), '10XV1', '10XV2', '10XV3', or any other protocol string that will get directly passed the respective aligner.", + "help_text": "The default is to auto-detect the protocol when running cellranger. For all other aligners the protocol MUST be manually specified. \n\n The following protocols are recognized by the pipeline and mapped to the corresponding protocol name of the respective aligner: '10XV1', '10XV2', '10XV3'. \n\nAny other protocol value is passed to the aligner in verbatim to support other sequencing platforms. See the [kallisto](https://pachterlab.github.io/kallisto/manual#bus), [simpleaf](https://simpleaf.readthedocs.io/en/latest/quant-command.html#a-note-on-the-chemistry-flag), [starsolo](https://gensoft.pasteur.fr/docs/STAR/2.7.9a/STARsolo.html), and [universc](https://github.com/minoda-lab/universc#pre-set-configurations) documentations for more details.", + "default": "auto", + "fa_icon": "fas fa-cogs" + } + }, + "fa_icon": "fas fa-terminal" + }, + "skip_tools": { + "title": "Skip Tools", + "type": "object", + "description": "This section can be used to disable certain tools in the pipeline", + "default": "", + "fa_icon": "fas fa-forward", + "properties": { + "skip_multiqc": { + "type": "boolean", + "description": "Skip MultiQC Report" + }, + "skip_fastqc": { + "type": "boolean", + "description": "Skip FastQC" + } + } + }, "reference_genome_options": { "title": "Reference genome options", "type": "object", @@ -71,6 +116,154 @@ "fa_icon": "fas fa-ban", "hidden": true, "help_text": "Do not load `igenomes.config` when running the pipeline. You may choose this option if you observe clashes between custom parameters and those supplied in `igenomes.config`." + }, + "transcript_fasta": { + "type": "string", + "description": "A cDNA FASTA file", + "fa_icon": "fas fa-dna" + }, + "gtf": { + "type": "string", + "description": "Reference GTF annotation file", + "fa_icon": "fas fa-code-branch" + }, + "save_reference": { + "type": "boolean", + "description": "Specify this parameter to save the indices created (STAR, Kallisto, Salmon) to the results.", + "fa_icon": "fas fa-bookmark" + }, + "igenomes_base": { + "type": "string", + "format": "directory-path", + "description": "Directory / URL base for iGenomes references.", + "default": "s3://ngi-igenomes/igenomes/", + "fa_icon": "fas fa-cloud-download-alt", + "hidden": true + } + } + }, + "alevin_options": { + "title": "Alevin Options", + "type": "object", + "description": "", + "default": "", + "properties": { + "salmon_index": { + "type": "string", + "description": "This can be used to specify a precomputed Salmon index in the pipeline, in order to skip the generation of required indices by Salmon itself.", + "fa_icon": "fas fa-fish" + }, + "txp2gene": { + "type": "string", + "description": "Path to transcript to gene mapping file. This allows the specification of a transcript to gene mapping file for Salmon Alevin and AlevinQC.", + "help_text": "> This is not the same as the `kallisto_gene_map` parameter down below and is only used by the Salmon Alevin workflow.", + "fa_icon": "fas fa-map-marked-alt" + }, + "simpleaf_rlen": { + "type": "integer", + "default": 91, + "description": "It is the target read length the index will be built for, using simpleaf.", + "fa_icon": "fas fa-map-marked-alt" + } + } + }, + "starsolo_options": { + "title": "STARSolo Options", + "type": "object", + "description": "", + "default": "", + "properties": { + "star_index": { + "type": "string", + "description": "Specify a path to the precomputed STAR index.", + "help_text": "> NB: This has to be computed with STAR Version 2.7 or later, as STARsolo was only first supported by STAR Version 2.7.", + "fa_icon": "fas fa-asterisk" + }, + "star_ignore_sjdbgtf": { + "type": "string", + "description": "Ignore the SJDB GTF file." + }, + "seq_center": { + "type": "string", + "description": "Name of sequencing center for BAM read group tag." + }, + "star_feature": { + "type": "string", + "default": "Gene", + "enum": ["Gene", "GeneFull", "Gene Velocyto"], + "description": "Quantification type of different transcriptomic feature. Use `GeneFull` on pre-mRNA count for single-nucleus RNA-seq reads. Use `Gene Velocyto` to generate RNA velocity matrix.", + "fa_icon": "fas fa-asterisk" + } + }, + "fa_icon": "fas fa-star" + }, + "kallisto_bus_options": { + "title": "Kallisto/BUS Options", + "type": "object", + "description": "Params related to Kallisto/BUS tool", + "default": "", + "fa_icon": "fas fa-fish", + "properties": { + "kallisto_gene_map": { + "type": "string", + "description": "Specify a Kallisto gene mapping file here. If you don't, this will be automatically created in the Kallisto workflow when specifying a valid `--gtf` file.", + "fa_icon": "fas fa-fish" + }, + "kallisto_index": { + "type": "string", + "description": "Specify a path to the precomputed Kallisto index.", + "fa_icon": "fas fa-fish" + }, + "kb_workflow": { + "type": "string", + "default": "standard", + "description": "Type of workflow. Use `lamanno` for RNA velocity based on La Manno et al. 2018 logic. Use `nucleus` for RNA velocity on single-nucleus RNA-seq reads. Use `kite` for feature barcoding. Use `kite: 10xFB` for 10x Genomics Feature Barcoding technology. (default: standard)", + "fa_icon": "fas fa-fish", + "enum": ["standard", "lamanno", "nucleus", "kite", "kite: 10xFB"] + } + } + }, + "cellranger_options": { + "title": "Cellranger Options", + "type": "object", + "description": "Params related to the Cellranger pipeline", + "default": "", + "properties": { + "cellranger_index": { + "type": "string", + "description": "Specify a pre-calculated cellranger index. Readily prepared indexes can be obtained from the 10x Genomics website. " + } + } + }, + "cellrangerarc_options": { + "title": "Cellranger ARC Options", + "type": "object", + "description": "Params related to the Cellranger pipeline", + "default": "", + "properties": { + "motifs": { + "type": "string", + "description": "Specify a motif file to create a cellranger-arc index. Can be taken, e.g., from the JASPAR database." + }, + "cellrangerarc_config": { + "type": "string", + "description": "Specify a config file to create the cellranger-arc index." + }, + "cellrangerarc_reference": { + "type": "string", + "description": "Specify the genome reference name used in the config file to create a cellranger-arc index." + } + } + }, + "universc_options": { + "title": "UniverSC Options", + "type": "object", + "description": "Params related to the Cellranger pipeline", + "default": "", + "properties": { + "universc_index": { + "type": "string", + "description": "Specify a pre-calculated cellranger index. Readily prepared indexes can be obtained from the 10x Genomics website." } } }, @@ -273,9 +466,33 @@ { "$ref": "#/definitions/input_output_options" }, + { + "$ref": "#/definitions/mandatory_arguments" + }, + { + "$ref": "#/definitions/skip_tools" + }, { "$ref": "#/definitions/reference_genome_options" }, + { + "$ref": "#/definitions/alevin_options" + }, + { + "$ref": "#/definitions/starsolo_options" + }, + { + "$ref": "#/definitions/kallisto_bus_options" + }, + { + "$ref": "#/definitions/cellranger_options" + }, + { + "$ref": "#/definitions/cellrangerarc_options" + }, + { + "$ref": "#/definitions/universc_options" + }, { "$ref": "#/definitions/institutional_config_options" }, diff --git a/nf-test.config b/nf-test.config new file mode 100644 index 00000000..2fa82adf --- /dev/null +++ b/nf-test.config @@ -0,0 +1,8 @@ +config { + + testsDir "tests" + workDir ".nf-test" + configFile "tests/nextflow.config" + profile "docker" + +} diff --git a/subworkflows/local/alevin.nf b/subworkflows/local/alevin.nf new file mode 100644 index 00000000..764c08f8 --- /dev/null +++ b/subworkflows/local/alevin.nf @@ -0,0 +1,71 @@ +/* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ +include { GFFREAD_TRANSCRIPTOME } from '../../modules/local/gffread_transcriptome' +include { ALEVINQC } from '../../modules/local/alevinqc' +include { SIMPLEAF_INDEX } from '../../modules/local/simpleaf_index' +include { SIMPLEAF_QUANT } from '../../modules/local/simpleaf_quant' + +/* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ +include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { GFFREAD as GFFREAD_TXP2GENE } from '../../modules/nf-core/gffread/main' + +def multiqc_report = [] + +workflow SCRNASEQ_ALEVIN { + + take: + genome_fasta + gtf + transcript_fasta + salmon_index + txp2gene + barcode_whitelist + protocol + ch_fastq + + + main: + ch_versions = Channel.empty() + + assert (genome_fasta && gtf && salmon_index && txp2gene) || (genome_fasta && gtf) || (genome_fasta && gtf && transcript_fasta && txp2gene): + """Must provide a genome fasta file ('--fasta') and a gtf file ('--gtf'), or a genome fasta file + and a transcriptome fasta file ('--transcript_fasta`) if no index and txp2gene is given!""".stripIndent() + + /* + * Build salmon index + */ + if (!salmon_index) { + SIMPLEAF_INDEX( genome_fasta, transcript_fasta, gtf ) + salmon_index = SIMPLEAF_INDEX.out.index.collect() + transcript_tsv = SIMPLEAF_INDEX.out.transcript_tsv.collect() + ch_versions = ch_versions.mix(SIMPLEAF_INDEX.out.versions) + + if (!txp2gene) { + txp2gene = SIMPLEAF_INDEX.out.transcript_tsv + } + } + + + + /* + * Perform quantification with salmon alevin + */ + SIMPLEAF_QUANT ( + ch_fastq, + salmon_index, + txp2gene, + protocol, + barcode_whitelist + ) + ch_versions = ch_versions.mix(SIMPLEAF_QUANT.out.versions) + + /* + * Run alevinQC + */ + ALEVINQC( SIMPLEAF_QUANT.out.alevin_results ) + ch_versions = ch_versions.mix(ALEVINQC.out.versions) + + emit: + ch_versions + alevin_results = SIMPLEAF_QUANT.out.alevin_results + alevinqc = ALEVINQC.out.report +} diff --git a/subworkflows/local/align_cellranger.nf b/subworkflows/local/align_cellranger.nf new file mode 100644 index 00000000..bfdd533e --- /dev/null +++ b/subworkflows/local/align_cellranger.nf @@ -0,0 +1,47 @@ +/* + * Alignment with Cellranger + */ + +include {CELLRANGER_MKGTF} from "../../modules/nf-core/cellranger/mkgtf/main.nf" +include {CELLRANGER_MKREF} from "../../modules/nf-core/cellranger/mkref/main.nf" +include {CELLRANGER_COUNT} from "../../modules/nf-core/cellranger/count/main.nf" + +// Define workflow to subset and index a genome region fasta file +workflow CELLRANGER_ALIGN { + take: + fasta + gtf + cellranger_index + ch_fastq + protocol + + main: + ch_versions = Channel.empty() + + assert cellranger_index || (fasta && gtf): + "Must provide either a cellranger index or both a fasta file ('--fasta') and a gtf file ('--gtf')." + + if (!cellranger_index) { + // Filter GTF based on gene biotypes passed in params.modules + CELLRANGER_MKGTF( gtf ) + ch_versions = ch_versions.mix(CELLRANGER_MKGTF.out.versions) + + // Make reference genome + CELLRANGER_MKREF( fasta, CELLRANGER_MKGTF.out.gtf, "cellranger_reference" ) + ch_versions = ch_versions.mix(CELLRANGER_MKREF.out.versions) + cellranger_index = CELLRANGER_MKREF.out.reference + } + + // Obtain read counts + CELLRANGER_COUNT ( + // TODO what is `gem` and why is it needed? + ch_fastq.map{ meta, reads -> [meta + ["chemistry": protocol, "gem": meta.id, "samples": [meta.id]], reads] }, + cellranger_index + ) + ch_versions = ch_versions.mix(CELLRANGER_COUNT.out.versions) + + emit: + ch_versions + cellranger_out = CELLRANGER_COUNT.out.outs + star_index = cellranger_index +} diff --git a/subworkflows/local/align_cellrangerarc.nf b/subworkflows/local/align_cellrangerarc.nf new file mode 100644 index 00000000..6de84e7b --- /dev/null +++ b/subworkflows/local/align_cellrangerarc.nf @@ -0,0 +1,56 @@ +/* + * Alignment with Cellranger Arc + */ + +include {CELLRANGERARC_MKGTF} from "../../modules/nf-core/cellrangerarc/mkgtf/main.nf" +include {CELLRANGERARC_MKREF} from "../../modules/nf-core/cellrangerarc/mkref/main.nf" +include {CELLRANGERARC_COUNT} from "../../modules/nf-core/cellrangerarc/count/main.nf" + +// Define workflow to subset and index a genome region fasta file +workflow CELLRANGERARC_ALIGN { + take: + fasta + gtf + motifs + cellranger_index + ch_fastq + cellrangerarc_config + + main: + ch_versions = Channel.empty() + + assert cellranger_index || (fasta && gtf): + "Must provide either a cellranger index or a bundle of a fasta file ('--fasta') + gtf file ('--gtf')." + + if (!cellranger_index) { + // Filter GTF based on gene biotypes passed in params.modules + CELLRANGERARC_MKGTF( gtf ) + filtered_gtf = CELLRANGERARC_MKGTF.out.gtf + ch_versions = ch_versions.mix(CELLRANGERARC_MKGTF.out.versions) + + // Make reference genome + assert (( !params.cellrangerarc_reference && !cellrangerarc_config ) || + ( params.cellrangerarc_reference && cellrangerarc_config ) ) : + "If you provide a config file you also have to specific the reference name and vice versa." + + cellrangerarc_reference = 'cellrangerarc_reference' + if ( params.cellrangerarc_reference ){ + cellrangerarc_reference = params.cellrangerarc_reference + } + + CELLRANGERARC_MKREF( fasta, filtered_gtf, motifs, cellrangerarc_config, cellrangerarc_reference ) + ch_versions = ch_versions.mix(CELLRANGERARC_MKREF.out.versions) + cellranger_index = CELLRANGERARC_MKREF.out.reference + } + + // Obtain read counts + CELLRANGERARC_COUNT ( + ch_fastq, + cellranger_index + ) + ch_versions = ch_versions.mix(CELLRANGERARC_COUNT.out.versions) + + emit: + ch_versions + cellranger_arc_out = CELLRANGERARC_COUNT.out.outs +} diff --git a/subworkflows/local/align_universc.nf b/subworkflows/local/align_universc.nf new file mode 100644 index 00000000..cf16985e --- /dev/null +++ b/subworkflows/local/align_universc.nf @@ -0,0 +1,52 @@ +/* + * Alignment with Cellranger open-source implementation called by UniverSC + */ + +include {CELLRANGER_MKGTF} from "../../modules/nf-core/cellranger/mkgtf/main.nf" +include {CELLRANGER_MKREF} from "../../modules/nf-core/cellranger/mkref/main.nf" +include {UNIVERSC} from "../../modules/nf-core/universc/main.nf" + +// Define workflow to subset and index a genome region fasta file +workflow UNIVERSC_ALIGN { + take: + fasta + gtf + universc_index + universc_technology + ch_fastq + + main: + ch_versions = Channel.empty() + + assert universc_index || (fasta && gtf): + "Must provide either a cellranger index or both a fasta file ('--fasta') and a gtf file ('--gtf')." + + if (!universc_index) { + // Filter GTF based on gene biotypes passed in params.modules + CELLRANGER_MKGTF( gtf ) + ch_versions = ch_versions.mix(CELLRANGER_MKGTF.out.versions) + + // Make reference genome + CELLRANGER_MKREF( fasta, CELLRANGER_MKGTF.out.gtf, "cellranger_reference" ) + ch_versions = ch_versions.mix(CELLRANGER_MKREF.out.versions) + universc_index = CELLRANGER_MKREF.out.reference + } + + // Obtain read counts + UNIVERSC ( + ch_fastq.map{ + meta, reads -> [ + // defaults + ["samples": [meta.id], "technology": universc_technology, "chemistry": "auto", "single_end": false, "strandedness": "forward"] + meta, // + meta overrides defaults with information already in meta + reads + ] + }, + universc_index + ) + ch_versions = ch_versions.mix(UNIVERSC.out.versions) + + emit: + ch_versions + universc_out = UNIVERSC.out.outs + star_index = universc_index +} diff --git a/subworkflows/local/fastqc.nf b/subworkflows/local/fastqc.nf new file mode 100644 index 00000000..05bb128b --- /dev/null +++ b/subworkflows/local/fastqc.nf @@ -0,0 +1,38 @@ +// +// Check input samplesheet and get read channels +// +include { FASTQC } from '../../modules/nf-core/fastqc/main' + +workflow FASTQC_CHECK { + take: + ch_fastq + + main: + + def n = (params.aligner == 'cellrangerarc') ? 3 : 1 + ch_fastq.map { ch -> [ ch[0], ch[n] ] }.set { ch_fastq } + + /* + * FastQ QC using FASTQC + */ + FASTQC ( ch_fastq ) + fastqc_zip = FASTQC.out.zip + fastqc_html = FASTQC.out.html + + fastqc_zip + .map { it -> [ it[1] ] } + .set { fastqc_zip_only } + fastqc_html + .map { it -> [ it[1] ] } + .set { fastqc_html_only } + + fastqc_multiqc = Channel.empty() + fastqc_multiqc = fastqc_multiqc.mix( fastqc_zip_only, fastqc_html_only ) + fastqc_version = FASTQC.out.versions + + emit: + fastqc_zip + fastqc_html + fastqc_version + fastqc_multiqc +} diff --git a/subworkflows/local/kallisto_bustools.nf b/subworkflows/local/kallisto_bustools.nf new file mode 100644 index 00000000..3210e47a --- /dev/null +++ b/subworkflows/local/kallisto_bustools.nf @@ -0,0 +1,72 @@ +/* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ +include { GENE_MAP } from '../../modules/local/gene_map' +include {KALLISTOBUSTOOLS_COUNT } from '../../modules/nf-core/kallistobustools/count/main' + +/* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ +include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { KALLISTOBUSTOOLS_REF } from '../../modules/nf-core/kallistobustools/ref/main' + +def multiqc_report = [] + +workflow KALLISTO_BUSTOOLS { + take: + genome_fasta + gtf + kallisto_index + txp2gene + protocol + kb_workflow + ch_fastq + + main: + ch_versions = Channel.empty() + + assert kallisto_index || (genome_fasta && gtf): + "Must provide a genome fasta file ('--fasta') and a gtf file ('--gtf') if no index is given!" + + assert txp2gene || gtf: + "Must provide either a GTF file ('--gtf') or kallisto gene map ('--kallisto_gene_map') to align with kallisto bustools!" + + /* + * Generate Kallisto Gene Map if not supplied and index is given + * If no index is given, the gene map will be generated in the 'kb ref' step + */ + if (!txp2gene && kallisto_index) { + GENE_MAP( gtf ) + txp2gene = GENE_MAP.out.gene_map + ch_versions = ch_versions.mix(GENE_MAP.out.versions) + } + + /* + * Generate kallisto index + */ + if (!kallisto_index) { + KALLISTOBUSTOOLS_REF( genome_fasta, gtf, kb_workflow ) + txp2gene = KALLISTOBUSTOOLS_REF.out.t2g.collect() + kallisto_index = KALLISTOBUSTOOLS_REF.out.index.collect() + ch_versions = ch_versions.mix(KALLISTOBUSTOOLS_REF.out.versions) + t1c = KALLISTOBUSTOOLS_REF.out.cdna_t2c.ifEmpty{ [] } + t2c = KALLISTOBUSTOOLS_REF.out.intron_t2c.ifEmpty{ [] } + } + + /* + * Quantification with kallistobustools count + */ + KALLISTOBUSTOOLS_COUNT( + ch_fastq, + kallisto_index, + txp2gene, + t1c, + t2c, + protocol + ) + + ch_versions = ch_versions.mix(KALLISTOBUSTOOLS_COUNT.out.versions) + + emit: + ch_versions + counts = KALLISTOBUSTOOLS_COUNT.out.count + txp2gene = txp2gene.collect() + + +} diff --git a/subworkflows/local/mtx_conversion.nf b/subworkflows/local/mtx_conversion.nf new file mode 100644 index 00000000..958da400 --- /dev/null +++ b/subworkflows/local/mtx_conversion.nf @@ -0,0 +1,56 @@ +/* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ +include { MTX_TO_H5AD } from '../../modules/local/mtx_to_h5ad.nf' +include { CONCAT_H5AD } from '../../modules/local/concat_h5ad.nf' +include { MTX_TO_SEURAT } from '../../modules/local/mtx_to_seurat.nf' + +workflow MTX_CONVERSION { + + take: + mtx_matrices + samplesheet + txp2gene + star_index + + main: + ch_versions = Channel.empty() + + // Cellranger module output contains too many files which cause path collisions, we filter to the ones we need. + if (params.aligner in [ 'cellranger', 'cellrangerarc' ]) { + mtx_matrices = mtx_matrices.map { meta, mtx_files -> + [ meta, mtx_files.findAll { it.toString().contains("filtered_feature_bc_matrix") } ] + } + .filter { meta, mtx_files -> mtx_files } // Remove any that are missing the relevant files + } + + // + // Convert matrix to h5ad + // + MTX_TO_H5AD ( + mtx_matrices, + txp2gene, + star_index + ) + + // + // Concat sample-specific h5ad in one + // + CONCAT_H5AD ( + MTX_TO_H5AD.out.h5ad.collect(), // gather all sample-specific files + samplesheet + ) + + // + // Convert matrix do seurat + // + MTX_TO_SEURAT ( + mtx_matrices + ) + + //TODO CONCAT h5ad and MTX to h5ad should also have versions.yaml output + ch_versions = ch_versions.mix(MTX_TO_H5AD.out.versions, MTX_TO_SEURAT.out.versions) + + emit: + ch_versions + counts = MTX_TO_H5AD.out.counts + +} diff --git a/subworkflows/local/starsolo.nf b/subworkflows/local/starsolo.nf new file mode 100644 index 00000000..47b2e757 --- /dev/null +++ b/subworkflows/local/starsolo.nf @@ -0,0 +1,64 @@ +/* -- IMPORT LOCAL MODULES/SUBWORKFLOWS -- */ +include { STAR_ALIGN } from '../../modules/local/star_align' + +/* -- IMPORT NF-CORE MODULES/SUBWORKFLOWS -- */ +include { GUNZIP } from '../../modules/nf-core/gunzip/main' +include { STAR_GENOMEGENERATE } from '../../modules/nf-core/star/genomegenerate/main' + + +def multiqc_report = [] + +workflow STARSOLO { + take: + genome_fasta + gtf + star_index + protocol + barcode_whitelist + ch_fastq + star_feature + other_10x_parameters + + main: + ch_versions = Channel.empty() + + assert star_index || (genome_fasta && gtf): + "Must provide a genome fasta file ('--fasta') and a gtf file ('--gtf') if no index is given!" + + assert gtf: "Must provide a gtf file ('--gtf') for STARSOLO" + + /* + * Build STAR index if not supplied + */ + if (!star_index) { + STAR_GENOMEGENERATE( + genome_fasta.map{ f -> [[id: f.baseName], f]}, + gtf.map{ g -> [[id: g.baseName], g]} + ) + star_index = STAR_GENOMEGENERATE.out.index.collect() + ch_versions = ch_versions.mix(STAR_GENOMEGENERATE.out.versions) + } + + /* + * Perform mapping with STAR + */ + STAR_ALIGN( + ch_fastq, + star_index, + gtf, + barcode_whitelist, + protocol, + star_feature, + other_10x_parameters + ) + ch_versions = ch_versions.mix(STAR_ALIGN.out.versions) + + + emit: + ch_versions + // get rid of meta for star index + star_index = star_index.map{ meta, index -> index} + star_result = STAR_ALIGN.out.tab + star_counts = STAR_ALIGN.out.counts + for_multiqc = STAR_ALIGN.out.log_final +} diff --git a/tests/main_pipeline_alevin.test b/tests/main_pipeline_alevin.test new file mode 100644 index 00000000..303a625b --- /dev/null +++ b/tests/main_pipeline_alevin.test @@ -0,0 +1,67 @@ +nextflow_pipeline { + + name "Test Workflow main.nf" + script "main.nf" + + test("test-dataset_alevin_aligner") { + + when { + // the rest is taken from shared config + params { + aligner = 'alevin' + outdir = "${outputDir}/results_alevin" + + // Limit resources so that this can run on GitHub Actions -- for some reason it had not been taken from shared config + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + } + } + + then { + + assertAll( + + // + // General assertions + // + + // Did it finish successfully? + {assert workflow.success}, + + // How many tasks were executed? + {assert workflow.trace.tasks().size() == 16}, + + // How many results were produced? + {assert path("${outputDir}/results_alevin").list().size() == 5}, + {assert path("${outputDir}/results_alevin/alevin").list().size() == 4}, + {assert path("${outputDir}/results_alevin/alevin/mtx_conversions").list().size() == 4}, + {assert path("${outputDir}/results_alevin/alevinqc").list().size() == 2}, + {assert path("${outputDir}/results_alevin/fastqc").list().size() == 12}, + {assert path("${outputDir}/results_alevin/multiqc").list().size() == 3}, + + // + // Check if files were produced + // + {assert new File( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_alevin/alevin/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + + // + // Check if files are the same + // + {assert snapshot( + workflow, + path( "${outputDir}/results_alevin/alevin/Sample_X_alevin_results/af_quant/alevin/quants_mat_cols.txt" ), + path( "${outputDir}/results_alevin/alevin/Sample_X_alevin_results/af_quant/alevin/quants_mat.mtx" ), + path( "${outputDir}/results_alevin/alevin/Sample_X_alevin_results/af_quant/alevin/quants_mat_rows.txt" ), + path( "${outputDir}/results_alevin/alevin/Sample_Y_alevin_results/af_quant/alevin/quants_mat_cols.txt" ), + path( "${outputDir}/results_alevin/alevin/Sample_Y_alevin_results/af_quant/alevin/quants_mat.mtx" ), + path( "${outputDir}/results_alevin/alevin/Sample_Y_alevin_results/af_quant/alevin/quants_mat_rows.txt" ) + ).match()} + + ) // end of assertAll() + + } + } + +} diff --git a/tests/main_pipeline_alevin.test.snap b/tests/main_pipeline_alevin.test.snap new file mode 100644 index 00000000..9d194012 --- /dev/null +++ b/tests/main_pipeline_alevin.test.snap @@ -0,0 +1,32 @@ +{ + "test-dataset_alevin_aligner": { + "content": [ + { + "stderr": [ + + ], + "errorReport": "", + "exitStatus": 0, + "failed": false, + "stdout": [ + + ], + "errorMessage": "", + "trace": { + "tasksFailed": 0, + "tasksCount": 16, + "tasksSucceeded": 16 + }, + "name": "workflow", + "success": true + }, + "quants_mat_cols.txt:md5,e9868982c17a330392e38c2a5933cf97", + "quants_mat.mtx:md5,b8aa7b3c488fd8923de50a3621d4991f", + "quants_mat_rows.txt:md5,6227df5a13127b71c71fb18cd8574857", + "quants_mat_cols.txt:md5,e9868982c17a330392e38c2a5933cf97", + "quants_mat.mtx:md5,54cd12666016adce94c025b2e07f4b02", + "quants_mat_rows.txt:md5,6b458a7777260ba90eccbe7919df934b" + ], + "timestamp": "2024-01-19T10:28:35.652763852" + } +} \ No newline at end of file diff --git a/tests/main_pipeline_cellranger.test b/tests/main_pipeline_cellranger.test new file mode 100644 index 00000000..996c0f76 --- /dev/null +++ b/tests/main_pipeline_cellranger.test @@ -0,0 +1,75 @@ +nextflow_pipeline { + + name "Test Workflow main.nf" + script "main.nf" + + test("test-dataset_cellranger_aligner") { + + when { + // the rest is taken from shared config + params { + aligner = 'cellranger' + outdir = "${outputDir}/results_cellranger" + + // Limit resources so that this can run on GitHub Actions -- for some reason it had not been taken from shared config + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + } + } + + then { + + assertAll( + + // + // General assertions + // + + // Did it finish successfully? + {assert workflow.success}, + + // How many tasks were executed? + {assert workflow.trace.tasks().size() == 15}, + + // How many results were produced? + {assert path("${outputDir}/results_cellranger").list().size() == 4}, + {assert path("${outputDir}/results_cellranger/cellranger").list().size() == 4}, + {assert path("${outputDir}/results_cellranger/cellranger/mtx_conversions").list().size() == 4}, + {assert path("${outputDir}/results_cellranger/cellranger/count").list().size() == 3}, + {assert path("${outputDir}/results_cellranger/fastqc").list().size() == 12}, + {assert path("${outputDir}/results_cellranger/multiqc").list().size() == 3}, + + // + // Check if files were produced + // + {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + + // + // Check if files are the same + // + {assert snapshot( + workflow, + path( "${outputDir}/results_cellranger/cellranger/count/Sample_X/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" ), + path( "${outputDir}/results_cellranger/cellranger/count/Sample_X/outs/filtered_feature_bc_matrix/features.tsv.gz" ), + path( "${outputDir}/results_cellranger/cellranger/count/Sample_X/outs/filtered_feature_bc_matrix/matrix.mtx.gz" ), + path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/filtered_feature_bc_matrix/barcodes.tsv.gz" ), + path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/filtered_feature_bc_matrix/features.tsv.gz" ), + path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/filtered_feature_bc_matrix/matrix.mtx.gz" ), + path( "${outputDir}/results_cellranger/cellranger/count/Sample_X/outs/raw_feature_bc_matrix/barcodes.tsv.gz" ), + path( "${outputDir}/results_cellranger/cellranger/count/Sample_X/outs/raw_feature_bc_matrix/features.tsv.gz" ), + path( "${outputDir}/results_cellranger/cellranger/count/Sample_X/outs/raw_feature_bc_matrix/matrix.mtx.gz" ), + path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/raw_feature_bc_matrix/barcodes.tsv.gz" ), + path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/raw_feature_bc_matrix/features.tsv.gz" ), + path( "${outputDir}/results_cellranger/cellranger/count/Sample_Y/outs/raw_feature_bc_matrix/matrix.mtx.gz" ), + path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_X/Sample_X_matrix.rds" ), + path( "${outputDir}/results_cellranger/cellranger/mtx_conversions/Sample_Y/Sample_Y_matrix.rds" ) + ).match()} + + ) // end of assertAll() + + } + } + +} diff --git a/tests/main_pipeline_cellranger.test.snap b/tests/main_pipeline_cellranger.test.snap new file mode 100644 index 00000000..5f88b954 --- /dev/null +++ b/tests/main_pipeline_cellranger.test.snap @@ -0,0 +1,40 @@ +{ + "test-dataset_cellranger_aligner": { + "content": [ + { + "stderr": [ + + ], + "errorReport": "", + "exitStatus": 0, + "failed": false, + "stdout": [ + + ], + "errorMessage": "", + "trace": { + "tasksFailed": 0, + "tasksCount": 15, + "tasksSucceeded": 15 + }, + "name": "workflow", + "success": true + }, + "barcodes.tsv.gz:md5,fe6e51564b4405b37ca8604a844b1f2e", + "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", + "matrix.mtx.gz:md5,79471f700ec5bab852a960f0d1537705", + "barcodes.tsv.gz:md5,77afe9a76631fc7b44236d3962a55aa5", + "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", + "matrix.mtx.gz:md5,e224aa759250fa5730dc069bce9be253", + "barcodes.tsv.gz:md5,85da6b6e0c78dfe81af8c07c2017ab5e", + "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", + "matrix.mtx.gz:md5,1bc71041a425bfcee38472bfec5997f8", + "barcodes.tsv.gz:md5,081f72b5252ccaf5ffd535ffbd235c4c", + "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", + "matrix.mtx.gz:md5,a4db04e43e650accc96361a287126a6b", + "Sample_X_matrix.rds:md5,f9191ba575a3ab79ada4807715f18573", + "Sample_Y_matrix.rds:md5,7be3f7b29d668dcf7e951b9f4d371a5e" + ], + "timestamp": "2024-01-22T15:19:20.134275449" + } +} \ No newline at end of file diff --git a/tests/main_pipeline_kallisto.test b/tests/main_pipeline_kallisto.test new file mode 100644 index 00000000..98ede8c4 --- /dev/null +++ b/tests/main_pipeline_kallisto.test @@ -0,0 +1,70 @@ +nextflow_pipeline { + + name "Test Workflow main.nf" + script "main.nf" + + test("test-dataset_kallisto_aligner") { + + when { + // the rest is taken from shared config + params { + aligner = 'kallisto' + outdir = "${outputDir}/results_kallisto" + + // Limit resources so that this can run on GitHub Actions -- for some reason it had not been taken from shared config + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + } + } + + then { + + assertAll( + + // + // General assertions + // + + // Did it finish successfully? + {assert workflow.success}, + + // How many tasks were executed? + {assert workflow.trace.tasks().size() == 14}, + + // How many results were produced? + {assert path("${outputDir}/results_kallisto").list().size() == 4}, + {assert path("${outputDir}/results_kallisto/kallisto").list().size() == 4}, + {assert path("${outputDir}/results_kallisto/kallisto/mtx_conversions").list().size() == 4}, + {assert path("${outputDir}/results_kallisto/kallisto/Sample_X.count").list().size() == 9}, + {assert path("${outputDir}/results_kallisto/kallisto/Sample_Y.count").list().size() == 9}, + {assert path("${outputDir}/results_kallisto/fastqc").list().size() == 12}, + {assert path("${outputDir}/results_kallisto/multiqc").list().size() == 3}, + + // + // Check if files were produced + // + {assert new File( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + + // + // Check if files are the same + // + {assert snapshot( + workflow, + path( "${outputDir}/results_kallisto/kallisto/Sample_X.count/counts_unfiltered/cells_x_genes.barcodes.txt" ), + path( "${outputDir}/results_kallisto/kallisto/Sample_X.count/counts_unfiltered/cells_x_genes.genes.txt" ), + path( "${outputDir}/results_kallisto/kallisto/Sample_X.count/counts_unfiltered/cells_x_genes.mtx" ), + path( "${outputDir}/results_kallisto/kallisto/Sample_Y.count/counts_unfiltered/cells_x_genes.barcodes.txt" ), + path( "${outputDir}/results_kallisto/kallisto/Sample_Y.count/counts_unfiltered/cells_x_genes.genes.txt" ), + path( "${outputDir}/results_kallisto/kallisto/Sample_Y.count/counts_unfiltered/cells_x_genes.mtx" ), + path( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_X/Sample_X_matrix.rds" ), + path( "${outputDir}/results_kallisto/kallisto/mtx_conversions/Sample_Y/Sample_Y_matrix.rds" ) + ).match()} + + ) // end of assertAll() + + } + } + +} diff --git a/tests/main_pipeline_kallisto.test.snap b/tests/main_pipeline_kallisto.test.snap new file mode 100644 index 00000000..b0fb50bb --- /dev/null +++ b/tests/main_pipeline_kallisto.test.snap @@ -0,0 +1,34 @@ +{ + "test-dataset_kallisto_aligner": { + "content": [ + { + "stderr": [ + + ], + "errorReport": "", + "exitStatus": 0, + "failed": false, + "stdout": [ + + ], + "errorMessage": "", + "trace": { + "tasksFailed": 0, + "tasksCount": 14, + "tasksSucceeded": 14 + }, + "name": "workflow", + "success": true + }, + "cells_x_genes.barcodes.txt:md5,18be561873e435d4587f6b3f95a0e301", + "cells_x_genes.genes.txt:md5,acd9d00120f52031974b2add3e7521b6", + "cells_x_genes.mtx:md5,37d2cd8c712f9c70463e87485bf6cd36", + "cells_x_genes.barcodes.txt:md5,488437e1f5477243697efb93366e5676", + "cells_x_genes.genes.txt:md5,acd9d00120f52031974b2add3e7521b6", + "cells_x_genes.mtx:md5,af90e05b404490f6cb133ab7f62949f8", + "Sample_X_matrix.rds:md5,f0e43f69403f4b2e7704065421592ad0", + "Sample_Y_matrix.rds:md5,61809156e64dbdaf254cbc1c3456588e" + ], + "timestamp": "2024-01-23T12:19:47.921508953" + } +} \ No newline at end of file diff --git a/tests/main_pipeline_star.test b/tests/main_pipeline_star.test new file mode 100644 index 00000000..6616ea87 --- /dev/null +++ b/tests/main_pipeline_star.test @@ -0,0 +1,74 @@ +nextflow_pipeline { + + name "Test Workflow main.nf" + script "main.nf" + + test("test-dataset_star_aligner") { + + when { + // the rest is taken from shared config + params { + aligner = 'star' + outdir = "${outputDir}/results_star" + + // Limit resources so that this can run on GitHub Actions -- for some reason it had not been taken from shared config + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + } + } + + then { + + assertAll( + + // + // General assertions + // + + // Did it finish successfully? + {assert workflow.success}, + + // How many tasks were executed? + {assert workflow.trace.tasks().size() == 14}, + + // How many results were produced? + {assert path("${outputDir}/results_star").list().size() == 4}, + {assert path("${outputDir}/results_star/star").list().size() == 3}, + {assert path("${outputDir}/results_star/star/mtx_conversions").list().size() == 4}, + {assert path("${outputDir}/results_star/fastqc").list().size() == 12}, + {assert path("${outputDir}/results_star/multiqc").list().size() == 3}, + + // + // Check if files were produced + // + {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ).exists()}, + {assert new File( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ).exists()}, + + // + // Check if files are the same + // + {assert snapshot( + workflow, + path( "${outputDir}/results_star/star/Sample_X/Sample_X.SJ.out.tab" ), + path( "${outputDir}/results_star/star/Sample_X/Sample_X.Solo.out/Barcodes.stats" ), + path( "${outputDir}/results_star/star/Sample_X/Sample_X.Solo.out/Gene/filtered/matrix.mtx.gz" ), + path( "${outputDir}/results_star/star/Sample_X/Sample_X.Solo.out/Gene/filtered/features.tsv.gz" ), + path( "${outputDir}/results_star/star/Sample_X/Sample_X.Solo.out/Gene/filtered/barcodes.tsv.gz" ), + // path( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_matrix.h5ad" ), // does not match + path( "${outputDir}/results_star/star/mtx_conversions/Sample_X/Sample_X_matrix.rds" ), + path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.SJ.out.tab" ), + path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Barcodes.stats" ), + path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Gene/filtered/matrix.mtx.gz" ), + path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Gene/filtered/features.tsv.gz" ), + path( "${outputDir}/results_star/star/Sample_Y/Sample_Y.Solo.out/Gene/filtered/barcodes.tsv.gz" ), + // path( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_matrix.h5ad" ), // does not match + path( "${outputDir}/results_star/star/mtx_conversions/Sample_Y/Sample_Y_matrix.rds" ), + ).match()} + + ) // end of assertAll() + + } + } + +} diff --git a/tests/main_pipeline_star.test.snap b/tests/main_pipeline_star.test.snap new file mode 100644 index 00000000..ef086f60 --- /dev/null +++ b/tests/main_pipeline_star.test.snap @@ -0,0 +1,38 @@ +{ + "test-dataset_star_aligner": { + "content": [ + { + "stderr": [ + + ], + "errorReport": "", + "exitStatus": 0, + "failed": false, + "stdout": [ + + ], + "errorMessage": "", + "trace": { + "tasksFailed": 0, + "tasksCount": 14, + "tasksSucceeded": 14 + }, + "name": "workflow", + "success": true + }, + "Sample_X.SJ.out.tab:md5,d2d7f0abe38029012571bdf6622fc6eb", + "Barcodes.stats:md5,7f99dc8aa5e074fbe5779ea7712c0886", + "matrix.mtx.gz:md5,6a923393343aa1a69b0cf1bd998c9285", + "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", + "barcodes.tsv.gz:md5,9a7dacaa1779ea43c1507a947fe6992a", + "Sample_X_matrix.rds:md5,aa2d36dd8507aba864347c88e4ce0d27", + "Sample_Y.SJ.out.tab:md5,98bd31104a860cf80119dc30d938d163", + "Barcodes.stats:md5,2dbf1ae426c1afd97903ee001f0db5ce", + "matrix.mtx.gz:md5,0ae080bd0002e350531a5816e159345e", + "features.tsv.gz:md5,99e453cb1443a3e43e99405184e51a5e", + "barcodes.tsv.gz:md5,9b695b0b91bcb146ec9c4688ca10a690", + "Sample_Y_matrix.rds:md5,d459af8f99258bcc88b80b2f7c58e911" + ], + "timestamp": "2024-01-19T15:46:22.470527538" + } +} \ No newline at end of file diff --git a/tests/nextflow.config b/tests/nextflow.config new file mode 100644 index 00000000..66e56d29 --- /dev/null +++ b/tests/nextflow.config @@ -0,0 +1,32 @@ +/* +======================================================================================== + Nextflow config file for running tests +======================================================================================== +*/ + +// Copy from test.config for standardization +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function' + + // Limit resources so that this can run on GitHub Actions + max_cpus = 2 + max_memory = '6.GB' + max_time = '6.h' + + // Input data + input = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/samplesheet-2-0.csv' + + // Genome references + fasta = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/GRCm38.p6.genome.chr19.fa' + gtf = 'https://github.com/nf-core/test-datasets/raw/scrnaseq/reference/gencode.vM19.annotation.chr19.gtf' + protocol = '10XV2' + + validationSchemaIgnoreParams = 'genomes' +} + +process { + withName: '.*:CELLRANGER_COUNT' { + maxForks = 1 + } +} diff --git a/tower.yml b/tower.yml index 787aedfe..999e82d3 100644 --- a/tower.yml +++ b/tower.yml @@ -1,5 +1,11 @@ reports: multiqc_report.html: display: "MultiQC HTML report" - samplesheet.csv: - display: "Auto-created samplesheet with collated metadata and FASTQ paths" + "**/fastqc/*_fastqc.html": + display: "FastQC HTML report" + "**/pipeline_info/execution_timeline_*.html": + display: "Execution timeline report" + "**/pipeline_info/execution_report_*.html": + display: "Execution overview report" + "**/star/**/*.Log.final.out": + display: "Star per-sample report" diff --git a/workflows/scrnaseq.nf b/workflows/scrnaseq.nf index 40e0dfdb..6801d4dc 100644 --- a/workflows/scrnaseq.nf +++ b/workflows/scrnaseq.nf @@ -10,6 +10,61 @@ include { paramsSummaryMap } from 'plugin/nf-validation' include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_scrnaseq_pipeline' +include { paramsSummaryLog; paramsSummaryMap } from 'plugin/nf-validation' + +def summary_params = paramsSummaryMap(workflow) + +def checkPathParamList = [ + params.input, params.multiqc_config, params.fasta, params.gtf, + params.transcript_fasta, params.salmon_index, params.kallisto_index, + params.star_index, params.txp2gene, params.barcode_whitelist, params.cellranger_index, + params.universc_index +] +for (param in checkPathParamList) { if (param) { file(param, checkIfExists: true) } } + + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + CONFIG FILES +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +ch_multiqc_config = Channel.fromPath("$projectDir/assets/multiqc_config.yml", checkIfExists: true) +ch_multiqc_custom_config = params.multiqc_config ? Channel.fromPath( params.multiqc_config, checkIfExists: true ) : Channel.empty() +ch_multiqc_logo = params.multiqc_logo ? Channel.fromPath( params.multiqc_logo, checkIfExists: true ) : Channel.empty() +ch_multiqc_custom_methods_description = params.multiqc_methods_description ? file(params.multiqc_methods_description, checkIfExists: true) : file("$projectDir/assets/methods_description_template.yml", checkIfExists: true) + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT LOCAL MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// SUBWORKFLOW: Consisting of a mix of local and nf-core/modules +// +include { INPUT_CHECK } from '../subworkflows/local/input_check' +include { FASTQC_CHECK } from '../subworkflows/local/fastqc' +include { KALLISTO_BUSTOOLS } from '../subworkflows/local/kallisto_bustools' +include { SCRNASEQ_ALEVIN } from '../subworkflows/local/alevin' +include { STARSOLO } from '../subworkflows/local/starsolo' +include { CELLRANGER_ALIGN } from "../subworkflows/local/align_cellranger" +include { CELLRANGERARC_ALIGN } from "../subworkflows/local/align_cellrangerarc" +include { UNIVERSC_ALIGN } from "../subworkflows/local/align_universc" +include { MTX_CONVERSION } from "../subworkflows/local/mtx_conversion" +include { GTF_GENE_FILTER } from '../modules/local/gtf_gene_filter' + +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + IMPORT NF-CORE MODULES/SUBWORKFLOWS +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +// +// MODULE: Installed directly from nf-core/modules +// +include { CUSTOM_DUMPSOFTWAREVERSIONS } from '../modules/nf-core/custom/dumpsoftwareversions/main' +include { MULTIQC } from '../modules/nf-core/multiqc/main' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -17,6 +72,52 @@ include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_scrn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ */ +// Info required for completion email and summary +// TODO: Are this channels still necessary? +ch_output_docs = file("$projectDir/docs/output.md", checkIfExists: true) +ch_output_docs_images = file("$projectDir/docs/images/", checkIfExists: true) +protocol_config = WorkflowScrnaseq.getProtocol(workflow, log, params.aligner, params.protocol) +if (protocol_config['protocol'] == 'auto' && params.aligner != "cellranger") { + error "Only cellranger supports `protocol = 'auto'`. Please specify the protocol manually!" +} + +// general input and params +ch_input = file(params.input) +ch_genome_fasta = Channel.value(params.fasta ? file(params.fasta) : []) +ch_gtf = params.gtf ? file(params.gtf) : [] +ch_transcript_fasta = params.transcript_fasta ? file(params.transcript_fasta): [] +ch_motifs = params.motifs ? file(params.motifs) : [] +ch_cellrangerarc_config = params.cellrangerarc_config ? file(params.cellrangerarc_config) : [] +ch_txp2gene = params.txp2gene ? file(params.txp2gene) : [] +ch_multiqc_alevin = Channel.empty() +ch_multiqc_star = Channel.empty() +ch_multiqc_cellranger = Channel.empty() +if (params.barcode_whitelist) { + ch_barcode_whitelist = file(params.barcode_whitelist) +} else if (protocol_config.containsKey("whitelist")) { + ch_barcode_whitelist = file("$projectDir/${protocol_config['whitelist']}") +} else { + ch_barcode_whitelist = [] +} + + +//kallisto params +ch_kallisto_index = params.kallisto_index ? file(params.kallisto_index) : [] +kb_workflow = params.kb_workflow + +//salmon params +ch_salmon_index = params.salmon_index ? file(params.salmon_index) : [] + +//star params +ch_star_index = params.star_index ? file(params.star_index) : [] +star_feature = params.star_feature + +//cellranger params +ch_cellranger_index = params.cellranger_index ? file(params.cellranger_index) : [] + +//universc params +ch_universc_index = params.universc_index ? file(params.universc_index) : [] + workflow SCRNASEQ { take: @@ -24,24 +125,139 @@ workflow SCRNASEQ { main: - ch_versions = Channel.empty() - ch_multiqc_files = Channel.empty() + ch_versions = Channel.empty() + ch_mtx_matrices = Channel.empty() - // - // MODULE: Run FastQC - // - FASTQC ( - ch_samplesheet + // Check input files and stage input data + ch_fastq = INPUT_CHECK( ch_input ).reads + + ch_versions = ch_versions.mix(INPUT_CHECK.out.versions) + // TODO: OPTIONAL, you can use nf-validation plugin to create an input channel from the samplesheet with Channel.fromSamplesheet("input") + // See the documentation https://nextflow-io.github.io/nf-validation/samplesheets/fromSamplesheet/ + // ! There is currently no tooling to help you write a sample sheet schema + + // Run FastQC + ch_multiqc_fastqc = Channel.empty() + if (!params.skip_fastqc) { + FASTQC_CHECK ( ch_fastq ) + ch_versions = ch_versions.mix(FASTQC_CHECK.out.fastqc_version) + ch_multiqc_fastqc = FASTQC_CHECK.out.fastqc_zip + } else { + ch_multiqc_fastqc = Channel.empty() + } + + ch_filter_gtf = GTF_GENE_FILTER ( ch_genome_fasta, ch_gtf ).gtf + + // Run kallisto bustools pipeline + if (params.aligner == "kallisto") { + KALLISTO_BUSTOOLS( + ch_genome_fasta, + ch_filter_gtf, + ch_kallisto_index, + ch_txp2gene, + protocol_config['protocol'], + kb_workflow, + ch_fastq + ) + ch_versions = ch_versions.mix(KALLISTO_BUSTOOLS.out.ch_versions) + ch_mtx_matrices = ch_mtx_matrices.mix(KALLISTO_BUSTOOLS.out.counts) + ch_txp2gene = KALLISTO_BUSTOOLS.out.txp2gene + } + + // Run salmon alevin pipeline + if (params.aligner == "alevin") { + SCRNASEQ_ALEVIN( + ch_genome_fasta, + ch_filter_gtf, + ch_transcript_fasta, + ch_salmon_index, + ch_txp2gene, + ch_barcode_whitelist, + protocol_config['protocol'], + ch_fastq + ) + ch_versions = ch_versions.mix(SCRNASEQ_ALEVIN.out.ch_versions) + ch_multiqc_alevin = SCRNASEQ_ALEVIN.out.alevin_results + ch_mtx_matrices = ch_mtx_matrices.mix(SCRNASEQ_ALEVIN.out.alevin_results) + } + + // Run STARSolo pipeline + if (params.aligner == "star") { + STARSOLO( + ch_genome_fasta, + ch_filter_gtf, + ch_star_index, + protocol_config['protocol'], + ch_barcode_whitelist, + ch_fastq, + star_feature, + protocol_config.get('extra_args', ""), + ) + ch_versions = ch_versions.mix(STARSOLO.out.ch_versions) + ch_mtx_matrices = ch_mtx_matrices.mix(STARSOLO.out.star_counts) + ch_star_index = STARSOLO.out.star_index + ch_multiqc_star = STARSOLO.out.for_multiqc + } + + // Run cellranger pipeline + if (params.aligner == "cellranger") { + CELLRANGER_ALIGN( + ch_genome_fasta, + ch_filter_gtf, + ch_cellranger_index, + ch_fastq, + protocol_config['protocol'] + ) + ch_versions = ch_versions.mix(CELLRANGER_ALIGN.out.ch_versions) + ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGER_ALIGN.out.cellranger_out) + ch_star_index = CELLRANGER_ALIGN.out.star_index + ch_multiqc_cellranger = CELLRANGER_ALIGN.out.cellranger_out.map{ + meta, outs -> outs.findAll{ it -> it.name == "web_summary.html"} + } + } + + // Run universc pipeline + if (params.aligner == "universc") { + UNIVERSC_ALIGN( + ch_genome_fasta, + ch_filter_gtf, + ch_universc_index, + protocol_config['protocol'], + ch_fastq + ) + ch_versions = ch_versions.mix(UNIVERSC_ALIGN.out.ch_versions) + ch_mtx_matrices = ch_mtx_matrices.mix(UNIVERSC_ALIGN.out.universc_out) + } + + // Run cellranger pipeline + if (params.aligner == "cellrangerarc") { + CELLRANGERARC_ALIGN( + ch_genome_fasta, + ch_filter_gtf, + ch_motifs, + ch_cellranger_index, + ch_fastq, + ch_cellrangerarc_config + ) + ch_versions = ch_versions.mix(CELLRANGERARC_ALIGN.out.ch_versions) + ch_mtx_matrices = ch_mtx_matrices.mix(CELLRANGERARC_ALIGN.out.cellranger_arc_out) + } + + // Run mtx to h5ad conversion subworkflow + MTX_CONVERSION ( + ch_mtx_matrices, + ch_input, + ch_txp2gene, + ch_star_index ) - ch_multiqc_files = ch_multiqc_files.mix(FASTQC.out.zip.collect{it[1]}) - ch_versions = ch_versions.mix(FASTQC.out.versions.first()) - // - // Collate and save software versions - // - softwareVersionsToYAML(ch_versions) - .collectFile(storeDir: "${params.outdir}/pipeline_info", name: 'nf_core_pipeline_software_mqc_versions.yml', sort: true, newLine: true) - .set { ch_collated_versions } + //Add Versions from MTX Conversion workflow too + ch_versions.mix(MTX_CONVERSION.out.ch_versions) + + // collect software versions + CUSTOM_DUMPSOFTWAREVERSIONS ( + ch_versions.unique().collectFile(name: 'collated_versions.yml') + ) // // MODULE: MultiQC