diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 413f19c..bc024c6 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -13,7 +13,7 @@ jobs: steps: - uses: actions/checkout@v4 - name: install micromamba - uses: mamba-org/provision-with-micromamba@v16 + uses: mamba-org/setup-micromamba@v1 with: environment-file: docs/environment.yml environment-name: sphinx diff --git a/.gitignore b/.gitignore index 9706f2a..38d5e81 100644 --- a/.gitignore +++ b/.gitignore @@ -26,4 +26,5 @@ mix_data *data *report.html *.simg -*results \ No newline at end of file +*results +.DS_Store \ No newline at end of file diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 466df71..a915e8c 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "0.1.0" + ".": "0.1.1" } diff --git a/CHANGELOG.md b/CHANGELOG.md index be3e20d..9f1a35e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,12 @@ # Changelog +## [0.1.1](https://github.com/kircherlab/MPRAsnakeflow/compare/MPRAsnakeflow-v0.1.0...MPRAsnakeflow-v0.1.1) (2024-09-30) + +### Bug Fixes + +* Detach from anaconda ([#122](https://github.com/kircherlab/MPRAsnakeflow/issues/122)) ([16bcea2](https://github.com/kircherlab/MPRAsnakeflow/commit/16bcea2f04190a5965ad1865cf30f6dd44f1b6a0)) +* memory resources for bbmap ([#123](https://github.com/kircherlab/MPRAsnakeflow/issues/123)) ([af93f58](https://github.com/kircherlab/MPRAsnakeflow/commit/af93f588e9387ddf91197f5587d36c3481499b38)) + ## [0.1.0](https://github.com/kircherlab/MPRAsnakeflow/compare/MPRAsnakeflow-v0.0.1...MPRAsnakeflow-v0.1.0) (2024-09-18) First release of MPRAsnakeflow! 
diff --git a/Dockerfile b/Dockerfile index 698b800..cb2bf4a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,6 @@ -FROM condaforge/mambaforge:latest +ARG VERSION=0.1.1 + +FROM condaforge/miniforge3:latest LABEL io.github.snakemake.containerized="true" LABEL io.github.snakemake.conda_env_hash="7a57714fe74eb25255d53b45e2095cd8a4dd4fe73db79006353670c432af97b1" @@ -6,85 +8,79 @@ LABEL io.github.snakemake.conda_env_hash="7a57714fe74eb25255d53b45e2095cd8a4dd4f # Conda environment: # source: workflow/envs/NGmerge.yaml -# prefix: /conda-envs/c243bde7dc056785a077f6c33e56e8d6 +# prefix: /conda-envs/8b5bbd33d7ccdbbe3a28773771abe2b3 # --- # channels: # - conda-forge # - bioconda -# - defaults # dependencies: # - ngmerge=0.3 # - python -# - click +# - click # - htslib -RUN mkdir -p /conda-envs/c243bde7dc056785a077f6c33e56e8d6 -COPY workflow/envs/NGmerge.yaml /conda-envs/c243bde7dc056785a077f6c33e56e8d6/environment.yaml +RUN mkdir -p /conda-envs/8b5bbd33d7ccdbbe3a28773771abe2b3 +COPY workflow/envs/NGmerge.yaml /conda-envs/8b5bbd33d7ccdbbe3a28773771abe2b3/environment.yaml # Conda environment: # source: workflow/envs/bbmap_samtools_htslib.yaml -# prefix: /conda-envs/575ebc82fb464fb2d0748323abbd3a13 +# prefix: /conda-envs/f24b0ccfc23aadb93b466380cd592733 # --- # channels: # - bioconda # - conda-forge -# - defaults # dependencies: # - bbmap # - samtools # - htslib -RUN mkdir -p /conda-envs/575ebc82fb464fb2d0748323abbd3a13 -COPY workflow/envs/bbmap_samtools_htslib.yaml /conda-envs/575ebc82fb464fb2d0748323abbd3a13/environment.yaml +RUN mkdir -p /conda-envs/f24b0ccfc23aadb93b466380cd592733 +COPY workflow/envs/bbmap_samtools_htslib.yaml /conda-envs/f24b0ccfc23aadb93b466380cd592733/environment.yaml # Conda environment: # source: workflow/envs/bwa_samtools_picard_htslib.yaml -# prefix: /conda-envs/f354d1f7a8fd64abb8ea8902ec91d399 +# prefix: /conda-envs/a0a27bbdca540d02023c7fdcf819bfc1 # --- # channels: # - bioconda # - conda-forge -# - defaults # dependencies: # - bwa # - samtools # - 
picard # - htslib -RUN mkdir -p /conda-envs/f354d1f7a8fd64abb8ea8902ec91d399 -COPY workflow/envs/bwa_samtools_picard_htslib.yaml /conda-envs/f354d1f7a8fd64abb8ea8902ec91d399/environment.yaml +RUN mkdir -p /conda-envs/a0a27bbdca540d02023c7fdcf819bfc1 +COPY workflow/envs/bwa_samtools_picard_htslib.yaml /conda-envs/a0a27bbdca540d02023c7fdcf819bfc1/environment.yaml # Conda environment: # source: workflow/envs/default.yaml -# prefix: /conda-envs/9444545a0ebc79ec516fa74514742720 +# prefix: /conda-envs/899c0a8c04c6edeed4b242dbc3e0e1c8 # --- # channels: # - conda-forge # - bioconda -# - defaults # dependencies: # - htslib -RUN mkdir -p /conda-envs/9444545a0ebc79ec516fa74514742720 -COPY workflow/envs/default.yaml /conda-envs/9444545a0ebc79ec516fa74514742720/environment.yaml +RUN mkdir -p /conda-envs/899c0a8c04c6edeed4b242dbc3e0e1c8 +COPY workflow/envs/default.yaml /conda-envs/899c0a8c04c6edeed4b242dbc3e0e1c8/environment.yaml # Conda environment: # source: workflow/envs/fastqsplitter.yaml -# prefix: /conda-envs/dc242c7dafc90db387bc0290c31dc7ae +# prefix: /conda-envs/e5aec3a0d6b8921994e5305d4f89e90f # --- # channels: # - bioconda -# - defaults # - conda-forge # dependencies: # - fastqsplitter -RUN mkdir -p /conda-envs/dc242c7dafc90db387bc0290c31dc7ae -COPY workflow/envs/fastqsplitter.yaml /conda-envs/dc242c7dafc90db387bc0290c31dc7ae/environment.yaml +RUN mkdir -p /conda-envs/e5aec3a0d6b8921994e5305d4f89e90f +COPY workflow/envs/fastqsplitter.yaml /conda-envs/e5aec3a0d6b8921994e5305d4f89e90f/environment.yaml # Conda environment: # source: workflow/envs/python3.yaml -# prefix: /conda-envs/dadb883da8c83465d38f12e012df0cd0 +# prefix: /conda-envs/a4e1b935cbca52df9b6f192ff86c464c # --- # channels: # - conda-forge # - bioconda -# - defaults # dependencies: # - biopython # - click @@ -95,62 +91,59 @@ COPY workflow/envs/fastqsplitter.yaml /conda-envs/dc242c7dafc90db387bc0290c31dc7 # - python # - pysam # - pyfastx -RUN mkdir -p /conda-envs/dadb883da8c83465d38f12e012df0cd0 -COPY 
workflow/envs/python3.yaml /conda-envs/dadb883da8c83465d38f12e012df0cd0/environment.yaml +RUN mkdir -p /conda-envs/a4e1b935cbca52df9b6f192ff86c464c +COPY workflow/envs/python3.yaml /conda-envs/a4e1b935cbca52df9b6f192ff86c464c/environment.yaml # Conda environment: # source: workflow/envs/r.yaml -# prefix: /conda-envs/e6c048b22dbbbe081b8d18143c20afe3 +# prefix: /conda-envs/ae3e37bf43cbb30416a885168e10c552 # --- # channels: # - conda-forge # - bioconda -# - defaults # dependencies: # - r-base # - r-cowplot # - r-cairo # - r-optparse # - r-tidyverse -RUN mkdir -p /conda-envs/e6c048b22dbbbe081b8d18143c20afe3 -COPY workflow/envs/r.yaml /conda-envs/e6c048b22dbbbe081b8d18143c20afe3/environment.yaml +RUN mkdir -p /conda-envs/ae3e37bf43cbb30416a885168e10c552 +COPY workflow/envs/r.yaml /conda-envs/ae3e37bf43cbb30416a885168e10c552/environment.yaml # Conda environment: # source: workflow/envs/python27.yaml -# prefix: /conda-envs/c1d850971f4158052cd52615fbc1591a +# prefix: /conda-envs/cb972f023533b03e742da9095ce03b06 # --- # channels: # - bioconda -# - defaults # - conda-forge # dependencies: # - htslib # - pysam # - python=2.7 # - samtools -RUN mkdir -p /conda-envs/c1d850971f4158052cd52615fbc1591a -COPY workflow/envs/python27.yaml /conda-envs/c1d850971f4158052cd52615fbc1591a/environment.yaml +RUN mkdir -p /conda-envs/cb972f023533b03e742da9095ce03b06 +COPY workflow/envs/python27.yaml /conda-envs/cb972f023533b03e742da9095ce03b06/environment.yaml # Conda environment: # source: workflow/envs/cutadapt.yaml -# prefix: /conda-envs/e6c048b22dbbbe081b8d18143c20afe3 +# prefix: /conda-envs/a3e2fce7f2f6fdbe1aa97232e3def601 # --- # channels: # - conda-forge # - bioconda -# - defaults # dependencies: # - cutadapt -RUN mkdir -p /conda-envs/d49adba2589cd2a66656b9298acdbece -COPY workflow/envs/cutadapt.yaml /conda-envs/d49adba2589cd2a66656b9298acdbece/environment.yaml +RUN mkdir -p /conda-envs/a3e2fce7f2f6fdbe1aa97232e3def601 +COPY workflow/envs/cutadapt.yaml 
/conda-envs/a3e2fce7f2f6fdbe1aa97232e3def601/environment.yaml # Conda environment: # source: workflow/envs/quarto.yaml -# prefix: /conda-envs/b8e51d222ab0d9caac2206a127729b1c +# prefix: /conda-envs/b933cc1aa7c25db04635e7ec0e37f80e +# --- # channels: # - conda-forge # - bioconda -# - defaults # dependencies: # - python # - quarto @@ -158,21 +151,31 @@ COPY workflow/envs/cutadapt.yaml /conda-envs/d49adba2589cd2a66656b9298acdbece/en # - pandas # - matplotlib # - papermill -RUN mkdir -p /conda-envs/b8e51d222ab0d9caac2206a127729b1c -COPY workflow/envs/quarto.yaml /conda-envs/b8e51d222ab0d9caac2206a127729b1c/environment.yaml +RUN mkdir -p /conda-envs/b933cc1aa7c25db04635e7ec0e37f80e +COPY workflow/envs/quarto.yaml /conda-envs/b933cc1aa7c25db04635e7ec0e37f80e/environment.yaml # Step 2: Generate conda environments -RUN mamba env create --prefix /conda-envs/c243bde7dc056785a077f6c33e56e8d6 --file /conda-envs/c243bde7dc056785a077f6c33e56e8d6/environment.yaml -RUN mamba env create --prefix /conda-envs/575ebc82fb464fb2d0748323abbd3a13 --file /conda-envs/575ebc82fb464fb2d0748323abbd3a13/environment.yaml -RUN mamba env create --prefix /conda-envs/f354d1f7a8fd64abb8ea8902ec91d399 --file /conda-envs/f354d1f7a8fd64abb8ea8902ec91d399/environment.yaml -RUN mamba env create --prefix /conda-envs/9444545a0ebc79ec516fa74514742720 --file /conda-envs/9444545a0ebc79ec516fa74514742720/environment.yaml -RUN mamba env create --prefix /conda-envs/dc242c7dafc90db387bc0290c31dc7ae --file /conda-envs/dc242c7dafc90db387bc0290c31dc7ae/environment.yaml -RUN mamba env create --prefix /conda-envs/dadb883da8c83465d38f12e012df0cd0 --file /conda-envs/dadb883da8c83465d38f12e012df0cd0/environment.yaml -RUN mamba env create --prefix /conda-envs/e6c048b22dbbbe081b8d18143c20afe3 --file /conda-envs/e6c048b22dbbbe081b8d18143c20afe3/environment.yaml -RUN mamba env create --prefix /conda-envs/c1d850971f4158052cd52615fbc1591a --file /conda-envs/c1d850971f4158052cd52615fbc1591a/environment.yaml -RUN mamba env create 
--prefix /conda-envs/d49adba2589cd2a66656b9298acdbece --file /conda-envs/d49adba2589cd2a66656b9298acdbece/environment.yaml -RUN mamba env create --prefix /conda-envs/b8e51d222ab0d9caac2206a127729b1c --file /conda-envs/b8e51d222ab0d9caac2206a127729b1c/environment.yaml -RUN mamba clean --all -y +RUN <`_. + + File tree of the result folder (names in :code:`< >` can be specified in the config file.) diff --git a/docs/cluster.rst b/docs/cluster.rst index 8d48e3c..0b8028a 100644 --- a/docs/cluster.rst +++ b/docs/cluster.rst @@ -11,12 +11,12 @@ Snakemake gives us the opportunity to run MPRAsnakeflow in a cluster environment :lines: 1-30 -We used this workflow successfully in a SLURM environment using the `slurm excecutor plugin `_ from snakemake. therfore the partition is set with :code:`slurm_partition` and has to be renamed maybe due to your environment. +We used this workflow successfully in a SLURM environment using the `slurm excecutor plugin `_ from snakemake. Therfore the partition is set with :code:`slurm_partition` and has to be renamed or removed to fith with your own SLURM configuration. Running with resources ---------------------- -having 30 cores and 10GB of memory. +Having 30 cores and 10GB of memory. .. code-block:: bash @@ -35,13 +35,13 @@ Using the slurm excecutor plugin running 300 jobs in parallel. Snakemake 7 ----------- -Here we used the :code:`cluster` option which is not anymore avialable in snakemake 8. You can also use the predefined `config/sbatch.yaml` but this might be outdated and we highly recommend to use resources with the workfloe profile. +Here we used the :code:`--cluster` option which is not anymo,onger available in snakemake 8. You can also use the predefined `config/sbatch.yaml` but this might be outdated and we highly recommend to use resources with the workfloe profile. .. 
code-block:: bash snakemake --use-conda --configfile config/config.yaml --cluster "sbatch --nodes=1 --ntasks={cluster.threads} --mem={cluster.mem} -t {cluster.time} -p {cluster.queue} -o {cluster.output}" --jobs 100 --cluster-config config/sbatch.yaml -Please note that the log folder of the cluster environment has to be generated first, e.g: +Please note that with this :code:`--cluster` option the log folder of the cluster environment (see :code:` -o {cluster.output}`) has to be generated first, e.g: .. code-block:: bash diff --git a/docs/combined_example1.rst b/docs/combined_example1.rst index cfb1a34..f920b9b 100644 --- a/docs/combined_example1.rst +++ b/docs/combined_example1.rst @@ -174,7 +174,7 @@ When dry-drun does not give any errors we will run the workflow. We use a machin .. code-block:: bash - snakemake -c 30 --use-conda --snakefile /home/user/MPRAsnakeflow/workflow/Snakefile --configfile /home/user/MPRAsnakeflow/resources/combined_basic/config.yml + snakemake -c 30 --sdm conda --snakefile /home/user/MPRAsnakeflow/workflow/Snakefile --configfile /home/user/MPRAsnakeflow/resources/combined_basic/config.yml .. note:: Please modify your code when running in a cluster environment. We have an example SLURM config file here :code:`config/sbatch.yml`. 
diff --git a/docs/conf.py b/docs/conf.py index 6fc0188..39e2c62 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -23,10 +23,12 @@ copyright = u'2024, Max Schubach' author = u'Max Schubach' +f = open("../version.txt", "r") + # The short X.Y version -version = u'1.0' +version = f.read().strip() # The full version, including alpha/beta/rc tags -release = u'1.0' +release = version # -- General configuration --------------------------------------------------- @@ -161,8 +163,8 @@ # dir menu entry, description, category) texinfo_documents = [ (master_doc, 'MPRAsnakeflow', u'MPRAsnakeflow Documentation', - author, 'MPRAsnakeflow', 'One line description of project.', - 'miscellaneous'), + author, 'MPRAsnakeflow', 'Snakemake workflow to get assignments or counts from ´MPRA sequencing data.', + 'MPRA sequencing data workflow'), ] diff --git a/docs/config.rst b/docs/config.rst index 263ab6c..d5c7203 100644 --- a/docs/config.rst +++ b/docs/config.rst @@ -4,14 +4,14 @@ Config File ===================== -The config file is a yaml file that contains the configuration. Different runs can be configured. We recommend to use one config file per MPRA experiment or MPRA project. But in theory many different experiments can be configured in only one file. It is divided into :code:`global` (generell settings), :code:`assignments` (assigment workflow), and :code:`experiments` (count workflow including variants). This is a full example file with default configurations. :download:`config/example_config.yaml <../config/example_config.yaml>`. +The config file is a yaml file that contains the configuration. Different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`global` (general settings), :code:`assignments` (assigment workflow), and :code:`experiments` (count workflow including variants). 
This is a full example file with default configurations. :download:`config/example_config.yaml <../config/example_config.yaml>`. .. literalinclude:: ../config/example_config.yaml :language: yaml :linenos: -Note that the config file is conrolled by json schema. This means that the config file is validated against the schema. If the config file is not valid, the program will exit with an error message. The schema is located in :download:`workflow/schemas/config.schema.yaml <../workflow/schemas/config.schema.yaml>`. +Note that the config file is controlled by json schema. This means that the config file is validated against the schema. If the config file is not valid, the program will exit with an error message. The schema is located in :download:`workflow/schemas/config.schema.yaml <../workflow/schemas/config.schema.yaml>`. ---------------- General settings @@ -28,7 +28,7 @@ The general settings are located in the :code:`global` section. The following se Global parameters that hold for the assignment workflow. :split_number: - To parallize mapping for assignment the reads are split into :code:`split_number` files. E.g. setting to 300 this means that the reads are split into 300 files and each file is mapped in parallel. This is only usefull when using on a cluster. Running the workflow only on one machine the default value shopuld be used. Default is set to 1. + To parallize mapping for assignment the reads are split into :code:`split_number` files. E.g. setting to 300 means that the reads are split into 300 files and each file is mapped in parallel. This is only useful when using on a cluster. Running the workflow only on one machine the default value should be used. The default is set to 1. -------------------- Assignment workflow @@ -41,43 +41,43 @@ The assignment workflow is configured in the :code:`assignments` section. 
The fo :start-after: start_assignments :end-before: start_experiments -Each assignment you want to process you have to giv him a name like :code:`example_assignment`. The name is used to name the output files. +For each assignment you want to process you have to give him a name like :code:`example_assignment`. The name is used to name the output files. :alignment_tool: Alignment tool configuration that is used to map the reads to the oligos. :tool: - Alignment tool that is used. Currently :code:`bwa` and :code:`exact` is supported. + Alignment tool that is used. Currently :code:`bwa` and :code:`exact` are supported. :configs: Configurations of the alignment tool selected. :sequence_length (bwa): - Defines the :code:`min` and :code:`max` of a :code:`sequence_length` specify . :code:`sequence_length` is basically the length of a sequence alignment to an oligo in the design file. Because there can be insertion and deletions we recommend to vary it a bit around the exact length (e.g. +-5). In theory this option enables designs with multiple sequence lengths. + Defines the :code:`min` and :code:`max` of a :code:`sequence_length` specify. :code:`sequence_length` is basically the length of a sequence alignment to an oligo in the design file. Because there can be insertion and deletions we recommend to vary it a bit around the exact length (e.g. +-5). In theory, this option enables designs with multiple sequence lengths. :alignment_start (bwa): - Defines the :code:`min` and :code:`max` of the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise 1 will be the choice for most cases. We also recommend to vary this value a bit because the start might not be exact after the adapter. E.g. by +-1. + Defines the :code:`min` and :code:`max` of the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise, 1 will be the choice for most cases. 
We also recommend varying this value a bit because the start might not be exact after the adapter. E.g. by +-1. :min_mapping_quality (bwa): - (Optinal) Defines the minimum mapping quality (MAPQ) of the alinment to an oligo. When using oligos with only 1bp difference it is recommended to set it to 1. For regions only with larger edit distances 30 or 40 might be a good choice. Default :code:`1`. + (Optional) Defines the minimum mapping quality (MAPQ) of the alignment to an oligo. When using oligos with only 1bp difference it is recommended to set it to 1. For regions only with larger edit distances 30 or 40 might be a good choice. Default :code:`1`. :sequence_length (exact): Defines the :code:`sequence_length` which is the length of a sequence alignment to an oligo in the design file. Only one length design is supported. :alignment_start (exact): - Defines the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise 1 will be the choice for most cases. + Defines the start of the alignment in an oligo. When using adapters you have to set basically the length of the adapter. Otherwise, 1 will be the choice for most cases. :bc_length: Length of the barcode. Must match with the length of :code:`BC`. :BC_rev_comp: - (Optional) If set to :code:`true` the barcode of is reverse complemented. Default is :code:`false`. + (Optional) If set to :code:`true` the barcode is reverse complemented. Default is :code:`false`. :linker_length: (Optional) Length of the linker. Only needed if you don't have a barcode read and the barcode is in the FW read with the structure: BC+Linker+Insert. The fixed length is used for the linker after a fixed length of BC. The recommended option is :code:`linker` by defining the exact linker sequence and using cutadapt for trimming. :linker: (Optional) Length of the linker. Only needed if you don't have a barcode read and the barcode is in the FW read with the structure: BC+Linker+Insert. 
Uses cutadapt to trim the linker to get the barcode as well as the starting of the insert. :FW: - List of forward read files in gzipped fastq format. The full or relative path to the files should be used. Same order in FW, BC, and REV is important. + List of forward-read files in gzipped fastq format. The full or relative path to the files should be used. The same order in FW, BC, and REV is important. :REV: - list of reverse read files in gzipped fastq format. The full or relative path to the files should be used. Same order in FW, BC, and REV is important. + List of reverse read files in gzipped fastq format. The full or relative path to the files should be used. Same order in FW, BC, and REV is important. :BC: List of index read files in gzipped fastq format. The full or relative path to the files should be used. Same order in FW, BC, and REV is important. :NGmerge: - (Optional) Options for NGmerge. NGmerge is used merge FW and REV reads. The following options are possible (we recommend to use the default values): + (Optional) Options for NGmerge. NGmerge is used to merge FW and REV reads. The following options are possible (we recommend to use the default values): :min_overlap: (Optional) Minimum overlap of the reads. Default :code:`20`. @@ -87,9 +87,9 @@ Each assignment you want to process you have to giv him a name like :code:`examp (Optional) Minimum dovetailed overlap. Default :code:`10`. :design_file: - Design file (full or relative path) in fasta format. The design file should contain the oligos in fasta format. The header should contain the oligo name and should be unique. The sequence should be the sequence of the oligo and must also be unique. When having multiple oligo names with the same sequence please merge them into one fasta entry. The oligo name later used to link barcode to oligo. The sequence is used to map the reads to the oligos. Adapters can be in the seuqence and therefore :code:`alignment_start` has to be adjusted. 
+ Design file (full or relative path) in fasta format. The design file should contain the oligos in fasta format. The header should contain the oligo name and should be unique. The sequence should be the sequence of the oligo and must also be unique. When having multiple oligo names with the same sequence please merge them into one fasta entry. The oligo name was later used to link barcode to oligo. The sequence is used to map the reads to the oligos. Adapters can be in the sequence and therefore :code:`alignment_start` has to be adjusted. :design_check: - (Optional) Options for checking your design fasta file. Design file cannot have :code:`[` or :code:`]`, duplicated headers and for best performance sequences should not be identical. + (Optional) Options for checking your design fasta file. Design file cannot have :code:`[` or :code:`]`, duplicated headers and for best performance sequences should not be identical. :fast: (Optional) Using a simple dictionary to find identical sequences. This is faster but uses only the whole (or center part depending on start/length) of the design file. Cannot find substrings as part of any sequence. Set to false for more correct, but slower, search. Default :code:`true`. @@ -97,16 +97,16 @@ Each assignment you want to process you have to giv him a name like :code:`examp (Optional) Check if there are identical sequences in the design file. Default :code:`true`. :configs: - After mapping the reads to the design file and extracting the barcodes per oligo the configuration (using different names) can be used to generate multiple filtering and configuration settings of the final maq oligo to barcode. Use `: {}` to use the default values for the keys. 
Each configuration is a dictionary with the following keys: + After mapping the reads to the design file and extracting the barcodes per oligo, the configuration (using different names) can be used to generate multiple filtering and configuration settings of the final mapping oligo to barcode. Use `: {}` to use the default values for the keys. Each configuration is a dictionary with the following keys: :min_support: - Minimum number of same BC that map to the same oligo. Larger value gives more evidence to be correct. But can remove lot's of BCs (depedning on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`3`. + A minimum number of same BC that map to the same oligo. Larger value gives more evidence to be correct. But can remove lot's of BCs (depedning on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`3`. :fraction: - Minumum fraction of same BC that map to the same oligo. E.g. :code:`0.7` means that at least 70% of the BC map to the same oligo. Larger value gives more evidence to be correct. But can remove lot's of BCs (depedning on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`0.7`. + Minimum fraction of same BC that map to the same oligo. E.g. :code:`0.7` means that at least 70% of the BC map to the same oligo. A larger value gives more evidence to be correct. But can remove lots of BCs (depending on the complexity, sequencing depth and quality of sequencing). Recommended option is :code:`0.7`. :unknown_other: - (Optional) Shows not mapped BCs in the final output map. Not recommended to use as mapping file fore the experiment workflow. But can be usefull for debugging. Default is :code:`false`. + (Optional) Shows not mapped BCs in the final output map. Not recommended to use as mapping file for the experiment workflow. But can be useful for debugging. Default is :code:`false`. 
:ambigous: - (Optional) Shows ambigous BCs in the final output map. Not recommended to use as mapping file fore the experiment workflow. But can be usefull for debugging. Default is :code:`false`. + (Optional) Shows ambiguous BCs in the final output map. Not recommended to use as mapping file fore the experiment workflow. But can be usefull for debugging. Default is :code:`false`. -------------------------------------- Experiment workflow (including counts) diff --git a/docs/contributing.rst b/docs/contributing.rst index 13b50f8..bf81a5a 100644 --- a/docs/contributing.rst +++ b/docs/contributing.rst @@ -129,8 +129,8 @@ Use the following steps for installing Sphinx and the dependencies for building .. code-block:: bash cd MPRAsnakeflow/docs - conda env create -f environment.yml -n sphinx - conda activate sphinx + mamba env create -f environment.yml -n sphinx + mamba activate sphinx Use the following for building the documentation. The first two lines is only required for loading the virtualenv. @@ -162,10 +162,10 @@ First, create your Documentation development setup. Now you can make your changes locally. -5. When you're done making your changes, make sure that snakemake runs properly +5. When you're done making your changes, make sure that snakemake runs properly by using a dry-run. 
For snakemake:: - snakemake --use-conda -p -n + snakemake --sdm conda --configfile config.yml -p -n For documentation:: diff --git a/docs/environment.yml b/docs/environment.yml index 0506dc5..98a7dce 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -1,7 +1,6 @@ --- channels: - conda-forge - - defaults dependencies: - python==3.12 - sphinx diff --git a/docs/experiment.rst b/docs/experiment.rst index aefdfbb..70ba2cf 100644 --- a/docs/experiment.rst +++ b/docs/experiment.rst @@ -24,6 +24,13 @@ Comma separated file (CSV) that assigns all fastq files present in a directory t Condidtion2,2,C2R2_DNA_barcode_F.fastq.gz,C2R2_DNA_barcode_UMI.fastq.gz,C2R2_DNA_barcode_R.fastq.gz,C2R2_RNA_barcode_F.fastq.gz,C2R2_RNA_barcode_UMI.fastq.gz,C2R2_RNA_barcode_R.fastq.gz Condidtion2,3,C2R3_DNA_barcode_F.fastq.gz,C2R3_DNA_barcode_UMI.fastq.gz,C2R3_DNA_barcode_R.fastq.gz,C2R3_RNA_barcode_F.fastq.gz,C2R3_RNA_barcode_UMI.fastq.gz,C2R3_RNA_barcode_R.fastq.gz + +We allow different flavours of experiment files because sometimes no UMI exists or only a FW read is used. Different options are: + * :code:`Condition,Replicate,DNA_BC_F,DNA_UMI,DNA_BC_R,RNA_BC_F,RNA_UMI,RNA_BC_R` + * :code:`Condition,Replicate,DNA_BC_F,DNA_BC_R,RNA_BC_F,RNA_BC_R` + * :code:`Condition,Replicate,DNA_BC_F,RNA_BC_F` + + Assignment File or configuration -------------------------------- Tab separated gzipped file with barcode mapped to sequence. Can be generated using the :ref:`Assignment` workflow. Config file must be configured similar to this: @@ -138,6 +145,8 @@ The output can be found in the folder defined by the option :code:`results/exper Files ------------- +Once the pipline is finished running then all the output files can be seen in the results folder. This pipline also generates a qc report. +For more details, refer to the `HTML QC report `_. 
File tree diff --git a/docs/faq.rst index 8729461..ab1c131 100644 --- a/docs/faq.rst +++ b/docs/faq.rst @@ -6,6 +6,17 @@ Frequently Asked Questions If you have more question please write us a ticket on `github `_. + +Is it possible to differentiate between sense and antisense? + No! Or not directly. The reason why we are not able to do this is that reads will map to both sequence strands equally. Then assignment of the barcode becomes ambiguous and is discarded. But when designing oligos you can add short sequence fragments on the start and on the end of the sequence that are different for sense and antisense. These sequences should not be trimmed away during demultiplexing and have to be in the design file. For the lentiMPRA design we have 15bp adapters on both ends for integration of the sequence. They can be used for that purpose. + +The design/reference file check failed, why? + The design file has to have: + * Unique headers. Each sequence has to have a unique sequence/id starting from :code:`>` to the first whitespace or newline. + * No special characters within the headers. This is because mapping tools create a reference dictionary and cannot handle all characters. In addition most databases (like SRA) have their restricted character set for the header. + * Unique sequences. They have to be different. Otherwise mappers place the read to both IDs and the barcode becomes ambiguous and is discarded. When you allow min/max start/lengths for sequences (e.g. in BWA mapping) be aware that the smallest substring has to be unique across all other (sub) sequences.
+ + MPRAsnakeflow is not able to create a Conda environment If you get a message like:: diff --git a/docs/index.rst index e1c6364..3c5e3f4 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -16,7 +16,7 @@ MPRAsnakeflow's documentation MPRAsnakeflow pipeline processes sequencing data from Massively Parallel Reporter Assays (MPRAs) to create count tables for candidate sequences tested in the experiment. -MPRAsnakeflow is built on top of `Snakemake `_. Insert your code into the respective folders, i.e. ``scripts``, ``rules``, and ``envs``. Define the entry point of the workflow in the ``Snakefile`` and the main configuration in a ``.yaml`` file. +MPRAsnakeflow is built on top of `Snakemake `_ (version 8 preferred) and is configured via a ``.yaml`` file. Authors Max Schubach (`@visze `_) @@ -24,16 +24,22 @@ Authors Berlin Institute of Health at Charité - Universitätsklinikum Berlin Usage - If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of the (original) repository and, if available, it's DOI. (see above) + If you use this workflow in a paper, don't forget to give credits to the authors by citing the URL of the (original) repository. Installation & Getting Started Instructions for the Installation of the program and some examples to get you started. MPRAsnakeflow Workflows - An overview of how MPRAsnakeflow works and documentation for the MPRAsnakeflow sub workflows. + An overview of how MPRAsnakeflow works and documentation for the MPRAsnakeflow sub-workflows. + +MPRAsnakeflow Tutorials + Get to know MPRAsnakeflow by running it via jupyter notebooks or colab on small examples. MPRAsnakeflow Examples - Muliple examples from the literature are listed for every sub workflow in MPRAsnakeflow. + Multiple examples from the literature are listed for every sub-workflow in MPRAsnakeflow. + +Tips & Tricks + Find our FAQ here. 
Project Information More information on the project, including the changelog, list of contributing authors, and contribution instructions. @@ -43,7 +49,7 @@ Project Information Quick Example ------------- -To run MPRAsnakeflow, first activate the snakemake environment with the following command: +To run MPRAsnakeflow, first activate the snakemake 8 environment with the following command: .. code-block:: bash @@ -55,24 +61,33 @@ And then run the main workflow with: .. code-block:: bash snakemake --software-deployment-method conda --cores $N --configfile config/example_config.yaml - + -------- Features -------- -:--software-deployment-method: - When ```conda`` is set the utility uses mamba to efficiently query repositories and query package dependencies. MPRAsnakeflow also can use containers via apptainer by using ``--software-deployment-method apptainer``. Recommended option: ``--software-deployment-method conda apptainer`` -:--cores: - This utility sets the number of cores (``$N``) to be used by MPRAsnakeflow. -:--configfile: - This file (e.g., ``config/example_config.yaml``) contains the project, its objects and properties, and sub-properties and its objects that **must** be set before running MPRAsnakeflow. +.. list-table:: + :widths: 25 80 + :header-rows: 1 + + * - Option + - Description + * - ``--software-deployment-method`` + - When ``conda`` is set, the utility uses mamba to efficiently query repositories and query package dependencies. MPRAsnakeflow also can use containers via apptainer by using ``--software-deployment-method apptainer``. Recommended option: ``--software-deployment-method conda apptainer`` + * - ``--cores`` + - This utility sets the number of cores (``$N``) to be used by MPRAsnakeflow. + * - ``--configfile`` + - This file (e.g., ``config/example_config.yaml``) contains the project, its objects and properties, and sub-properties and its objects that **must** be set before running MPRAsnakeflow. 
+ ------------------- Investigate results ------------------- -After successful execution, you can create a self-contained interactive HTML report with all results via: +The best option to investigate your results is to have a look at the QC report. + +(In development) After successful execution, you can create a self-contained interactive HTML report with all results via: .. code-block:: bash @@ -80,7 +95,7 @@ After successful execution, you can create a self-contained interactive HTML rep This report can be forwarded to your collaborators. -An example of a generated report (using some trivial test data) can be seen `here `_. +An example of a generated report (using some trivial test data) can be seen `here `_. -------- Feedback diff --git a/docs/install.rst b/docs/install.rst index ec49884..df7dad5 100644 --- a/docs/install.rst +++ b/docs/install.rst @@ -4,7 +4,7 @@ Installation ===================== -Installation should take less than 5 minutes +Installation should take less than 5 minutes. System Requirements =================== diff --git a/docs/quickstart.rst b/docs/quickstart.rst index ef885bf..b997e73 100644 --- a/docs/quickstart.rst +++ b/docs/quickstart.rst @@ -5,14 +5,18 @@ Getting started ===================== -1. Create an :code:`experiment.csv` in the format below, including the header. - `DNA_BC_F` or `RNA_BC_F` is name of the gzipped fastq of the forward read of the DNA or RNA from the defined condition and replicate. +We highly recommend as a first start the MPRAsnakeflow :ref:`Tutorial` or the :ref:`Assignment example` and :ref:`Count example` examples. Here we provide a quick overview of what you need to start the workflow. + +MPRAsnakeflow consists of two subworkflows, :ref:`Assignment` and :ref:`Experiment`. This quickstart shows the configuration for both and you have to leave out the respective part if you only want to run one of them. + +1. **Experiment workflow only:** Create an :code:`experiment.csv` in the format below, including the header. 
+ `DNA_BC_F` or `RNA_BC_F` is the name of the gzipped fastq of the forward read of the DNA or RNA from the defined condition and replicate. `DNA_UMI` or `RNA_UMI` is the corresponding index read with UMIs (excluding sample barcodes), and `DNA_BC_R` or `RNA_BC_R` of the reverse read. - Multiple fastq files can be used for each column by seperating them with :code:`;`. + Multiple fastq files can be used for each column by separating them with :code:`;`. - Right now an UMI have to be used. If you want to use MPRAsnakeflow without an UMI please sitch to MPRAflow or contact us. + Right now a UMI has to be used. If you want to use MPRAsnakeflow without a UMI please switch to MPRAflow or contact us. Here is an example of an :code:`experiment.csv` file and it can be downloaded :download:`experiment.csv <../resources/example_experiment.csv>`: @@ -21,7 +25,7 @@ Getting started :widths: 5, 2, 25, 25, 25, 25, 25, 25 :header-rows: 1 -2. If you would like each designed sequence to be colored based on different user-specified categories, such as `positive control`, `negative control`, `shuffled control`, and `putative enhancer`. To assess the overall quality, you can create a ``label.tsv`` in the format below that maps the name to category as shown here: +2. **Experiment workflow only:** If you would like each designed sequence to be coloured based on different user-specified categories, such as `positive control`, `negative control`, `shuffled control`, and `putative enhancer`. To assess the overall quality, you can create a ``label.tsv`` in the format below that maps the name to the category as shown here: .. code-block:: text @@ -33,7 +37,7 @@ Getting started 3. Set up the config file -The config file is the heart of MPRAsnakflow. Here different runs can be configured. We recommend to use one config file per MPRA experiment or MPRA roject. But in theory many different experiments can be configured in only one file. 
It is divided into :code:`global` (generell settings), :code:`assignments` (assigment workflow), and :code:`experiments` (count workflow including variants). +The config file is the heart of MPRAsnakeflow. Here different runs can be configured. We recommend using one config file per MPRA experiment or MPRA project. But in theory, many different experiments can be configured in only one file. It is divided into :code:`global` (general settings), :code:`assignments` (assignment workflow), and :code:`experiments` (count workflow including variants). See :ref:`Config` for more details about the config file. Here is an example running only the count experiments and using a provided assignment file. diff --git a/version.txt b/version.txt index 6e8bf73..17e51c3 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.1.0 +0.1.1 diff --git a/workflow/Snakefile b/workflow/Snakefile index 643270c..c9eef96 100644 --- a/workflow/Snakefile +++ b/workflow/Snakefile @@ -1,7 +1,14 @@ ########################## #### containerization #### ########################## -containerized: "docker://visze/mprasnakeflow:0.1.0" + +f = open(workflow.source_path("../version.txt")) +version = f.read().strip() + +print(f"\033[95mRunning MPRAsnakeflow version {version}\033[0m") + + +containerized: f"docker://visze/mprasnakeflow:{version}" ################# diff --git a/workflow/envs/NGmerge.yaml b/workflow/envs/NGmerge.yaml index aa8567a..3c52681 100644 --- a/workflow/envs/NGmerge.yaml +++ b/workflow/envs/NGmerge.yaml @@ -2,9 +2,8 @@ channels: - conda-forge - bioconda - - defaults dependencies: - ngmerge=0.3 - python - - click + - click - htslib diff --git a/workflow/envs/bbmap_samtools_htslib.yaml b/workflow/envs/bbmap_samtools_htslib.yaml index 00d1711..dea9768 100644 --- a/workflow/envs/bbmap_samtools_htslib.yaml +++ b/workflow/envs/bbmap_samtools_htslib.yaml @@ -2,7 +2,6 @@ channels: - bioconda - conda-forge - - defaults dependencies: - bbmap - samtools diff --git 
a/workflow/envs/bwa_samtools_picard_htslib.yaml b/workflow/envs/bwa_samtools_picard_htslib.yaml index 2a185ee..3263e0d 100644 --- a/workflow/envs/bwa_samtools_picard_htslib.yaml +++ b/workflow/envs/bwa_samtools_picard_htslib.yaml @@ -2,7 +2,6 @@ channels: - bioconda - conda-forge - - defaults dependencies: - bwa - samtools diff --git a/workflow/envs/cutadapt.yaml b/workflow/envs/cutadapt.yaml index a03c6db..b101299 100644 --- a/workflow/envs/cutadapt.yaml +++ b/workflow/envs/cutadapt.yaml @@ -2,6 +2,5 @@ channels: - conda-forge - bioconda - - defaults dependencies: - - cutadapt \ No newline at end of file + - cutadapt diff --git a/workflow/envs/default.yaml b/workflow/envs/default.yaml index d1b49a5..7034554 100644 --- a/workflow/envs/default.yaml +++ b/workflow/envs/default.yaml @@ -2,6 +2,5 @@ channels: - conda-forge - bioconda - - defaults dependencies: - - htslib \ No newline at end of file + - htslib diff --git a/workflow/envs/fastq_join.yaml b/workflow/envs/fastq_join.yaml index 35952fe..260a22d 100644 --- a/workflow/envs/fastq_join.yaml +++ b/workflow/envs/fastq_join.yaml @@ -2,9 +2,8 @@ channels: - conda-forge - bioconda - - defaults dependencies: - fastq-join=1.3.1 - python - click - - htslib \ No newline at end of file + - htslib diff --git a/workflow/envs/fastqsplitter.yaml b/workflow/envs/fastqsplitter.yaml index 8ffb86b..c147cfb 100644 --- a/workflow/envs/fastqsplitter.yaml +++ b/workflow/envs/fastqsplitter.yaml @@ -1,7 +1,6 @@ --- channels: - bioconda - - defaults - conda-forge dependencies: - fastqsplitter diff --git a/workflow/envs/python27.yaml b/workflow/envs/python27.yaml index eb50405..2fcd7d9 100644 --- a/workflow/envs/python27.yaml +++ b/workflow/envs/python27.yaml @@ -1,7 +1,6 @@ --- channels: - bioconda - - defaults - conda-forge dependencies: - htslib diff --git a/workflow/envs/python3.yaml b/workflow/envs/python3.yaml index c06db5c..0fb09e4 100644 --- a/workflow/envs/python3.yaml +++ b/workflow/envs/python3.yaml @@ -2,7 +2,6 @@ channels: - 
conda-forge - bioconda - - defaults dependencies: - biopython - click diff --git a/workflow/envs/quarto.yaml b/workflow/envs/quarto.yaml index 101a3c1..8e752a5 100644 --- a/workflow/envs/quarto.yaml +++ b/workflow/envs/quarto.yaml @@ -1,7 +1,7 @@ +--- channels: - conda-forge - bioconda - - defaults dependencies: - python - quarto diff --git a/workflow/envs/r.yaml b/workflow/envs/r.yaml index b5b9c63..e33428c 100644 --- a/workflow/envs/r.yaml +++ b/workflow/envs/r.yaml @@ -2,7 +2,6 @@ channels: - conda-forge - bioconda - - defaults dependencies: - r-base - r-cowplot diff --git a/workflow/rules/assignment/mapping_bbmap.smk b/workflow/rules/assignment/mapping_bbmap.smk index bf933e6..238c803 100644 --- a/workflow/rules/assignment/mapping_bbmap.smk +++ b/workflow/rules/assignment/mapping_bbmap.smk @@ -24,6 +24,9 @@ rule assignment_mapping_bbmap: """ conda: "../../envs/bbmap_samtools_htslib.yaml" + threads: 1 + resources: + mem="4G", input: reads="results/assignment/{assignment}/fastq/merge_split{split}.join.fastq.gz", check="results/assignment/{assignment}/design_check.done", @@ -39,7 +42,8 @@ rule assignment_mapping_bbmap: temp("results/logs/assignment/mapping.bbmap.{assignment}.{split}.log"), shell: """ - bbmap.sh in={input.reads} ref={input.reference} nodisk -t={threads} out={output.bam} &> {log}; + bbmap.sh -eoom -Xmx{resources.mem} -t={threads} \ + in={input.reads} ref={input.reference} nodisk out={output.bam} &> {log}; samtools sort -l 0 -@ {threads} {output.bam} > {output.sorted_bam} 2>> {log}; """