From ce1da8f38150d188cb08407e5a31c0c7b34d529f Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Fri, 27 Dec 2024 12:23:37 +0100 Subject: [PATCH 01/25] Merge changes from main into hotfix (#195) (#388) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: Sara Monzón Co-authored-by: Pablo Mata <76519482+Shettland@users.noreply.github.com> Co-authored-by: Víctor López <98259577+victor5lm@users.noreply.github.com> From 4a70c003bf9458df8e3ac39fd1aff166ce67efd2 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 16:52:19 +0100 Subject: [PATCH 02/25] Removed setup.py and replaced it by pyproject.toml --- pyproject.toml | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++ setup.py | 36 -------------------------- 2 files changed, 69 insertions(+), 36 deletions(-) create mode 100755 pyproject.toml delete mode 100755 setup.py diff --git a/pyproject.toml b/pyproject.toml new file mode 100755 index 000000000..ddc428d66 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,69 @@ +[build-system] +requires = ["setuptools"] +build-backend = "setuptools.build_meta" + +[project] +name = "buisciii-tools" +version = "2.2.4" +dynamic = ["dependencies"] + +authors = [ + {name = "Sara Monzon", email = "smonzon@isciii.es"}, + {name = "Luis Chapado", email = "lchapado@externos.isciii.es"}, + {name = "Isabel Cuesta", email = "isabel.cuesta@isciii.es"}, + {name = "Sarai Varona", email = "s.varona@isciii.es"}, + {name = "Daniel Valle", email = "da.valle@externos.isciii.es"}, + {name = "Pablo Mata", email = "pmata@externos.isciii.es"}, + {name = "Victor Lopez", email = "vmlopez@isciii.es"}, + {name = "Emi Arjona", email = "eiarjona@isciii.es"}, + {name = "Jaime Ozaez", email = "jaime.ozaez@isciii.es"}, + {name = "Juan Ledesma", email = "juan.ledesma@isciii.es"}, + {name = "Sergio Olmos", email = "sergio.olmos@externos.isciii.es"}, + {name = "Alejandro Bernabeu", email = "abernabeu@isciii.es"}, + {name = "Alba Talavera", email = "alba.talavera@externos.isciii.es"} +] + +maintainers = [ + {name = "Sara Monzon", email = "smonzon@isciii.es"}, + {name = "Luis Chapado", email = "lchapado@externos.isciii.es"}, + {name = "Isabel Cuesta", email = "isabel.cuesta@isciii.es"}, + {name = "Sarai Varona", email = "s.varona@isciii.es"}, + {name = "Daniel Valle", email = "da.valle@externos.isciii.es"}, + {name = "Pablo Mata", email = "pmata@externos.isciii.es"}, + {name = "Victor Lopez", email = "vmlopez@isciii.es"}, + {name = "Emi Arjona", email = "eiarjona@isciii.es"}, + {name = "Jaime Ozaez", email = "jaime.ozaez@isciii.es"}, + {name = "Juan Ledesma", email = "juan.ledesma@isciii.es"}, + {name = "Sergio Olmos", email = "sergio.olmos@externos.isciii.es"}, + {name = "Alejandro Bernabeu", email = "abernabeu@isciii.es"}, + {name = "Alba Talavera", email = "alba.talavera@externos.isciii.es"} +] + +description = "Tools for managing and resolution of buisciii services." 
+readme = "README.md" +license = {file = "GNU GENERAL PUBLIC LICENSE v.3"} +keywords = [ + "buisciii", + "bioinformatics", + "pipeline", + "sequencing", + "NGS", + "next generation sequencing" +] + +[project.urls] +Homepage = "https://github.com/BU-ISCIII/buisciii-tools" +Issues = "https://github.com/BU-ISCIII/buisciii-tools/issues" + +[tool.setuptools.dynamic] +dependencies = {file = ["requirements.txt"]} + +[project.scripts] +bu_isciii = "bu_isciii.__main__:run_bu_isciii" + +[tool.setuptools.packages.find] +exclude = ["docs"] + +[tool.setuptools] +include-package-data = true +zip-safe = false diff --git a/setup.py b/setup.py deleted file mode 100755 index 483ba2e5c..000000000 --- a/setup.py +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env python - -from setuptools import setup, find_packages - -version = "2.2.4" - -with open("README.md") as f: - readme = f.read() - -with open("requirements.txt") as f: - required = f.read().splitlines() - -setup( - name="bu-isciii", - version=version, - description="Tools for managing and resolution of buisciii services.", - long_description=readme, - long_description_content_type="text/markdown", - keywords=[ - "buisciii", - "bioinformatics", - "pipeline", - "sequencing", - "NGS", - "next generation sequencing", - ], - author="Sara Monzon", - author_email="smonzon@isciii.es", - url="https://github.com/BU-ISCIII/buisciii-tools", - license="GNU GENERAL PUBLIC LICENSE v.3", - entry_points={"console_scripts": ["bu-isciii=bu_isciii.__main__:run_bu_isciii"]}, - install_requires=required, - packages=find_packages(exclude=("docs")), - include_package_data=True, - zip_safe=False, -) From 839852f1c185178b43df6a4b71d05a5e279a6e12 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:30:48 +0100 Subject: [PATCH 03/25] Changed mag.md by taxprofiler.md --- .../reports/md/{mag.md => taxprofiler.md} | 753 ------------------ 1 file changed, 753 deletions(-) rename bu_isciii/assets/reports/md/{mag.md => taxprofiler.md} (50%) diff --git a/bu_isciii/assets/reports/md/mag.md b/bu_isciii/assets/reports/md/taxprofiler.md similarity index 50% rename from bu_isciii/assets/reports/md/mag.md rename to bu_isciii/assets/reports/md/taxprofiler.md index 67ef902a2..6ba22265f 100644 --- a/bu_isciii/assets/reports/md/mag.md +++ b/bu_isciii/assets/reports/md/taxprofiler.md @@ -1,756 +1,3 @@ -# nf-core/mag: Output - -## Introduction - -This document describes the output produced by the pipeline. Most of the plots are taken from the MultiQC report, which summarises results at the end of the pipeline. - -The directories listed below will be created in the results directory after the pipeline has finished. All paths are relative to the top-level results directory. 
-
-## Pipeline overview
-
-The pipeline is built using [Nextflow](https://www.nextflow.io/) and processes data using the following steps:
-
-- [Quality control](#quality-control) of input reads - trimming and contaminant removal
-- [Taxonomic classification of trimmed reads](#taxonomic-classification-of-trimmed-reads)
-- [Digital sequencing normalisation](#digital-normalization-with-bbnorm)
-- [Assembly](#assembly) of trimmed reads
-- [Protein-coding gene prediction](#gene-prediction) of assemblies
-- [Virus identification](#virus-identification-in-assemblies) of assemblies
-- [Binning and binning refinement](#binning-and-binning-refinement) of assembled contigs
-- [Taxonomic classification of binned genomes](#taxonomic-classification-of-binned-genomes)
-- [Genome annotation of binned genomes](#genome-annotation-of-binned-genomes)
-- [Additional summary for binned genomes](#additional-summary-for-binned-genomes)
-- [Ancient DNA](#ancient-dna)
-- [MultiQC](#multiqc) - aggregate report, describing results of the whole pipeline
-- [Pipeline information](#pipeline-information) - Report metrics generated during the workflow execution
-
-Note that when specifying the parameter `--coassemble_group`, for the corresponding output filenames/directories of the assembly or downstream processes the group ID, or more precisely the term `group-[group_id]`, will be used instead of the sample ID.
-
-## Quality control
-
-These steps trim away the adapter sequences present in the input reads, trim away bad-quality bases and discard reads that are too short.
-They also remove host contaminants and sequencing controls, such as PhiX or the Lambda phage.
-FastQC is run for visualising the general quality metrics of the sequencing runs before and after trimming.
-
-
-
-### FastQC
-
-Output files - -- `QC_shortreads/fastqc/` - - `[sample]_[1/2]_fastqc.html`: FastQC report, containing quality metrics for your untrimmed raw fastq files - - `[sample].trimmed_[1/2]_fastqc.html`: FastQC report, containing quality metrics for trimmed and, if specified, filtered read files - -
-
-[FastQC](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/) gives general quality metrics about your sequenced reads. It provides information about the quality score distribution across your reads, per base sequence content (%A/T/G/C), adapter contamination and overrepresented sequences. For further reading and documentation see the [FastQC help pages](http://www.bioinformatics.babraham.ac.uk/projects/fastqc/Help/).
-
-### fastp
-
-[fastp](https://github.com/OpenGene/fastp) is an all-in-one FASTQ preprocessor for read/adapter trimming and quality control. It is used in this pipeline for trimming adapter sequences and discarding low-quality reads. Its output is in the results folder and part of the MultiQC report.
-
-
-Output files - -- `QC_shortreads/fastp/[sample]/` - - `fastp.html`: Interactive report - - `fastp.json`: Report in json format - -
- -### AdapterRemoval2 - -[AdapterRemoval](https://adapterremoval.readthedocs.io/en/stable/) searches for and removes remnant adapter sequences from High-Throughput Sequencing (HTS) data and (optionally) trims low quality bases from the 3' end of reads following adapter removal. It is popular in the field of palaeogenomics. The output logs are stored in the results folder, and as a part of the MultiQC report. - -
-Output files - -- `QC_shortreads/adapterremoval/[sample]/` - - `[sample]_ar2.settings`: AdapterRemoval log file. - -
- -### Remove PhiX sequences from short reads - -The pipeline uses bowtie2 to map the reads against PhiX and removes mapped reads. - -
-Output files - -- `QC_shortreads/remove_phix/` - - `[sample].phix_removed.bowtie2.log`: Contains a brief log file indicating how many reads have been retained. - -
- -### Host read removal - -The pipeline uses bowtie2 to map short reads against the host reference genome specified with `--host_genome` or `--host_fasta` and removes mapped reads. The information about discarded and retained reads is also included in the MultiQC report. - -
-Output files - -- `QC_shortreads/remove_host/` - - `[sample].host_removed.bowtie2.log`: Contains the bowtie2 log file indicating how many reads have been mapped. - - `[sample].host_removed.mapped*.read_ids.txt`: Contains a file listing the read ids of discarded reads. - -
- -### Remove Phage Lambda sequences from long reads - -The pipeline uses Nanolyse to map the reads against the Lambda phage and removes mapped reads. - -
-Output files - -- `QC_longreads/NanoLyse/` - - `[sample]_nanolyse.log`: Contains a brief log file indicating how many reads have been retained. - -
- -### Filtlong and porechop - -The pipeline uses filtlong and porechop to perform quality control of the long reads that are eventually provided with the TSV input file. - -No direct host read removal is performed for long reads. -However, since within this pipeline filtlong uses a read quality based on k-mer matches to the already filtered short reads, reads not overlapping those short reads might be discarded. -The lower the parameter `--longreads_length_weight`, the higher the impact of the read qualities for filtering. -For further documentation see the [filtlong online documentation](https://github.com/rrwick/Filtlong). - -### Quality visualisation for long reads - -NanoPlot is used to calculate various metrics and plots about the quality and length distribution of long reads. For more information about NanoPlot see the [online documentation](https://github.com/wdecoster/NanoPlot). - -
-Output files - -- `QC_longreads/NanoPlot/[sample]/` - - `raw_*.[png/html/txt]`: Plots and reports for raw data - - `filtered_*.[png/html/txt]`: Plots and reports for filtered data - -
-
-## Digital normalization with BBnorm
-
-If the pipeline is called with the `--bbnorm` option, it will normalize the sequencing depth of libraries prior to assembly by removing reads to 1) reduce coverage of very abundant kmers and 2) delete very rare kmers (see the `--bbnorm_target` and `--bbnorm_min` parameters).
-When called in conjunction with `--coassemble_group`, BBnorm will operate on interleaved (merged) FastQ files, producing only a single output file.
-If the `--save_bbnorm_reads` parameter is set, the resulting FastQ files are saved together with log output.
-
-
-Output files - -- `bbmap/bbnorm/[sample]\*.fastq.gz` -- `bbmap/bbnorm/log/[sample].bbnorm.log` - -
-
-## Taxonomic classification of trimmed reads
-
-### Kraken2
-
-Kraken2 classifies reads using a k-mer based approach and assigns taxonomy using a Lowest Common Ancestor (LCA) algorithm.
-
-
-Output files - -- `Taxonomy/kraken2/[sample]/` - - `kraken2.report`: Classification in the Kraken report format. See the [kraken2 manual](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats) for more details - - `taxonomy.krona.html`: Interactive pie chart produced by [KronaTools](https://github.com/marbl/Krona/wiki) - -
-
-### Centrifuge
-
-Centrifuge is commonly used for the classification of DNA sequences from microbial samples. It uses an indexing scheme based on the Burrows-Wheeler transform (BWT) and the Ferragina-Manzini (FM) index.
-
-More information is available on the [Centrifuge](https://ccb.jhu.edu/software/centrifuge/) website.
-
-
-Output files - -- `Taxonomy/centrifuge/[sample]/` - - `report.txt`: Tab-delimited result file. See the [centrifuge manual](https://ccb.jhu.edu/software/centrifuge/manual.shtml#centrifuge-classification-output) for information about the fields - - `kreport.txt`: Classification in the Kraken report format. See the [kraken2 manual](https://github.com/DerrickWood/kraken2/wiki/Manual#output-formats) for more details - - `taxonomy.krona.html`: Interactive pie chart produced by [KronaTools](https://github.com/marbl/Krona/wiki) - -
- -## Assembly - -Trimmed (short) reads are assembled with both megahit and SPAdes. Hybrid assembly is only supported by SPAdes. - -### MEGAHIT - -[MEGAHIT](https://github.com/voutcn/megahit) is a single node assembler for large and complex metagenomics short reads. - -
-Output files - -- `Assembly/MEGAHIT/` - - `[sample/group].contigs.fa.gz`: Compressed metagenome assembly in fasta format - - `[sample/group].log`: Log file - - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs - - `MEGAHIT-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - - `MEGAHIT-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). - - `MEGAHIT-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. - -
- -### SPAdes - -[SPAdes](http://cab.spbu.ru/software/spades/) was originally a single genome assembler that later added support for assembling metagenomes. - -
-Output files - -- `Assembly/SPAdes/` - - `[sample/group]_scaffolds.fasta.gz`: Compressed assembled scaffolds in fasta format - - `[sample/group]_graph.gfa.gz`: Compressed assembly graph in gfa format - - `[sample/group]_contigs.fasta.gz`: Compressed assembled contigs in fasta format - - `[sample/group].log`: Log file - - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs - - `SPAdes-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - - `SPAdes-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). - - `SPAdes-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. - -
- -### SPAdesHybrid - -SPAdesHybrid is a part of the [SPAdes](http://cab.spbu.ru/software/spades/) software and is used when the user provides both long and short reads. - -
-Output files - -- `Assembly/SPAdesHybrid/` - - `[sample/group]_scaffolds.fasta.gz`: Compressed assembled scaffolds in fasta format - - `[sample/group]_graph.gfa.gz`: Compressed assembly graph in gfa format - - `[sample/group]_contigs.fasta.gz`: Compressed assembled contigs in fasta format - - `[sample/group].log`: Log file - - `QC/[sample/group]/`: Directory containing QUAST files and Bowtie2 mapping logs - - `SPAdesHybrid-[sample].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the sample that the metagenome was assembled from, only present if `--coassemble_group` is not set. - - `SPAdesHybrid-[sample/group]-[sampleToMap].bowtie2.log`: Bowtie2 log file indicating how many reads have been mapped from the respective sample ("sampleToMap"). - - `SPAdesHybrid-[sample].[bam/bai]`: Optionally saved BAM file of the Bowtie2 mapping of reads against the assembly. - -
- -### Metagenome QC with QUAST - -[QUAST](http://cab.spbu.ru/software/quast/) is a tool that evaluates metagenome assemblies by computing various metrics. The QUAST output is also included in the MultiQC report, as well as in the assembly directories themselves. - -
-Output files - -- `Assembly/[assembler]/QC/[sample/group]/QUAST/` - - `report.*`: QUAST report in various formats, such as html, pdf, tex, tsv, or txt - - `transposed_report.*`: QUAST report that has been transposed into wide format (tex, tsv, or txt) - - `quast.log`: QUAST log file - - `metaquast.log`: MetaQUAST log file - - `icarus.html`: Icarus main menu with links to interactive viewers - - `icarus_viewers/contig_size_viewer.html`: Diagram of contigs that are ordered from longest to shortest - - `basic_stats/cumulative_plot.pdf`: Shows the growth of contig lengths (contigs are ordered from largest to shortest) - - `basic_stats/GC_content_plot.pdf`: Shows the distribution of GC content in the contigs - - `basic_stats/[assembler]-[sample/group]_GC_content_plot.pdf`: Histogram of the GC percentage for the contigs - - `basic_stats/Nx_plot.pdf`: Plot of Nx values as x varies from 0 to 100%. - - `predicted_genes/[assembler]-[sample/group].rna.gff`: Contig positions for rRNA genes in gff version 3 format - - `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor) - -
- -## Gene prediction - -Protein-coding genes are predicted for each assembly. - -
-Output files - -- `Annotation/Prodigal/` - - `[assembler]-[sample/group].gff.gz`: Gene Coordinates in GFF format - - `[assembler]-[sample/group].faa.gz`: The protein translation file consists of all the proteins from all the sequences in multiple FASTA format. - - `[assembler]-[sample/group].fna.gz`: Nucleotide sequences of the predicted proteins using the DNA alphabet, not mRNA (so you will see 'T' in the output and not 'U'). - - `[assembler]-[sample/group]_all.txt.gz`: Information about start positions of genes. - -
-
-## Virus identification in assemblies
-
-### geNomad
-
-[geNomad](https://github.com/apcamargo/genomad) identifies viruses and plasmids in sequencing data (isolates, metagenomes, and metatranscriptomes).
-
-
-Output files - -- `VirusIdentification/geNomad/[assembler]-[sample/group]*/` - - `[assembler]-[sample/group]*_annotate` - - `[assembler]-[sample/group]*_taxonomy.tsv`: Taxonomic assignment data - - `[assembler]-[sample/group]*_aggregated_classification` - - `[assembler]-[sample/group]*_aggregated_classification.tsv`: Sequence classification in tabular format - - `[assembler]-[sample/group]*_find_proviruses` - - `[assembler]-[sample/group]*_provirus.tsv`: Characteristics of proviruses identified by geNomad - - `[assembler]-[sample/group]*_summary` - - `[assembler]-[sample/group]*_virus_summary.tsv`: Virus classification summary file in tabular format - - `[assembler]-[sample/group]*_plasmid_summary.tsv`: Plasmid classification summary file in tabular format - - `[assembler]-[sample/group]*_viruses_genes.tsv`: Virus gene annotation data in tabular format - - `[assembler]-[sample/group]*_plasmids_genes.tsv`: Plasmid gene annotation data in tabular format - - `[assembler]-[sample/group]*_viruses.fna`: Virus nucleotide sequences in FASTA format - - `[assembler]-[sample/group]*_plasmids.fna`: Plasmid nucleotide sequences in FASTA format - - `[assembler]-[sample/group]*_viruses_proteins.faa`: Virus protein sequences in FASTA format - - `[assembler]-[sample/group]*_plasmids_proteins.faa`: Plasmid protein sequences in FASTA format - - `[assembler]-[sample/group]*.log`: Plain text log file detailing the steps executed by geNomad (annotate, find-proviruses, marker-classification, nn-classification, aggregated-classification and summary) - -
- -## Binning and binning refinement - -### Contig sequencing depth - -Sequencing depth per contig and sample is generated by MetaBAT2's `jgi_summarize_bam_contig_depths --outputDepth`. The values correspond to `(sum of exactly aligned bases) / ((contig length)-2*75)`. For example, for two reads aligned exactly with `10` and `9` bases on a 1000 bp long contig the depth is calculated by `(10+9)/(1000-2*75)` (1000bp length of contig minus 75bp from each end, which is excluded). - -These depth files are used for downstream binning steps. - -
-Output files - -- `GenomeBinning/depths/contigs/` - - `[assembler]-[sample/group]-depth.txt.gz`: Sequencing depth for each contig and sample or group, only for short reads. - -
-
-### MetaBAT2
-
-[MetaBAT2](https://bitbucket.org/berkeleylab/metabat) recovers genome bins (that is, contigs/scaffolds that all belong to the same organism) from metagenome assemblies.
-
-
-Output files - -- `GenomeBinning/MetaBAT2/` - - `bins/[assembler]-[binner]-[sample/group].*.fa.gz`: Genome bins retrieved from input assembly - - `unbinned/[assembler]-[binner]-[sample/group].unbinned.[1-9]*.fa.gz`: Contigs that were not binned with other contigs but considered interesting. By default, these are at least 1 Mbp (`--min_length_unbinned_contigs`) in length and at most the 100 longest contigs (`--max_unbinned_contigs`) are reported - -
- -All the files and contigs in these folders will be assessed by QUAST and BUSCO. - -All other files that were discarded by the tool, or from the low-quality unbinned contigs, can be found here. - -
-Output files - -- `GenomeBinning/MetaBAT2/discarded/` - - `*.lowDepth.fa.gz`: Low depth contigs that are filtered by MetaBAT2 - - `*.tooShort.fa.gz`: Too short contigs that are filtered by MetaBAT2 -- `GenomeBinning/MetaBAT2/unbinned/discarded/` - - `*.unbinned.pooled.fa.gz`: Pooled unbinned contigs equal or above `--min_contig_size`, by default 1500 bp. - - `*.unbinned.remaining.fa.gz`: Remaining unbinned contigs below `--min_contig_size`, by default 1500 bp, but not in any other file. - -
-
-All the files in this folder contain small and/or unbinned contigs that are not further processed.
-
-Files in these two folders contain all contigs of an assembly.
-
-### MaxBin2
-
-[MaxBin2](https://sourceforge.net/projects/maxbin2/) recovers genome bins (that is, contigs/scaffolds that all belong to the same organism) from metagenome assemblies.
-
-
-Output files - -- `GenomeBinning/MaxBin2/` - - `bins/[assembler]-[binner]-[sample/group].*.fa.gz`: Genome bins retrieved from input assembly - - `unbinned/[assembler]-[binner]-[sample/group].noclass.[1-9]*.fa.gz`: Contigs that were not binned with other contigs but considered interesting. By default, these are at least 1 Mbp (`--min_length_unbinned_contigs`) in length and at most the 100 longest contigs (`--max_unbinned_contigs`) are reported. - -
- -All the files and contigs in these folders will be assessed by QUAST and BUSCO. - -
-Output files - -- `GenomeBinning/MaxBin2/discarded/` - - `*.tooshort.gz`: Too short contigs that are filtered by MaxBin2 -- `GenomeBinning/MaxBin2/unbinned/discarded/` - - `*.noclass.pooled.fa.gz`: Pooled unbinned contigs equal or above `--min_contig_size`, by default 1500 bp. - - `*.noclass.remaining.fa.gz`: Remaining unbinned contigs below `--min_contig_size`, by default 1500 bp, but not in any other file. - -
- -All the files in this folder contain small and/or unbinned contigs that are not further processed. - -Files in these two folders contain all contigs of an assembly. - -### CONCOCT - -[CONCOCT](https://github.com/BinPro/CONCOCT) performs unsupervised binning of metagenomic contigs by using nucleotide composition, coverage data in multiple samples and linkage data from paired end reads. - -
-Output files
-
-- `GenomeBinning/CONCOCT/`
-  - `bins/[assembler]-[binner]-[sample/group].*.fa.gz`: Genome bins retrieved from input assembly
-  - `stats/[assembler]-[binner]-[sample/group].csv`: Table indicating which contig goes with which cluster bin.
-  - `stats/[assembler]-[binner]-[sample/group]*_gt1000.csv`: Various intermediate PCA statistics used for clustering.
-  - `stats/[assembler]-[binner]-[sample/group]_*.tsv`: Coverage statistics of each sub-contig cut up by CONCOCT in an intermediate step prior to binning. Likely not useful in most cases.
-  - `stats/[assembler]-[binner]-[sample/group].log.txt`: CONCOCT execution log file.
-  - `stats/[assembler]-[binner]-[sample/group]_*.args`: List of arguments used in CONCOCT execution.
-
-
-
-All the files and contigs in these folders will be assessed by QUAST and BUSCO, if the parameter `--postbinning_input` is not set to `refined_bins_only`.
-
-Note that CONCOCT does not output what it considers 'unbinned' contigs; therefore, no 'discarded' contigs are produced here. You may still need to do your own manual curation of the resulting bins.
-
-### DAS Tool
-
-[DAS Tool](https://github.com/cmks/DAS_Tool) is an automated binning refinement method that integrates the results of a flexible number of binning algorithms to calculate an optimized, non-redundant set of bins from a single assembly. nf-core/mag uses this tool to attempt to further improve bins based on combining the MetaBAT2 and MaxBin2 binning output, assuming sufficient quality is met for those bins.
-
-DAS Tool will remove contigs from bins that do not pass additional filtering criteria, and will discard redundant lower-quality output from binners that represent the same estimated 'organism', until the single highest quality bin is represented.
-
-> ⚠️ If DAS Tool does not find any bins passing your selected threshold it will exit with an error. Such an error is 'ignored' by nf-core/mag; therefore, you will not find files in the `GenomeBinning/DASTool/` results directory for that particular sample.
-
-
-Output files - -- `GenomeBinning/DASTool/` - - `[assembler]-[sample/group]_allBins.eval`: Tab-delimited description with quality and completeness metrics for the input bin sets. Quality and completeness are estimated by DAS TOOL using a scoring function based on the frequency of bacterial or archaeal reference single-copy genes (SCG). Please see note at the bottom of this section on file names. - - `[assembler]-[sample/group]_DASTool_summary.tsv`: Tab-delimited description with quality and completeness metrics for the refined output bin sets. - - `[assembler]-[sample/group]_DASTool_contig2bin.tsv`: File describing which contig is associated to which bin from the input binners. - - `[assembler]-[sample/group]_DASTool.log`: Log file from the DAS Tool run describing the command executed and additional runtime information. - - `[assembler]-[sample/group].seqlength`: Tab-delimited file describing the length of each contig. - - `bins/[assembler]-[binner]Refined-[sample/group].*.fa`: Refined bins in fasta format. - - `unbinned/[assembler]-DASToolUnbinned-[sample/group].*.fa`: Unbinned contigs from bin refinement in fasta format. - -
-
-By default, only the raw bins (and unbinned contigs) from the actual binning methods, but not from the binning refinement with DAS Tool, will be used for downstream bin quality control, annotation and taxonomic classification. The parameter `--postbinning_input` can be used to change this behaviour.
-
-⚠️ Due to the ability to perform downstream QC of both raw and refined bins in parallel (via `--postbinning_input`), bin names in DAS Tool's `*_allBins.eval` file will include `Refined`. However, for this particular file, they _actually_ refer to the 'raw' input bins. The pipeline renames the input files prior to running DAS Tool to ensure they can be disambiguated from the original bin files in the downstream QC steps.
-
-### Tiara
-
-Tiara is a contig classifier that identifies the domain (prokarya, eukarya) of contigs within an assembly. This is used in this pipeline to rapidly and with few resources identify the most likely domain classification of each bin or unbin based on its contig identities.
-
-
-Output files - -- `Taxonomy/Tiara/` - - `[assembler]-[sample/group].tiara.txt` - Tiara output classifications (with probabilities) for all contigs within the specified sample/group assembly - - `log_[assembler]-[sample/group].txt` - log file detailing the parameters used by the Tiara model for contig classification. -- `GenomeBinning/tiara_summary.tsv` - Summary of Tiara domain classification for all bins. - -
- -Typically, you would use `tiara_summary.tsv` as the primary file to see which bins or unbins have been classified to which domains at a glance, whereas the files in `Taxonomy/Tiara` provide classifications for each contig. - -### Bin sequencing depth - -For each bin or refined bin the median sequencing depth is computed based on the corresponding contig depths. - -
-Output files - -- `GenomeBinning/depths/bins/` - - `bin_depths_summary.tsv`: Summary of bin sequencing depths for all samples. Depths are available for samples mapped against the corresponding assembly, i.e. according to the mapping strategy specified with `--binning_map_mode`. Only for short reads. - - `bin_refined_depths_summary.tsv`: Summary of sequencing depths for refined bins for all samples, if refinement was performed. Depths are available for samples mapped against the corresponding assembly, i.e. according to the mapping strategy specified with `--binning_map_mode`. Only for short reads. - - `[assembler]-[binner]-[sample/group]-binDepths.heatmap.png`: Clustered heatmap showing bin abundances of the assembly across samples. Bin depths are transformed to centered log-ratios and bins as well as samples are clustered by Euclidean distance. Again, sample depths are available according to the mapping strategy specified with `--binning_map_mode`. - -
- -### QC for metagenome assembled genomes with QUAST - -[QUAST](http://cab.spbu.ru/software/quast/) is a tool that evaluates genome assemblies by computing various metrics. The QUAST output is in the bin directories shown below. This QUAST output is not shown in the MultiQC report. - -
-Output files - -- `GenomeBinning/QC/QUAST/[assembler]-[bin]/` - - `report.*`: QUAST report in various formats, such as html, pdf, tex, tsv, or txt - - `transposed_report.*`: QUAST report that has been transposed into wide format (tex, tsv, or txt) - - `quast.log`: QUAST log file - - `metaquast.log`: MetaQUAST log file - - `icarus.html`: Icarus main menu with links to interactive viewers - - `icarus_viewers/contig_size_viewer.html`: Diagram of contigs that are ordered from longest to shortest - - `basic_stats/cumulative_plot.pdf`: Shows the growth of contig lengths (contigs are ordered from largest to shortest) - - `basic_stats/GC_content_plot.pdf`: Shows the distribution of GC content in the contigs - - `basic_stats/[assembler]-[bin]_GC_content_plot.pdf`: Histogram of the GC percentage for the contigs - - `basic_stats/Nx_plot.pdf`: Plot of Nx values as x varies from 0 to 100%. - - `predicted_genes/[assembler]-[bin].rna.gff`: Contig positions for rRNA genes in gff version 3 format - - `predicted_genes/barrnap.log`: Barrnap log file (ribosomal RNA predictor) -- `GenomeBinning/QC/` - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]-quast_summary.tsv`: QUAST output summarized per sample/condition. - - `quast_summary.tsv`: QUAST output for all bins summarized - -
- -### QC for metagenome assembled genomes - -#### BUSCO - -[BUSCO](https://busco.ezlab.org/) is a tool used to assess the completeness of a genome assembly. It is run on all the genome bins and high quality contigs obtained by the applied binning and/or binning refinement methods (depending on the `--postbinning_input` parameter). By default, BUSCO is run in automated lineage selection mode in which it first tries to select the domain and then a more specific lineage based on phylogenetic placement. If available, result files for both the selected domain lineage and the selected more specific lineage are placed in the output directory. If a lineage dataset is specified already with `--busco_db`, only results for this specific lineage will be generated. - -
-Output files - -- `GenomeBinning/QC/BUSCO/` - - `[assembler]-[bin]_busco.log`: Log file containing the standard output of BUSCO. - - `[assembler]-[bin]_busco.err`: File containing potential error messages returned from BUSCO. - - `short_summary.domain.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results for the selected domain when run in automated lineage selection mode. Not available for bins for which a viral lineage was selected. - - `short_summary.specific_lineage.[lineage].[assembler]-[bin].txt`: BUSCO summary of the results in case a more specific lineage than the domain could be selected or for the lineage provided via `--busco_db`. - - `[assembler]-[bin]_buscos.[lineage].fna.gz`: Nucleotide sequence of all identified BUSCOs for used lineages (domain or specific). - - `[assembler]-[bin]_buscos.[lineage].faa.gz`: Aminoacid sequence of all identified BUSCOs for used lineages (domain or specific). - - `[assembler]-[bin]_prodigal.gff`: Genes predicted with Prodigal. - -
- -If the parameter `--save_busco_db` is set, additionally the used BUSCO lineage datasets are stored in the output directory. - -
-Output files - -- `GenomeBinning/QC/BUSCO/` - - `busco_downloads/`: All files and lineage datasets downloaded by BUSCO when run in automated lineage selection mode. (Can currently not be used to reproduce analysis, see the [nf-core/mag website documentation](https://nf-co.re/mag/usage#reproducibility) how to achieve reproducible BUSCO results). - - `reference/*.tar.gz`: BUSCO reference lineage dataset that was provided via `--busco_db`. - -
- -Besides the reference files or output files created by BUSCO, the following summary files will be generated: - -
-Output files - -- `GenomeBinning/QC/` - - `busco_summary.tsv`: A summary table of the BUSCO results, with % of marker genes found. If run in automated lineage selection mode, both the results for the selected domain and for the selected more specific lineage will be given, if available. - -
-
-#### CheckM
-
-[CheckM](https://ecogenomics.github.io/CheckM/) provides a set of tools for assessing the quality of genomes recovered from isolates, single cells, or metagenomes. It provides robust estimates of genome completeness and contamination by using collocated sets of genes that are ubiquitous and single-copy within a phylogenetic lineage.
-
-By default, nf-core/mag runs CheckM with the `check_lineage` workflow, which places genome bins on a reference tree to define lineage-marker sets and to check for completeness and contamination based on lineage-specific marker genes, and then subsequently runs `qa` to generate the summary files.
-
-
-Output files
-
-- `GenomeBinning/QC/CheckM/`
-  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_qa.txt`: Detailed statistics about bins informing completeness and contamination scores (output of `checkm qa`). This should normally be your main file to use to evaluate your results.
-  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]_wf.tsv`: Overall summary file for completeness and contamination (output of `checkm lineage_wf`).
-  - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/`: intermediate files for CheckM results, including CheckM generated annotations, log, lineage markers etc.
-  - `checkm_summary.tsv`: A summary table of the CheckM results for all bins (output of `checkm qa`).
-
-
-
-If the parameter `--save_checkm_reference` is set, additionally the used CheckM reference datasets are stored in the output directory.
-
-
-Output files - -- `GenomeBinning/QC/CheckM/` - - `checkm_downloads/`: All CheckM reference files downloaded from the CheckM FTP server, when not supplied by the user. - - `checkm_data_2015_01_16/*`: a range of directories and files required for CheckM to run. - -
-
-#### GUNC
-
-[Genome UNClutterer (GUNC)](https://grp-bork.embl-community.io/gunc/index.html) is a tool for the detection of chimerism and contamination in prokaryotic genomes resulting from mis-binning of genomic contigs from unrelated lineages. It does so by applying an entropy-based score on the taxonomic assignment and contig location of all genes in a genome. It is generally considered an additional complement to CheckM results.
-
-
-Output files - -- `GenomeBinning/QC/gunc_summary.tsv` -- `GenomeBinning/QC/gunc_checkm_summary.tsv` -- `[gunc-database].dmnd` -- `GUNC/` - - `raw/` - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/GUNC_checkM.merged.tsv`: Per sample GUNC [output](https://grp-bork.embl-community.io/gunc/output.html) containing with taxonomic and completeness QC statistics. - - `checkmmerged/` - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group]/GUNC.progenomes_2.1.maxCSS_level.tsv`: Per sample GUNC output merged with output from [CheckM](#checkm) - -
-
-GUNC will be run if specified with `--run_gunc` as a standalone, unless CheckM is also activated via `--qc_tool 'checkm'`, in which case GUNC output will be merged with the CheckM output using `gunc merge_checkm`.
-
-If `--gunc_save_db` is specified, the output directory will also contain the requested database (progenomes or GTDB) in DIAMOND format.
-
-## Taxonomic classification of binned genomes
-
-### CAT
-
-[CAT](https://github.com/dutilh/CAT) is a toolkit for annotating contigs and bins from metagenome-assembled genomes. The nf-core/mag pipeline uses CAT to assign taxonomy to genome bins based on the taxonomy of the contigs.
-
-
-Output files - -- `Taxonomy/CAT/[assembler]/[binner]/` - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.names.txt.gz`: Tab-delimited files containing the lineage of each contig, with full lineage names - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.names.txt.gz`: Taxonomy classification of the genome bins, with full lineage names -- `Taxonomy/CAT/[assembler]/[binner]/raw/` - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.faa.gz`: Predicted protein sequences for each genome bin, in fasta format - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].concatenated.predicted_proteins.gff.gz`: Predicted protein features for each genome bin, in gff format - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].ORF2LCA.txt.gz`: Tab-delimited files containing the lineage of each contig - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].bin2classification.txt.gz`: Taxonomy classification of the genome bins - - `[assembler]-[binner]-[domain]-[refinement]-[sample/group].log`: Log files - -
- -If the parameters `--cat_db_generate` and `--save_cat_db` are set, additionally the generated CAT database is stored: - -
-Output files - -- `Taxonomy/CAT/CAT_prepare_*.tar.gz`: Generated and used CAT database. - -
- -### GTDB-Tk - -[GTDB-Tk](https://github.com/Ecogenomics/GTDBTk) is a toolkit for assigning taxonomic classifications to bacterial and archaeal genomes based on the Genome Database Taxonomy [GTDB](https://gtdb.ecogenomic.org/). nf-core/mag uses GTDB-Tk to classify binned genomes which satisfy certain quality criteria (i.e. completeness and contamination assessed with the BUSCO analysis). - -
-Output files - -- `Taxonomy/GTDB-Tk/[assembler]/[binner]/[sample/group]/` - - `gtdbtk.[assembler]-[binner]-[sample/group].{bac120/ar122}.summary.tsv`: Classifications for bacterial and archaeal genomes (see the [GTDB-Tk documentation for details](https://ecogenomics.github.io/GTDBTk/files/summary.tsv.html)). - - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.classify.tree.gz`: Reference tree in Newick format containing query genomes placed with pplacer. - - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.markers_summary.tsv`: A summary of unique, duplicated, and missing markers within the 120 bacterial marker set, or the 122 archaeal marker set for each submitted genome. - - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.msa.fasta.gz`: FASTA file containing MSA of submitted and reference genomes. - - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].{bac120/ar122}.filtered.tsv`: A list of genomes with an insufficient number of amino acids in MSA. - - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].*.log`: Log files. - - `gtdbtk.[assembler]-[binner]-[domain]-[refinement]-[sample/group].failed_genomes.tsv`: A list of genomes for which the GTDB-Tk analysis failed, e.g. because Prodigal could not detect any genes. -- `Taxonomy/GTDB-Tk/gtdbtk_summary.tsv`: A summary table of the GTDB-Tk classification results for all bins, also containing bins which were discarded based on the BUSCO QC, which were filtered out by GTDB-Tk (listed in `*.filtered.tsv`) or for which the analysis failed (listed in `*.failed_genomes.tsv`). - -
- -## Genome annotation of binned genomes - -### Prokka - -Whole genome annotation is the process of identifying features of interest in a set of genomic DNA sequences, and labelling them with useful information. [Prokka](https://github.com/tseemann/prokka) is a software tool to annotate bacterial, archaeal and viral genomes quickly and produce standards-compliant output files. - -
-Output files - -- `Annotation/Prokka/[assembler]/[bin]/` - - `[assembler]-[binner]-[bin].gff`: annotation in GFF3 format, containing both sequences and annotations - - `[assembler]-[binner]-[bin].gbk`: annotation in GenBank format, containing both sequences and annotations - - `[assembler]-[binner]-[bin].fna`: nucleotide FASTA file of the input contig sequences - - `[assembler]-[binner]-[bin].faa`: protein FASTA file of the translated CDS sequences - - `[assembler]-[binner]-[bin].ffn`: nucleotide FASTA file of all the prediction transcripts (CDS, rRNA, tRNA, tmRNA, misc_RNA) - - `[assembler]-[binner]-[bin].sqn`: an ASN1 format "Sequin" file for submission to Genbank - - `[assembler]-[binner]-[bin].fsa`: nucleotide FASTA file of the input contig sequences, used by "tbl2asn" to create the .sqn file - - `[assembler]-[binner]-[bin].tbl`: feature Table file, used by "tbl2asn" to create the .sqn file - - `[assembler]-[binner]-[bin].err`: unacceptable annotations - the NCBI discrepancy report. - - `[assembler]-[binner]-[bin].log`: contains all the output that Prokka produced during its run - - `[assembler]-[binner]-[bin].txt`: statistics relating to the annotated features found - - `[assembler]-[binner]-[bin].tsv`: tab-separated file of all features (locus_tag, ftype, len_bp, gene, EC_number, COG, product) - -
- -### MetaEuk - -In cases where eukaryotic genomes are recovered in binning, [MetaEuk](https://github.com/soedinglab/metaeuk) is also available to annotate eukaryotic genomes quickly with standards-compliant output files. - -
-Output files - -- `Annotation/MetaEuk/[assembler]/[bin]` - - `[assembler]-[binner]-[bin].fas`: fasta file of protein sequences identified by MetaEuk - - `[assembler]-[binner]-[bin].codon.fas`: fasta file of nucleotide sequences corresponding to the protein sequences fasta - - `[assembler]-[binner]-[bin].headersMap.tsv`: tab-separated table containing the information from each header in the fasta files - - `[assembler]-[binner]-[bin].gff`: annotation in GFF3 format - -
- -## Additional summary for binned genomes - -
-Output files
-
-- `GenomeBinning/bin_summary.tsv`: Summary of bin sequencing depths together with BUSCO, CheckM, QUAST and GTDB-Tk results, if at least one of the latter was generated. This will also include refined bins if `--refine_bins_dastool` binning refinement is performed. Note that in contrast to the other tools, for CheckM the bin name given in the column "Bin Id" does not contain the ".fa" extension.
-
-
-
-## Ancient DNA
-
-Optional; only run when the parameter `-profile ancient_dna` is specified.
-
-### `PyDamage`
-
-[Pydamage](https://github.com/maxibor/pydamage) is a tool to automate the process of ancient DNA damage identification and estimation from contigs. After modelling the ancient DNA damage using the C to T transitions, Pydamage uses a likelihood ratio test to discriminate between truly ancient and modern contigs originating from sample contamination.
-
-
-Output files - -- `Ancient_DNA/pydamage/analyze` - - `[assembler]_[sample/group]/pydamage_results/pydamage_results.csv`: PyDamage raw result tabular file in `.csv` format. Format described here: [pydamage.readthedocs.io/en/0.62/output.html](https://pydamage.readthedocs.io/en/0.62/output.html) -- `Ancient_DNA/pydamage/filter` - - `[assembler]_[sample/group]/pydamage_results/pydamage_results.csv`: PyDamage filtered result tabular file in `.csv` format. Format described here: [pydamage.readthedocs.io/en/0.62/output.html](https://pydamage.readthedocs.io/en/0.62/output.html) - -
-
-### `variant_calling`
-
-Because of aDNA damage, _de novo_ assemblers sometimes struggle to call a correct consensus on the contig sequence. To avoid this situation, the consensus is optionally re-called with variant calling software using the reads aligned back to the contigs when `--run_ancient_damagecorrection` is supplied.
-
-
-Output files - -- `variant_calling/consensus` - - `[assembler]_[sample/group].fa`: contigs sequence with re-called consensus from read-to-contig alignment -- `variant_calling/unfiltered` - - `[assembler]_[sample/group].vcf.gz`: raw variant calls of the reads aligned back to the contigs. -- `variant_calling/filtered` - - `[assembler]_[sample/group].filtered.vcf.gz`: quality filtered variant calls of the reads aligned back to the contigs. - -
- -### MultiQC - -
-Output files - -- `multiqc/` - - `multiqc_report.html`: a standalone HTML file that can be viewed in your web browser. - - `multiqc_data/`: directory containing parsed statistics from the different tools used in the pipeline. - - `multiqc_plots/`: directory containing static images from the report in various formats. - -
-
-[MultiQC](http://multiqc.info) is a visualization tool that generates a single HTML report summarising all samples in your project. Most of the pipeline QC results are visualised in the report and further statistics are available in the report data directory.
-
-Results generated by MultiQC collate pipeline QC from supported tools, e.g. FastQC. The pipeline has special steps which also allow the software versions to be reported in the MultiQC output for future traceability. For more information about how to use MultiQC reports, see <http://multiqc.info>.
-
-The general stats table at the top of the report will by default only display the most relevant pre- and post-processing statistics prior to assembly, i.e., FastQC, fastp/Adapter Removal, and Bowtie2 PhiX and host removal mapping results.
-
-Note that the FastQC raw and processed columns are right next to each other for improved visual comparability; however, the processed columns represent the input reads _after_ fastp/Adapter Removal processing (the dedicated columns of which come directly after the two FastQC sets of columns). Hover your cursor over each column name to see which tool the column is derived from.
-
-Summary tool-specific plots and tables of the following tools are currently displayed (if activated):
-
-- FastQC (pre- and post-trimming)
-- fastp
-- Adapter Removal
-- bowtie2
-- BUSCO
-- QUAST
-- Kraken2 / Centrifuge
-- PROKKA
-
-### Pipeline information
-
-
-Output files
-
-- `pipeline_info/`
-  - Reports generated by Nextflow: `execution_report.html`, `execution_timeline.html`, `execution_trace.txt` and `pipeline_dag.dot`/`pipeline_dag.svg`.
-  - Reports generated by the pipeline: `pipeline_report.html`, `pipeline_report.txt` and `software_versions.yml`. The `pipeline_report*` files will only be present if the `--email` / `--email_on_fail` parameters are used when running the pipeline.
-  - Parameters used by the pipeline run: `params.json`.
-
-
-
-[Nextflow](https://www.nextflow.io/docs/latest/tracing.html) provides excellent functionality for generating various reports relevant to the running and execution of the pipeline. This will allow you to troubleshoot errors with the running of the pipeline, and also provide you with other information such as launch commands, run times and resource usage.
-
# nf-core/taxprofiler: Output

## Introduction

From d6fc1c8f264684066a73c5ba4e70b36eca7c29c8 Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Wed, 8 Jan 2025 17:31:27 +0100
Subject: [PATCH 04/25] Replaced mag.md by taxprofiler.md

---
 bu_isciii/assets/reports/results/mag.md         | 26 -------------------
 .../assets/reports/results/taxprofiler.md       |  9 +++++++
 2 files changed, 9 insertions(+), 26 deletions(-)
 delete mode 100644 bu_isciii/assets/reports/results/mag.md
 create mode 100644 bu_isciii/assets/reports/results/taxprofiler.md

diff --git a/bu_isciii/assets/reports/results/mag.md b/bu_isciii/assets/reports/results/mag.md
deleted file mode 100644
index ce9685141..000000000
--- a/bu_isciii/assets/reports/results/mag.md
+++ /dev/null
@@ -1,26 +0,0 @@
-## MAG
-
-Here we describe the results from the MAG pipeline for multispecies metagenomic analysis.
-
-### MAG - TAXONOMIC ANALYSIS
-
-* `krona_results.html`: Final HTML report with the top 5 species most present in all samples.
-
-> [!WARNING]
-> Software versions used in this analysis can be obtained from the `MultiQC` report.
-
-### MAG - COMPLETE ANALYSIS
-
-* `mag_all/krona/${sample_name}.${tool}.report.html`: A Krona interactive visualization report for each sample based on the Kraken2 (or other) taxonomic classification method.
-* `mag_all/quast/${sample_name}.${tool}.report.html`: A QUAST report for the assembly quality control of each sample assembled using MEGAHIT, SPAdes or other assemblers.
-* `mag_all/multiqc_report.html`: A combined report generated by MultiQC summarizing various quality control results for all samples.
-
-## Taxprofiler
-
-Here we describe the results from the [nf-core/taxprofiler](https://nf-co.re/taxprofiler/1.1.8) pipeline for multispecies taxonomic classification and profiling of shotgun short- and long-read data.
-
-* `taxprofiler/multiqc_report.html`: Final HTML report collecting numerical stats from each module executed in this pipeline.
-* `taxprofiler/krona/database_*.html`: Interactive HTML files generated by Krona, displaying the results of taxonomic classification for supported tools (Kraken2, Centrifuge, Kaiju, and MALT).
-
-> [!WARNING]
-> Software versions used in this analysis can be obtained from the `MultiQC` report.
\ No newline at end of file
diff --git a/bu_isciii/assets/reports/results/taxprofiler.md b/bu_isciii/assets/reports/results/taxprofiler.md
new file mode 100644
index 000000000..d16ab8597
--- /dev/null
+++ b/bu_isciii/assets/reports/results/taxprofiler.md
@@ -0,0 +1,9 @@
+## Taxprofiler
+
+Here we describe the results from the [nf-core/taxprofiler](https://nf-co.re/taxprofiler/1.1.8) pipeline for multispecies taxonomic classification and profiling of shotgun short- and long-read data.
+
+* `taxprofiler/multiqc_report.html`: Final HTML report collecting numerical stats from each module executed in this pipeline.
+* `taxprofiler/krona/database_*.html`: Interactive HTML files generated by Krona, displaying the results of taxonomic classification for supported tools (Kraken2, Centrifuge, Kaiju, and MALT).
+
+> [!WARNING]
+> Software versions used in this analysis can be obtained from the `MultiQC` report.
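The reports listed in taxprofiler.md are typically exposed to users through a RESULTS lablog, as the later IRMA patches in this series do for their own outputs. A minimal sketch of that convention, assuming a dated `entrega01` delivery folder and an `ANALYSIS/*TAXPROFILER` run directory (both names are assumptions for illustration, not part of this patch):

```bash
#!/bin/bash
# Hypothetical sketch only: link the taxprofiler reports described in
# taxprofiler.md into a dated delivery folder. All paths are illustrative.
mkdir -p "$(date '+%Y%m%d')_entrega01/taxprofiler"
cd "$(date '+%Y%m%d')_entrega01"
# Final MultiQC report collecting stats from each module
ln -s ../../ANALYSIS/*TAXPROFILER/multiqc/multiqc_report.html taxprofiler/
# Interactive Krona classification reports, one per database
for krona in ../../ANALYSIS/*TAXPROFILER/krona/database_*.html; do
    ln -s "$krona" taxprofiler/
done
```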
From cc36e5dd0f8f9dd5d456eef92f5b28452f6019f1 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:31:48 +0100 Subject: [PATCH 05/25] Modified irma_output.md to include only taxprofiler --- bu_isciii/assets/reports/results/irma_output.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/assets/reports/results/irma_output.md b/bu_isciii/assets/reports/results/irma_output.md index ef4d79f3f..c45d3dfb0 100644 --- a/bu_isciii/assets/reports/results/irma_output.md +++ b/bu_isciii/assets/reports/results/irma_output.md @@ -2,9 +2,9 @@ This markdown briefly describes the files found in `RESULTS/` folder for IRMA services. As described [here]() -## **`krona_results.html`** +## **`taxprofiler`** -Includes the multiQC html report from MAG, you can find a further description in [MAG](https://github.com/BU-ISCIII/buisciii-tools/blob/main/bu_isciii/assets/reports/md/mag.md). +Includes the results from Taxprofiler, you can find a further description in [Taxprofiler](https://github.com/BU-ISCIII/buisciii-tools/blob/main/bu_isciii/assets/reports/md/taxprofiler.md). ## Files in `fragment_name/` From f11b7419abfe8f2e6339188f987c725956f69e3a Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:32:54 +0100 Subject: [PATCH 06/25] Updated services.json with taxprofiler --- bu_isciii/templates/services.json | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/bu_isciii/templates/services.json b/bu_isciii/templates/services.json index e0b1fcc2b..938b8a18a 100755 --- a/bu_isciii/templates/services.json +++ b/bu_isciii/templates/services.json @@ -21,7 +21,7 @@ "template": "mtbseq", "order": 1, "begin": "base", - "end": "mag_met", + "end": "taxprofiler", "url": "https://github.com/ngs-fzb/MTBseq_source", "description": "Mycobacterium tuberculosis mapping, variant calling and detection of resistance using MTBseq", "clean": { @@ -38,7 +38,7 @@ "template": "pikavirus", "order": 1, "begin": "base", - "end": "mag_met", + "end": "taxprofiler", "url": "https://github.com/BU-ISCIII/PikaVirus", "description": "PikaVirus, a mapping-based tool for metagenome analysis of virus.", "clean": { @@ -71,7 +71,7 @@ "template": "taranis", "order": 1, "begin": "base", - "end": "mag_met", + "end": "taxprofiler", "url": "https://github.com/BU-ISCIII/taranis", "description": "Multilocus sequence typing (MLST) using Taranis", "depends_on": "assembly_annotation", @@ -89,7 +89,7 @@ "template": "chewbbaca", "order": 1, "begin": "base", - "end": "mag_met", + "end": "taxprofiler", "url": "https://github.com/B-UMMI/chewBBACA", "description": "Multilocus sequence typing (MLST) using chewBBACA", "depends_on": "assembly_annotation", @@ -108,7 +108,7 @@ "url": "https://github.com/BU-ISCIII/viralrecon", "order": 1, "begin": "", - "end": "mag_met", + "end": "taxprofiler", "description": "Viral genome reconstruction analysis for SARS-COV-2 data", "clean": { "folders":[], @@ -125,7 +125,7 @@ "url": "https://github.com/nf-core/rnaseq", "order": 1, "begin": "", - "end": "mag_met", + "end": "taxprofiler", "description": "RNA-seq analysis", "clean": { "folders":[], @@ -203,22 +203,22 @@ "delivery_md": "", "results_md": "" }, - "mag_met": { + "taxprofiler": { "label": "Taxonomic based Identification and classification of organisms in complex communities", - "template": "mag", + "template": "taxprofiler", "order": 2, "begin": "base", "end": "", - "url": "https://github.com/nf-core/mag or https://github.com/nf-core/taxprofiler", - "description": "1- Bioinformatics 
best-practice analysis for taxonomic classification and/or genome binning; 2- Bioinformatics best-practice analysis pipeline for assembly, binning and annotation of metagenomes.",
+    "url": "https://github.com/nf-core/taxprofiler",
+    "description": "Highly parallelised multi-taxonomic profiling of shotgun short- and long-read metagenomic data",
     "clean": {
       "folders":[],
       "files":[]
     },
     "no_copy": ["RAW", "TMP"],
     "last_folder":"RESULTS",
-    "delivery_md": "assets/reports/md/mag.md",
-    "results_md": "assets/reports/results/mag.md"
+    "delivery_md": "assets/reports/md/taxprofiler.md",
+    "results_md": "assets/reports/results/taxprofiler.md"
   },
   "exometrio": {
     "label": "Human: Exome sequencing for variant calling, annotation and inheritance filtering (e.g. Exome sequencing of a human trio (two parents and one child))",
@@ -294,7 +294,7 @@
     "url": "",
     "order": 1,
     "begin": "",
-    "end": "",
+    "end": "taxprofiler",
     "description": "",
     "clean": {
       "folders":["02-preprocessing"],

From dc74ced8d314f40caa9325a589f366307e6535da Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Wed, 8 Jan 2025 17:33:20 +0100
Subject: [PATCH 07/25] Created a lablog file for chewbbaca/REFERENCES

---
 bu_isciii/templates/chewbbaca/REFERENCES/lablog | 13 +++++++++++++
 1 file changed, 13 insertions(+)
 create mode 100644 bu_isciii/templates/chewbbaca/REFERENCES/lablog

diff --git a/bu_isciii/templates/chewbbaca/REFERENCES/lablog b/bu_isciii/templates/chewbbaca/REFERENCES/lablog
new file mode 100644
index 000000000..6e8f58f47
--- /dev/null
+++ b/bu_isciii/templates/chewbbaca/REFERENCES/lablog
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+# module load singularity
+
+mkdir logs
+scratch_dir=$(echo $PWD | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g")
+
+# Schema creation
+# WARNING: You have to indicate manually the folder in which your .fasta files are for the schema creation
+echo "srun --chdir $scratch_dir --output logs/CHEWBBACA-CREATE-SCHEMA.%j.log --job-name CHEWBBACA-CREATE-SCHEMA --partition middle_idx --time 12:00:00 singularity exec --bind ${scratch_dir}/../ /data/ucct/bi/pipelines/singularity-images/chewbbaca:3.3.3--pyhdfd78af_0 chewBBACA.py CreateSchema -i ${scratch_dir}/%% -o ./created_schema --cpu 4 &" > _01_create_schema.sh
+
+# cgMLST extraction
+echo "srun --chdir $scratch_dir --output logs/CHEWBBACA-EXTRACT-CGMLST.%j.log --job-name CHEWBBACA-EXTRACT-CGMLST --partition middle_idx --time 12:00:00 singularity exec --bind ${scratch_dir}/../../ /data/ucct/bi/pipelines/singularity-images/chewbbaca:3.3.3--pyhdfd78af_0 chewBBACA.py ExtractCgMLST -i ${scratch_dir}/../ANALYSIS/*/*-chewbbaca/allele_calling/results_alleles.tsv -o ./results_cgmlst &" > _02_extract_cgmlst.sh

From 6cb1cbf58b83cf593706317e1509c2a119b47fcc Mon Sep 17 00:00:00 2001
From: victor5lm
Date: Wed, 8 Jan 2025 17:40:09 +0100
Subject: [PATCH 08/25] Updated lablog_irma and renamed ANALYSIS01 folders

---
 .../01-preproQC/lablog                                        | 0
 .../02-preprocessing/lablog                                   | 0
 .../{ANALYSIS01_FLU_IRMA => ANALYSIS01_IRMA}/03-procQC/lablog | 0
 .../ANALYSIS/{ANALYSIS01_FLU_IRMA => ANALYSIS01_IRMA}/lablog  | 0
 bu_isciii/templates/IRMA/ANALYSIS/lablog_irma                 | 4 ++--
 5 files changed, 2 insertions(+), 2 deletions(-)
 rename bu_isciii/templates/IRMA/ANALYSIS/{ANALYSIS01_FLU_IRMA => ANALYSIS01_IRMA}/01-preproQC/lablog (100%)
 rename bu_isciii/templates/IRMA/ANALYSIS/{ANALYSIS01_FLU_IRMA => ANALYSIS01_IRMA}/02-preprocessing/lablog (100%)
 rename bu_isciii/templates/IRMA/ANALYSIS/{ANALYSIS01_FLU_IRMA => ANALYSIS01_IRMA}/03-procQC/lablog (100%)
 rename bu_isciii/templates/IRMA/ANALYSIS/{ANALYSIS01_FLU_IRMA => 
ANALYSIS01_IRMA}/lablog (100%) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/01-preproQC/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/01-preproQC/lablog similarity index 100% rename from bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/01-preproQC/lablog rename to bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/01-preproQC/lablog diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/02-preprocessing/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/02-preprocessing/lablog similarity index 100% rename from bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/02-preprocessing/lablog rename to bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/02-preprocessing/lablog diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/03-procQC/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/03-procQC/lablog similarity index 100% rename from bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/03-procQC/lablog rename to bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/03-procQC/lablog diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/lablog similarity index 100% rename from bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/lablog rename to bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/lablog diff --git a/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma b/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma index 798ee5497..5de6e19d9 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma +++ b/bu_isciii/templates/IRMA/ANALYSIS/lablog_irma @@ -1,4 +1,4 @@ #ls ../RAW/* | tr '\/' '\t' | cut -f3 | cut -d "_" -f 1 | sort -u | grep -v "md5" > samples_id.txt mkdir -p 00-reads -mv ANALYSIS01_FLU_IRMA $(date '+%Y%m%d')_ANALYSIS01_FLU_IRMA -cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd - \ No newline at end of file +mv ANALYSIS01_IRMA $(date '+%Y%m%d')_ANALYSIS01_IRMA +cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd - From 5768698ddcca73b784ce91f85c519ef030e3f105 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:43:32 +0100 Subject: [PATCH 09/25] Updated IRMA template to include RSV --- .../04-irma/create_irma_stats_flu.sh} | 6 +- .../04-irma/create_irma_stats_rsv.sh | 11 ++++ .../04-irma/create_irma_vcf.py | 0 .../04-irma/lablog_flu} | 8 +-- .../ANALYSIS01_IRMA/04-irma/lablog_rsv | 12 ++++ .../04-irma/postprocessing_flu.sh | 57 +++++++++++++++++++ .../04-irma/postprocessing_rsv.sh | 12 ++++ .../IRMA/RESULTS/lablog_irma_results | 38 ++++++++++--- 8 files changed, 130 insertions(+), 14 deletions(-) rename bu_isciii/templates/IRMA/ANALYSIS/{ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh => ANALYSIS01_IRMA/04-irma/create_irma_stats_flu.sh} (75%) create mode 100644 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_stats_rsv.sh rename bu_isciii/templates/IRMA/ANALYSIS/{ANALYSIS01_FLU_IRMA => ANALYSIS01_IRMA}/04-irma/create_irma_vcf.py (100%) rename bu_isciii/templates/IRMA/ANALYSIS/{ANALYSIS01_FLU_IRMA/04-irma/lablog => ANALYSIS01_IRMA/04-irma/lablog_flu} (59%) create mode 100644 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/lablog_rsv create mode 100644 
bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/postprocessing_flu.sh create mode 100644 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/postprocessing_rsv.sh mode change 100755 => 100644 bu_isciii/templates/IRMA/RESULTS/lablog_irma_results diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_stats_flu.sh similarity index 75% rename from bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh rename to bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_stats_flu.sh index cc39eb168..11f075497 100755 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_stats.sh +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_stats_flu.sh @@ -1,5 +1,5 @@ -echo -e "sample_ID\tTotalReads\tMappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats.txt +echo -e "sample_ID\tTotalReads\tMappedReads\t%MappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats.txt cat ../samples_id.txt | while read in do @@ -19,9 +19,10 @@ PB2=$(grep '4-[A-C]_PB2' ${in}/tables/READ_COUNTS.txt | cut -f2) #In case of Influenza C in samples: HE=$(grep '4-C_HE' ${in}/tables/READ_COUNTS.txt | cut -f2) +PCTMAPPED=$(awk "BEGIN {printf \"%.2f\", ($MAPPEDREADS/$TOTAL_READS)*100}") if [[ -n "$HE" ]]; then - LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2) <(echo $HE)) + LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $PCTMAPPED) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2) <(echo $HE)) else - LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2)) + LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $PCTMAPPED) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2)) fi echo "$LINE" >> irma_stats.txt diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_stats_rsv.sh b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_stats_rsv.sh new file mode 100644 index 000000000..781e0539d --- /dev/null +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_stats_rsv.sh @@ -0,0 +1,11 @@ +echo -e "Sample_ID\tTotalReads\tMappedReads\t%MappedReads\tRSV_type" > irma_stats_rsv.txt + +cat ../samples_id.txt | while read in +do +SAMPLE_ID=$(echo ${in}) +TOTAL_READS=$(grep '1-initial' ${in}/tables/READ_COUNTS.txt | cut -f2) +MAPPEDREADS=$(grep '3-match' ${in}/tables/READ_COUNTS.txt | cut -f2) +PCTMAPPED=$(awk "BEGIN {printf \"%.2f\", ($MAPPEDREADS/$TOTAL_READS)*100}") +RSV_TYPE=$(grep '4-RSV_' ${in}/tables/READ_COUNTS.txt | cut -f1 | cut -d '_' -f2) +echo -e "${SAMPLE_ID}\t${TOTAL_READS}\t${MAPPEDREADS}\t${PCTMAPPED}\t${RSV_TYPE}" >> irma_stats_rsv.txt +done diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_vcf.py similarity index 100% rename from bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/create_irma_vcf.py rename to
bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_vcf.py diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/lablog_flu similarity index 59% rename from bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog rename to bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/lablog_flu index fcf827589..842ef3eb6 100644 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/lablog +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/lablog_flu @@ -5,10 +5,10 @@ mkdir logs scratch_dir=$(echo $PWD | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g") -cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 32 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/ucct/bi/pipelines/flu-amd/flu-amd-1.1.4/IRMA FLU_AD ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} --external-config ../../../DOC/irma_config.sh &"; done > _01_irma.sh +cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 32 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/ucct/bi/pipelines/flu-amd/flu-amd-1.2.0/IRMA FLU_AD ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} --external-config ../../../DOC/irma_config.sh &"; done > _01_irma_flu.sh -echo 'bash create_irma_stats.sh' > _02_create_stats.sh +echo 'bash create_irma_stats_flu.sh' > _02_create_stats_flu.sh -echo 'bash postprocessing.sh' > _03_post_processing.sh +echo 'bash postprocessing_flu.sh' > _03_post_processing_flu.sh -echo 'sed "s/__//g" irma_stats.txt | sed "s/_\t/\t/g" > clean_irma_stats.txt' >> _03_post_processing.sh +echo 'sed "s/__//g" irma_stats.txt | sed "s/_\t/\t/g" > clean_irma_stats.txt' >> _03_post_processing_flu.sh diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/lablog_rsv b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/lablog_rsv new file mode 100644 index 000000000..10430f7bf --- /dev/null +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/lablog_rsv @@ -0,0 +1,12 @@ +#module load R +#Fix perl warning: add 'export LC_ALL="en_US.UTF8"' to .bashrc or run it now + +mkdir logs + +scratch_dir=$(echo $PWD | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g") + +cat ../samples_id.txt | while read in; do echo "srun --partition short_idx --cpus-per-task 32 --mem 35000M --chdir $scratch_dir --time 01:00:00 --output logs/IRMA.${in}.%j.log /data/ucct/bi/pipelines/flu-amd/flu-amd-1.2.0/IRMA RSV ../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz ../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz ${in} --external-config ../../../DOC/irma_config.sh &"; done > _01_irma_rsv.sh + +echo 'bash create_irma_stats_rsv.sh' > _02_create_stats_rsv.sh + +echo 'bash postprocessing_rsv.sh' > _03_post_processing_rsv.sh diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/postprocessing_flu.sh b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/postprocessing_flu.sh new file mode 100644 index 000000000..2ef0e02cf --- /dev/null +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/postprocessing_flu.sh @@ -0,0 +1,58 @@ +#CLEAN +if test -f all_samples_completo.txt; then rm all_samples_completo.txt; fi +if test -d A_*; then rm -rf A_*; fi +if test -d B; then rm -rf B; fi +if test -d C;
then rm -rf C; fi +if test -d D; then rm -rf D; fi + +cat ../samples_id.txt | while read sample; do + FLUSUBTYPE=$(ls ${sample}/*H*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,3 | sort -u) + FLUTYPE=$(ls ${sample}/*H*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1 | sort -u) + mkdir -p $FLUSUBTYPE + ls ${sample}/amended_consensus/*.fa | cut -d '_' -f3 | cut -d '.' -f1 | while read fragment; do + if [ $fragment == 1 ]; then + if [ $FLUTYPE == "B" ]; then + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_1/_PB1/' | tee -a ${FLUSUBTYPE}/B_PB1.txt all_samples_completo.txt > /dev/null + else + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_1/_PB2/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_PB2.txt all_samples_completo.txt > /dev/null + fi + elif [ $fragment == 2 ]; then + if [ $FLUTYPE == "B" ]; then + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_2/_PB2/' | tee -a ${FLUSUBTYPE}/B_PB2.txt all_samples_completo.txt > /dev/null + else + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_2/_PB1/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_PB1.txt all_samples_completo.txt > /dev/null + fi + elif [ $fragment == 3 ]; then + if [ $FLUTYPE == "B" ] || [ $FLUTYPE == "A" ]; then + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_3/_PA/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_PA.txt all_samples_completo.txt > /dev/null + else + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_3/_P3/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_P3.txt all_samples_completo.txt > /dev/null + fi + elif [ $fragment == 4 ]; then + if [ $FLUTYPE == "B" ] || [ $FLUTYPE == "A" ]; then + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_4/_HA/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_HA.txt all_samples_completo.txt > /dev/null + else + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_4/_HE/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_HE.txt all_samples_completo.txt > /dev/null + fi + elif [ $fragment == 5 ]; then + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_5/_NP/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_NP.txt all_samples_completo.txt > /dev/null + elif [ $fragment == 6 ]; then + if [ $FLUTYPE == "B" ] || [ $FLUTYPE == "A" ]; then + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_6/_NA/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_NA.txt all_samples_completo.txt > /dev/null + else + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_6/_MP/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_MP.txt all_samples_completo.txt > /dev/null + fi + elif [ $fragment == 7 ]; then + if [ $FLUTYPE == "B" ] || [ $FLUTYPE == "A" ]; then + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_7/_MP/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_MP.txt all_samples_completo.txt > /dev/null + else + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_7/_NS/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_NS.txt all_samples_completo.txt > /dev/null + fi + elif [ $fragment == 8 ]; then + cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_8/_NS/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_NS.txt all_samples_completo.txt > /dev/null + else + echo "The sample $sample has a segment with number $fragment, but I don't know which segment it is." 
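+      # Influenza A and B genomes have 8 segments (type C has 7), so any other fragment number is unexpected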
+        fi +    done +done diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/postprocessing_rsv.sh b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/postprocessing_rsv.sh new file mode 100644 index 000000000..278ad435e --- /dev/null +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/postprocessing_rsv.sh @@ -0,0 +1,12 @@ +#CLEAN +if test -f all_samples_completo.txt; then rm all_samples_completo.txt; fi +if test -d A; then rm -rf A; fi +if test -d B; then rm -rf B; fi +if test -d AD; then rm -rf AD; fi +if test -d BD; then rm -rf BD; fi + +cat ../samples_id.txt | while read sample; do +    RSVTYPE=$(ls ${sample}/*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f2 | sort -u) +    mkdir -p $RSVTYPE +    cat ${sample}/amended_consensus/${sample}.fa | sed 's/-/\//g' | sed "s/^>\([^/]*\)/>${RSVTYPE}\/\1/" | tee -a ${RSVTYPE}/${RSVTYPE}.txt all_samples_completo.txt > /dev/null +done diff --git a/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results b/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results old mode 100755 new mode 100644 index 5cb7c418d..bab5ec1fd --- a/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results +++ b/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results @@ -1,10 +1,35 @@ +#!/bin/bash + mkdir $(date '+%Y%m%d')_entrega01 cd $(date '+%Y%m%d')_entrega01 -#Create symbolic links depending on the analysis -#Individual files -ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/all_samples_completo.txt . -ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/A_H* . -ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/B . -ln -s ../../ANALYSIS/*FLU_IRMA/04-irma/C . -tail -n +2 ../../ANALYSIS/*_FLU_IRMA/04-irma/clean_irma_stats.txt | cut -f4 | sort | uniq -c > flu_type_summary.txt \ No newline at end of file +# Setting the organism +echo "Please specify the organism that was analysed." +echo "1. FLU" +echo "2. RSV" +while true; do +    read -p "Enter your choice (1 or 2): " ORGANISM +    if [ "$ORGANISM" == "1" ]; then +        ORGANISM="FLU" +        echo "You selected $ORGANISM." +        ln -s ../../ANALYSIS/*_IRMA/04-irma/all_samples_completo.txt . +        ln -s ../../ANALYSIS/*_IRMA/04-irma/A_H* . +        ln -s ../../ANALYSIS/*_IRMA/04-irma/B . +        ln -s ../../ANALYSIS/*_IRMA/04-irma/C . +        tail -n +2 ../../ANALYSIS/*_IRMA/04-irma/clean_irma_stats.txt | cut -f5 | sort | uniq -c > flu_type_summary.txt +        break +    elif [ "$ORGANISM" == "2" ]; then +        ORGANISM="RSV" +        echo "You selected $ORGANISM." +        ln -s ../../ANALYSIS/*_IRMA/04-irma/all_samples_completo.txt . +        ln -s ../../ANALYSIS/*_IRMA/04-irma/A . +        ln -s ../../ANALYSIS/*_IRMA/04-irma/B . +        ln -s ../../ANALYSIS/*_IRMA/04-irma/AD . +        ln -s ../../ANALYSIS/*_IRMA/04-irma/BD . +        tail -n +2 ../../ANALYSIS/*_IRMA/04-irma/irma_stats_rsv.txt | cut -f5 | sort | uniq -c > rsv_type_summary.txt +        break +    else +        echo "Invalid input. Please enter 1 or 2."
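+        # any other answer keeps the while loop running and the prompt is shown again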
+ fi +done From 96115ae78cb920fd005e46aaf0ddb1b440b499e8 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:45:00 +0100 Subject: [PATCH 10/25] Divided postprocessing.sh in two different files for flu and RSV --- .../04-irma/postprocessing.sh | 57 ------------------- 1 file changed, 57 deletions(-) delete mode 100644 bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/postprocessing.sh diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/postprocessing.sh b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/postprocessing.sh deleted file mode 100644 index 2ef0e02cf..000000000 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_FLU_IRMA/04-irma/postprocessing.sh +++ /dev/null @@ -1,57 +0,0 @@ -#CLEAN -if test -f all_samples_completo.txt; then rm all_samples_completo.txt; fi -if test -d A_*; then rm -rf A_*; fi -if test -d B; then rm -rf B; fi -if test -d C; then rm -rf C; fi -if test -d D; then rm -rf D; fi - -cat ../samples_id.txt | while read sample; do - FLUSUBTYPE=$(ls ${sample}/*H*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1,3 | sort -u) - FLUTYPE=$(ls ${sample}/*H*.fasta | cut -d '/' -f2 | cut -d '.' -f1 | cut -d '_' -f1 | sort -u) - mkdir -p $FLUSUBTYPE - ls ${sample}/amended_consensus/*.fa | cut -d '_' -f3 | cut -d '.' -f1 | while read fragment; do - if [ $fragment == 1 ]; then - if [ $FLUTYPE == "B" ]; then - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_1/_PB1/' | tee -a ${FLUSUBTYPE}/B_PB1.txt all_samples_completo.txt > /dev/null - else - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_1/_PB2/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_PB2.txt all_samples_completo.txt > /dev/null - fi - elif [ $fragment == 2 ]; then - if [ $FLUTYPE == "B" ]; then - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_2/_PB2/' | tee -a ${FLUSUBTYPE}/B_PB2.txt all_samples_completo.txt > /dev/null - else - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_2/_PB1/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_PB1.txt all_samples_completo.txt > /dev/null - fi - elif [ $fragment == 3 ]; then - if [ $FLUTYPE == "B" ] || [ $FLUTYPE == "A" ]; then - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_3/_PA/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_PA.txt all_samples_completo.txt > /dev/null - else - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_3/_P3/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_P3.txt all_samples_completo.txt > /dev/null - fi - elif [ $fragment == 4 ]; then - if [ $FLUTYPE == "B" ] || [ $FLUTYPE == "A" ]; then - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_4/_HA/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_HA.txt all_samples_completo.txt > /dev/null - else - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_4/_HE/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_HE.txt all_samples_completo.txt > /dev/null - fi - elif [ $fragment == 5 ]; then - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_5/_NP/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_NP.txt all_samples_completo.txt > /dev/null - elif [ $fragment == 6 ]; then - if [ $FLUTYPE == "B" ] || [ $FLUTYPE == "A" ]; then - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_6/_NA/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_NA.txt all_samples_completo.txt > /dev/null - else - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_6/_MP/' | tee -a 
${FLUSUBTYPE}/${FLUTYPE}_MP.txt all_samples_completo.txt > /dev/null - fi - elif [ $fragment == 7 ]; then - if [ $FLUTYPE == "B" ] || [ $FLUTYPE == "A" ]; then - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_7/_MP/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_MP.txt all_samples_completo.txt > /dev/null - else - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_7/_NS/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_NS.txt all_samples_completo.txt > /dev/null - fi - elif [ $fragment == 8 ]; then - cat ${sample}/amended_consensus/*_${fragment}.fa | sed 's/-/\//g' | sed 's/_8/_NS/' | tee -a ${FLUSUBTYPE}/${FLUTYPE}_NS.txt all_samples_completo.txt > /dev/null - else - echo "The sample $sample has a segment with number $fragment, but I don't know which segment it is." - fi - done -done From eac8dfa0614a11a16e12e37980f2295b07d18611 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:48:01 +0100 Subject: [PATCH 11/25] Replaced MAG by taxprofiler --- .../ANALYSIS02_MAG_TAXONOMICS/99-stats/lablog | 25 ------- .../99-stats/multiqc_config.yaml | 13 ---- .../ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/lablog | 37 --------- .../mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog | 38 ---------- .../templates/mag/ANALYSIS/lablog_mag_all | 5 -- .../mag/ANALYSIS/lablog_mag_taxonomics | 1 - bu_isciii/templates/mag/DOC/mag.config | 75 ------------------- .../mag/RESULTS/lablog_mag_all_results | 25 ------- .../mag/RESULTS/lablog_mag_taxonomics_results | 6 -- .../ANALYSIS/ANALYSIS01_TAXPROFILER/lablog | 2 +- .../ANALYSIS/lablog_taxprofiler | 0 .../DOC/databasesheet.csv | 0 .../DOC/taxprofiler.config | 0 .../templates/{mag => taxprofiler}/RAW/README | 0 .../{mag => taxprofiler}/REFERENCES/README | 0 .../RESULTS/lablog_taxprofiler_results | 5 +- .../templates/{mag => taxprofiler}/TMP/README | 0 17 files changed, 3 insertions(+), 229 deletions(-) delete mode 100644 bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/lablog delete mode 100644 bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/multiqc_config.yaml delete mode 100644 bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/lablog delete mode 100644 bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog delete mode 100644 bu_isciii/templates/mag/ANALYSIS/lablog_mag_all delete mode 100644 bu_isciii/templates/mag/ANALYSIS/lablog_mag_taxonomics delete mode 100644 bu_isciii/templates/mag/DOC/mag.config delete mode 100644 bu_isciii/templates/mag/RESULTS/lablog_mag_all_results delete mode 100755 bu_isciii/templates/mag/RESULTS/lablog_mag_taxonomics_results rename bu_isciii/templates/{mag => taxprofiler}/ANALYSIS/ANALYSIS01_TAXPROFILER/lablog (99%) rename bu_isciii/templates/{mag => taxprofiler}/ANALYSIS/lablog_taxprofiler (100%) rename bu_isciii/templates/{mag => taxprofiler}/DOC/databasesheet.csv (100%) rename bu_isciii/templates/{mag => taxprofiler}/DOC/taxprofiler.config (100%) rename bu_isciii/templates/{mag => taxprofiler}/RAW/README (100%) rename bu_isciii/templates/{mag => taxprofiler}/REFERENCES/README (100%) rename bu_isciii/templates/{mag => taxprofiler}/RESULTS/lablog_taxprofiler_results (62%) rename bu_isciii/templates/{mag => taxprofiler}/TMP/README (100%) diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/lablog deleted file mode 100644 index a9daa731f..000000000 --- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/lablog +++ /dev/null @@ -1,25 +0,0 @@ 
-#module load singularity - -cat ../../samples_id.txt | while read in; do ln -s ../*_mag/Taxonomy/kraken2/${in}/${in}.kraken2_report.txt .; done - -scratch_dir=$(echo $PWD | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g") - -cat < multiqc.sbatch -#!/bin/sh -#SBATCH --ntasks 1 -#SBATCH --cpus-per-task 2 -#SBATCH --mem 4G -#SBATCH --time 00:30:00 -#SBATCH --partition short_idx -#SBATCH --output $(date '+%Y%m%d')_multiqc.log -#SBATCH --chdir $scratch_dir - -export NXF_OPTS="-Xms500M -Xmx4G" - -singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/multiqc:1.9--py_1 multiqc -d . --config multiqc_config.yaml - -EOF - -echo "sbatch multiqc.sbatch" > _01_run_multiqc.sh - -echo "find -type l | while read in; do unlink \${in}; done" > _02_unlink.sh diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/multiqc_config.yaml b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/multiqc_config.yaml deleted file mode 100644 index 96b7e6136..000000000 --- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/99-stats/multiqc_config.yaml +++ /dev/null @@ -1,13 +0,0 @@ -extra_fn_clean_exts: - - _R1 - - _R2 - - .R1 - - .R2 - - .sort - - _sort - - .stats - - _bamstat - - _align - - .txt -report_comment: > - This report has been generated by BU-ISCIII diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/lablog deleted file mode 100644 index 6dddb048c..000000000 --- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS02_MAG_TAXONOMICS/lablog +++ /dev/null @@ -1,37 +0,0 @@ -# SETUP INTPUT SAMPLE SHEET -ln -s ../00-reads . -ln -s ../samples_id.txt . - -# Setup samplesheet -echo "sample,group,short_reads_1,short_reads_2,long_reads" > samplesheet.csv -cat samples_id.txt | while read in; do - echo "${in},,00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz," -done >> samplesheet.csv - -#module load Nextflow -#module load singularity - -scratch_dir=$(echo $PWD | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g") - -cat < mag.sbatch -#!/bin/sh -#SBATCH --ntasks 1 -#SBATCH --cpus-per-task 2 -#SBATCH --mem 4G -#SBATCH --time 2:00:00 -#SBATCH --partition middle_idx -#SBATCH --output $(date '+%Y%m%d')_mag.log -#SBATCH --chdir $scratch_dir - -export NXF_OPTS="-Xms500M -Xmx4G" - -nextflow run /data/ucct/bi/pipelines/nf-core-mag/nf-core-mag-3.0.3/3_0_3/main.nf \\ - -c ../../DOC/mag.config \\ - --input samplesheet.csv \\ - --outdir $(date '+%Y%m%d')_mag \\ - --kraken2_db /data/ucct/bi/references/kraken/minikraken_8GB_20200312.tgz \\ - --skip_busco --skip_spades --skip_spadeshybrid --skip_megahit --skip_prodigal --skip_binning \\ - -resume -EOF - -echo "sbatch mag.sbatch" > _01_run_mag.sh diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog b/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog deleted file mode 100644 index 1f57ae768..000000000 --- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS03_MAG_ALL/lablog +++ /dev/null @@ -1,38 +0,0 @@ -# SETUP INTPUT SAMPLE SHEET -ln -s ../00-reads . -ln -s ../samples_id.txt . 
- -# Setup samplesheet -echo "sample,group,short_reads_1,short_reads_2,long_reads" > samplesheet.csv -cat samples_id.txt | while read in; do - echo "${in},,00-reads/${in}_R1.fastq.gz,00-reads/${in}_R2.fastq.gz," -done >> samplesheet.csv - -scratch_dir=$(echo $PWD | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g") - -cat < mag_all.sbatch -#!/bin/sh -#SBATCH --ntasks 1 -#SBATCH --cpus-per-task 2 -#SBATCH --mem 8G -#SBATCH --time 72:00:00 -#SBATCH --partition long_idx -#SBATCH --output $(date '+%Y%m%d')_mag_all.log -#SBATCH --chdir $scratch_dir - -# module load Nextflow/23.10.0 singularity -export NXF_OPTS="-Xms500M -Xmx8G" - -nextflow run /data/ucct/bi/pipelines/nf-core-mag/nf-core-mag-3.0.3/3_0_3/main.nf \\ - -c ../../DOC/mag.config \\ - -profile singularity \\ - --input samplesheet.csv \\ - --kraken2_db '/data/ucct/bi/references/kraken/minikraken_8GB_20200312.tgz' \\ - --skip_spadeshybrid true \\ - --skip_concoct true \\ - --refine_bins_dastool true \\ - --outdir $(date '+%Y%m%d')_mag_all \\ - -resume -EOF - -echo "sbatch mag_all.sbatch" > _01_run_mag_all.sh diff --git a/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all b/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all deleted file mode 100644 index caa07af92..000000000 --- a/bu_isciii/templates/mag/ANALYSIS/lablog_mag_all +++ /dev/null @@ -1,5 +0,0 @@ -mkdir 00-reads -cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R1*.fastq.gz %_R1.fastq.gz" | bash; cd - -cd 00-reads; cat ../samples_id.txt | xargs -I % echo "ln -s ../../RAW/%_*R2*.fastq.gz %_R2.fastq.gz" | bash; cd - - -mv ANALYSIS03_MAG_ALL $(date '+%Y%m%d')_ANALYSIS03_MAG_ALL diff --git a/bu_isciii/templates/mag/ANALYSIS/lablog_mag_taxonomics b/bu_isciii/templates/mag/ANALYSIS/lablog_mag_taxonomics deleted file mode 100644 index 72d7d463c..000000000 --- a/bu_isciii/templates/mag/ANALYSIS/lablog_mag_taxonomics +++ /dev/null @@ -1 +0,0 @@ -mv ANALYSIS02_MAG_TAXONOMICS $(date '+%Y%m%d')_ANALYSIS02_MAG_TAXONOMICS diff --git a/bu_isciii/templates/mag/DOC/mag.config b/bu_isciii/templates/mag/DOC/mag.config deleted file mode 100644 index 6d78de98c..000000000 --- a/bu_isciii/templates/mag/DOC/mag.config +++ /dev/null @@ -1,75 +0,0 @@ -/* - HPC XTUTATIS CONFIGURATION -*/ - -singularity { - enabled = true - autoMounts = true - singularity.cacheDir = '/data/ucct/bi/pipelines/singularity-images' -} - -process { - executor = 'slurm' - queue = 'middle_idx' - jobName = { "$task.name - $task.hash" } - conda = null - - errorStrategy = { task.exitStatus in ((130..145) + 104) ? 'retry' : 'finish' } - - withName:'SPADES|MEGAHIT' { - errorStrategy = { task.exitStatus in [143,137,21,12,1] ? 'retry' : 'finish' } - maxRetries = 2 - cpus = { 10 * task.attempt } - memory = { 64.GB * task.attempt } - time = { 24.h } - } - withName:'MAXBIN2' { - // often fails when insufficient information, so we allow it to gracefully fail without failing the pipeline - errorStrategy = { task.exitStatus in [ 1, 255 ] ? 'ignore' : 'retry' } - time = { 8.h * task.attempt } - } - // TODO: This bining tool takes ~24h to finish... skip was added in lablog, however it can be enabeled. - // withName:'CONCOCT_CONCOCT' { - // errorStrategy = { task.exitStatus in [140] ? 'retry' : 'finish' } - // maxRetries = 2 - // cpus = { 8 * task.attempt } - // memory = { 64.GB * task.attempt } - // time = { 24.h * task.attempt } - // } - withName:'CHECKM_LINEAGEWF' { - errorStrategy = { task.exitStatus in [1] ? 
'retry' : 'finish' } - maxRetries = 3 - cpus = { 8 * task.attempt } - memory = { 32.GB * task.attempt } - time = { 4.h * task.attempt } - } - withName:'BOWTIE2_PHIX_REMOVAL_BUILD'{ - time = 18.h - } -} - -params { - max_memory = 376.GB - max_cpus = 32 - max_time = '48.h' -} - -/* - CUSTOM OUTPUT FOLDER STRUCTURE -- modules.config -*/ -params { publish_dir_mode = 'copy' } -process { - withName: 'MULTIQC' { - publishDir = [ - path: { "${params.outdir}/99-stats" }, - mode: params.publish_dir_mode, - saveAs: { filename -> - if (filename.equals('versions.yml') || filename.endsWith('.csv')) { - null - } else { - filename - } - } - ] - } -} diff --git a/bu_isciii/templates/mag/RESULTS/lablog_mag_all_results b/bu_isciii/templates/mag/RESULTS/lablog_mag_all_results deleted file mode 100644 index f467632eb..000000000 --- a/bu_isciii/templates/mag/RESULTS/lablog_mag_all_results +++ /dev/null @@ -1,25 +0,0 @@ -DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega01" -mkdir -p $DELIVERY_FOLDER/mag_all - -# Taxprofiling service -cd $DELIVERY_FOLDER/mag_all -ANALYSIS_FOLDER=../../../ANALYSIS/*_ANALYSIS03_MAG_ALL/*_mag_all - -# multiqc report -ln -s ${ANALYSIS_FOLDER}/99-stats/multiqc_report.html . - -# quast reports -mkdir -p quast -for quast_report in ${ANALYSIS_FOLDER}/Assembly/*/QC/*/QUAST/report.html; do - assembly_tool=$(echo "$quast_report" | awk -F'/' '{print $8}') - sample_name=$(echo "$quast_report" | awk -F'/' '{print $10}') - ln -s "../$quast_report" "quast/${sample_name}.${assembly_tool}.report.html" -done - -# krona reports -mkdir -p krona -for krona_report in ${ANALYSIS_FOLDER}/Taxonomy/*/*/taxonomy.krona.html; do - taxonomy_tool=$(echo "$krona_report" | awk -F'/' '{print $8}') - sample_name=$(echo "$krona_report" | awk -F'/' '{print $9}') - ln -s "../$krona_report" "krona/${sample_name}.${taxonomy_tool}.report.html" -done \ No newline at end of file diff --git a/bu_isciii/templates/mag/RESULTS/lablog_mag_taxonomics_results b/bu_isciii/templates/mag/RESULTS/lablog_mag_taxonomics_results deleted file mode 100755 index 11667ce47..000000000 --- a/bu_isciii/templates/mag/RESULTS/lablog_mag_taxonomics_results +++ /dev/null @@ -1,6 +0,0 @@ -mkdir $(date '+%Y%m%d')_entrega01 -cd $(date '+%Y%m%d')_entrega01 - -#Create symbolic links depending on the analysis -#Individual files -ln -s ../../ANALYSIS/*_ANALYSIS02_MAG_TAXONOMICS/99-stats/multiqc_report.html ./krona_results.html diff --git a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_TAXPROFILER/lablog b/bu_isciii/templates/taxprofiler/ANALYSIS/ANALYSIS01_TAXPROFILER/lablog similarity index 99% rename from bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_TAXPROFILER/lablog rename to bu_isciii/templates/taxprofiler/ANALYSIS/ANALYSIS01_TAXPROFILER/lablog index 879010d8d..f7f5c0127 100644 --- a/bu_isciii/templates/mag/ANALYSIS/ANALYSIS01_TAXPROFILER/lablog +++ b/bu_isciii/templates/taxprofiler/ANALYSIS/ANALYSIS01_TAXPROFILER/lablog @@ -67,7 +67,7 @@ cat < taxprofiler.sbatch # module load Nextflow/23.10.0 singularity export NXF_OPTS="-Xms500M -Xmx6G" -nextflow run /data/ucct/bi/pipelines/nf-core-taxprofiler/nf-core-taxprofiler-1.1.8 \\ +nextflow run /data/ucct/bi/pipelines/nf-core-taxprofiler/nf-core-taxprofiler-1.2.1 \\ -profile singularity \\ -c ../../DOC/taxprofiler.config \\ --input samplesheet.csv \\ diff --git a/bu_isciii/templates/mag/ANALYSIS/lablog_taxprofiler b/bu_isciii/templates/taxprofiler/ANALYSIS/lablog_taxprofiler similarity index 100% rename from bu_isciii/templates/mag/ANALYSIS/lablog_taxprofiler rename to 
bu_isciii/templates/taxprofiler/ANALYSIS/lablog_taxprofiler diff --git a/bu_isciii/templates/mag/DOC/databasesheet.csv b/bu_isciii/templates/taxprofiler/DOC/databasesheet.csv similarity index 100% rename from bu_isciii/templates/mag/DOC/databasesheet.csv rename to bu_isciii/templates/taxprofiler/DOC/databasesheet.csv diff --git a/bu_isciii/templates/mag/DOC/taxprofiler.config b/bu_isciii/templates/taxprofiler/DOC/taxprofiler.config similarity index 100% rename from bu_isciii/templates/mag/DOC/taxprofiler.config rename to bu_isciii/templates/taxprofiler/DOC/taxprofiler.config diff --git a/bu_isciii/templates/mag/RAW/README b/bu_isciii/templates/taxprofiler/RAW/README similarity index 100% rename from bu_isciii/templates/mag/RAW/README rename to bu_isciii/templates/taxprofiler/RAW/README diff --git a/bu_isciii/templates/mag/REFERENCES/README b/bu_isciii/templates/taxprofiler/REFERENCES/README similarity index 100% rename from bu_isciii/templates/mag/REFERENCES/README rename to bu_isciii/templates/taxprofiler/REFERENCES/README diff --git a/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results b/bu_isciii/templates/taxprofiler/RESULTS/lablog_taxprofiler_results similarity index 62% rename from bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results rename to bu_isciii/templates/taxprofiler/RESULTS/lablog_taxprofiler_results index cfb01fa4c..efcc0666a 100755 --- a/bu_isciii/templates/mag/RESULTS/lablog_taxprofiler_results +++ b/bu_isciii/templates/taxprofiler/RESULTS/lablog_taxprofiler_results @@ -5,10 +5,9 @@ mkdir -p $DELIVERY_FOLDER/taxprofiler cd $DELIVERY_FOLDER/taxprofiler # Links to reports -ln -s ../../../ANALYSIS/*ANALYSIS01_TAXPROFILER/multiqc/multiqc_report.html . +ln -s ../../../ANALYSIS/*_TAXPROFILER/multiqc/multiqc_report.html . mkdir -p krona -for file in ../../../ANALYSIS/*ANALYSIS01_TAXPROFILER/krona/*.html; do +for file in ../../../ANALYSIS/*_TAXPROFILER/krona/*.html; do base=$(basename "$file") ln -s "../$file" "krona/database_${base}" done - diff --git a/bu_isciii/templates/mag/TMP/README b/bu_isciii/templates/taxprofiler/TMP/README similarity index 100% rename from bu_isciii/templates/mag/TMP/README rename to bu_isciii/templates/taxprofiler/TMP/README From 9dccafe27375a8a53798d4235a985e230afb1be4 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:49:04 +0100 Subject: [PATCH 12/25] Updated chewbbaca results lablog to include cgMLST_MSA.fasta --- bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results b/bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results index a87875198..2264d5a14 100644 --- a/bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results +++ b/bu_isciii/templates/chewbbaca/RESULTS/lablog_mlst_results @@ -1,4 +1,4 @@ -DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega" +DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega01" mkdir -p $DELIVERY_FOLDER mkdir $DELIVERY_FOLDER/mlst @@ -10,5 +10,6 @@ cd $DELIVERY_FOLDER/mlst ln -s ../../../ANALYSIS/*CHEWBBACA/*-chewbbaca/allele_calling_evaluation/allelecall_report.html . ln -s ../../../ANALYSIS/*CHEWBBACA/*-chewbbaca/allele_calling_evaluation/distance_matrix_symmetric.tsv . ln -s ../../../ANALYSIS/*CHEWBBACA/*-chewbbaca/allele_calling_evaluation/results_alleles.tsv . +ln -s ../../../ANALYSIS/*CHEWBBACA/*-chewbbaca/allele_calling_evaluation/cgMLST_MSA.fasta . 
ln -s ../../../ANALYSIS/*CHEWBBACA/*-grapetree/*.nwk ln -s ../../../ANALYSIS/*CHEWBBACA/*-grapetree/*.svg From e7540d687dd8fd1eef5aa3b1c639af3469749a42 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:49:42 +0100 Subject: [PATCH 13/25] Updated exometrio and wgstrio results lablogs to not include exomiser --- bu_isciii/templates/exometrio/RESULTS/lablog_exome_results | 3 --- bu_isciii/templates/wgstrio/RESULTS/lablog_wgstrio_results | 6 +----- 2 files changed, 1 insertion(+), 8 deletions(-) diff --git a/bu_isciii/templates/exometrio/RESULTS/lablog_exome_results b/bu_isciii/templates/exometrio/RESULTS/lablog_exome_results index 5425bdfc4..f172d6475 100755 --- a/bu_isciii/templates/exometrio/RESULTS/lablog_exome_results +++ b/bu_isciii/templates/exometrio/RESULTS/lablog_exome_results @@ -11,6 +11,3 @@ ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/variants_*filterAF*.tab . mkdir annotation_tables cd annotation_tables; ln -s ../../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/vep_annot*.txt . ; cd - -# For exomeEB services -# ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/exomiser/exomiser.html . - diff --git a/bu_isciii/templates/wgstrio/RESULTS/lablog_wgstrio_results b/bu_isciii/templates/wgstrio/RESULTS/lablog_wgstrio_results index 49c6d7532..56e7cf947 100755 --- a/bu_isciii/templates/wgstrio/RESULTS/lablog_wgstrio_results +++ b/bu_isciii/templates/wgstrio/RESULTS/lablog_wgstrio_results @@ -8,8 +8,4 @@ ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/99-stats/hsMetrics_all.out mapping_metric ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/01-sarek/multiqc/multiqc_report.html . ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/variants_*filterAF*.tab . -ln -s ../../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/filter_heritance annotation_tables - -# For exomeEB services -# ln -s ../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/exomiser/exomiser.html . - +ln -s ../../../ANALYSIS/*ANALYSIS01_EXOME/03-annotation/filter_inheritance annotation_tables From 35d6e8daedb2bf2691d86c39925d5836f207ce5e Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:50:11 +0100 Subject: [PATCH 14/25] Updated exometrio and wgstrio 03-annotation/lablogs to remove aux from _01.sh scripts --- .../ANALYSIS01_EXOME/03-annotation/lablog | 38 +++++----- .../ANALYSIS01_GENOME/03-annotation/lablog | 71 +++++++++---------- 2 files changed, 54 insertions(+), 55 deletions(-) diff --git a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog index 79bde1ca1..632e4d676 100644 --- a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog +++ b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog @@ -13,15 +13,15 @@ mkdir -p logs #----------------------------------------------------------------------------------------------------- # 1 . Lablog to modify VCF ID field before running VEP.
-echo "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"} {if( \$0 ~ /^#/ ){print \$0}else{printf \"%s\t%s\t%s\t\", \$1,\$2,\$1\"_\"\$2\"_\"\$4\"_\"\$5 ; for (i=4; i<=NF; i++){printf \"%s\t\",\$i} ; printf \"\n\"}}' ../02-postprocessing/variants_fil.vcf > ./vep/variants_fil_mod.vcf" > aux_01_bcftools_query.sh -echo "sed -i 's/\t$//' ./vep/variants_fil_mod.vcf" >> aux_01_bcftools_query.sh +echo "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"} {if( \$0 ~ /^#/ ){print \$0}else{printf \"%s\t%s\t%s\t\", \$1,\$2,\$1\"_\"\$2\"_\"\$4\"_\"\$5 ; for (i=4; i<=NF; i++){printf \"%s\t\",\$i} ; printf \"\n\"}}' ../02-postprocessing/variants_fil.vcf > ./vep/variants_fil_mod.vcf" > _01_bcftools_query.sh +echo "sed -i 's/\t$//' ./vep/variants_fil_mod.vcf" >> _01_bcftools_query.sh # 2. Create variant table. -echo "singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/bcftools:1.12--h45bccc9_1 bcftools query -H -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%FILTER\t[%GT\t%DP\t%AD\t%GQ\t]\n' ${scratch_dir}/vep/variants_fil_mod.vcf > ${scratch_dir}/vep/variants.table" >> aux_01_bcftools_query.sh -echo "sed -i -r 's/(#|\[[0-9]+\])//g' ./vep/variants.table;sed -i 's/:/_/g' ./vep/variants.table;sed -i 's/ //g' ./vep/variants.table;sed -i 's/\t*$//g' ./vep/variants.table " >> aux_01_bcftools_query.sh +echo "singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/bcftools:1.12--h45bccc9_1 bcftools query -H -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%FILTER\t[%GT\t%DP\t%AD\t%GQ\t]\n' ${scratch_dir}/vep/variants_fil_mod.vcf > ${scratch_dir}/vep/variants.table" >> _01_bcftools_query.sh +echo "sed -i -r 's/(#|\[[0-9]+\])//g' ./vep/variants.table;sed -i 's/:/_/g' ./vep/variants.table;sed -i 's/ //g' ./vep/variants.table;sed -i 's/\t*$//g' ./vep/variants.table " >> _01_bcftools_query.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/BCFTOOLSQUERY.log --job-name BCFTOOLSQUERY bash ./aux_01_bcftools_query.sh &" > _01_run_bcftools_query.sh +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/BCFTOOLSQUERY.log --job-name BCFTOOLSQUERY bash ./_01_bcftools_query.sh &" > _01_run_bcftools_query.sh #------------------------------------------------------------------------------------------------------ @@ -45,15 +45,15 @@ echo "sed -i 's/#Uploaded_variation/ID/' ./vep/vep_annot_head.txt" >> _03_Vep_pl echo "srun --partition short_idx --mem 200G --time 12:00:00 --chdir ${scratch_dir} --output logs/MERGE_ALL.log --job-name MERGE_ALL Rscript Merge_All.R" >> _03_Vep_plugin_dbNSFP_parse.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/AWK.log --job-name AWK bash ./aux_03_awk.sh &" >> _03_Vep_plugin_dbNSFP_parse.sh +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/AWK.log --job-name AWK bash ./_03_awk.sh &" >> _03_Vep_plugin_dbNSFP_parse.sh # 6. 
Filter variants_annot_all.tab -echo "awk 'NR>1 && \$58 > 0.001 {print \$0}' variants_annot_all.tab > ./variants_annot_filterAF.tab" >> aux_03_awk.sh +echo "awk 'NR>1 && \$58 > 0.001 {print \$0}' variants_annot_all.tab > ./variants_annot_filterAF.tab" >> _03_awk.sh -echo "cat header_vep_final_annot.txt variants_annot_filterAF.tab > variants_annot_filterAF_head.tab" >> aux_03_awk.sh +echo "cat header_vep_final_annot.txt variants_annot_filterAF.tab > variants_annot_filterAF_head.tab" >> _03_awk.sh -echo "rm variants_annot_filterAF.tab" >> aux_03_awk.sh +echo "rm variants_annot_filterAF.tab" >> _03_awk.sh #------------------------------------------------------------------------------------------------------- @@ -71,23 +71,23 @@ sed -i "s|BED_FILE|${bed_file}|g" ./exomiser_configfile.yml sed -i "s|OUTPUT_FOLDER|${output_folder}|g" ./exomiser_configfile.yml # THE FILE "spring.log" MUST BE DELETED IN THE CORRESPONDING NODE -echo "java -Xms2g -Xmx4g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile.yml; rm /tmp/spring.log" > aux_04_exomiser_exome.sh -echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER.log --job-name EXOMISER bash ${scratch_dir}/aux_04_exomiser_exome.sh &" > _04_exomiser_exome.sh +echo "java -Xms2g -Xmx4g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile.yml; rm /tmp/spring.log" > 1_04_exomiser_exome.sh +echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER.log --job-name EXOMISER bash ${scratch_dir}/1_04_exomiser_exome.sh &" > _04_exomiser_exome.sh #--------------------------------------------------------------------------------------------------------- ## Lablog to modify the output reported by exomiser and create a final file with a personalized format.
-# Grep variant id for each inheritance model -cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./id_%.txt " >> _05_filter_heritance.sh +# Grep variant id for each inheritance model +cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./id_%.txt " >> _05_filter_inheritance.sh -# Grep variants for each inheritance models from the full annotated variants file -cat inheritance_types.txt | xargs -I % echo "grep -f ./id_%.txt ./variants_annot_all.tab > ./vep_annot_%.txt" >> _05_filter_heritance.sh +# Grep variants for each inheritance model from the full annotated variants file +cat inheritance_types.txt | xargs -I % echo "grep -f ./id_%.txt ./variants_annot_all.tab > ./vep_annot_%.txt" >> _05_filter_inheritance.sh -cat inheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./vep_annot_%.txt > ./vep_annot_%_final.txt" >> _05_filter_heritance.sh +cat inheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./vep_annot_%.txt > ./vep_annot_%_final.txt" >> _05_filter_inheritance.sh -echo "rm id_*" >> _05_filter_heritance.sh -cat inheritance_types.txt | xargs -I % echo "rm ./vep_annot_%.txt" >> _05_filter_heritance.sh +echo "rm id_*" >> _05_filter_inheritance.sh +cat inheritance_types.txt | xargs -I % echo "rm ./vep_annot_%.txt" >> _05_filter_inheritance.sh # annot_all table is huge, lets shrink it a little bit -echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS_ALL.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_heritance.sh +echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS_ALL.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_inheritance.sh diff --git a/bu_isciii/templates/wgstrio/ANALYSIS/ANALYSIS01_GENOME/03-annotation/lablog b/bu_isciii/templates/wgstrio/ANALYSIS/ANALYSIS01_GENOME/03-annotation/lablog index 93774e4ab..966c27fb0 100644 --- a/bu_isciii/templates/wgstrio/ANALYSIS/ANALYSIS01_GENOME/03-annotation/lablog +++ b/bu_isciii/templates/wgstrio/ANALYSIS/ANALYSIS01_GENOME/03-annotation/lablog @@ -1,5 +1,4 @@ -# module load singularity -# module load Java/17.0.2.lua R/4.2.1 +# module load singularity R scratch_dir=$(echo $PWD | sed "s/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g") @@ -8,7 +7,7 @@ ln -s /data/ucct/bi/references/eukaria/homo_sapiens/cache_vep/custom_databases/d mkdir -p vep mkdir -p logs mkdir -p exomiser/{exomiser,exomiser_exome,exomiser_genes} -mkdir -p filter_heritance/{filter_heritance,filter_heritance_genes,filter_heritance_exome} +mkdir -p filter_inheritance/{filter_inheritance,filter_inheritance_genes,filter_inheritance_exome} @@ -16,14 +15,14 @@ mkdir -p filter_heritance/{filter_heritance,filter_heritance_genes,filter_herita #------------------------------------------------------------------------------------------------------------------ # 1 . Lablog to modify VCF ID field before running VEP.
-echo "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"} {if( \$0 ~ /^#/ ){print \$0}else{printf \"%s\t%s\t%s\t\", \$1,\$2,\$1\"_\"\$2\"_\"\$4\"_\"\$5 ; for (i=4; i<=NF; i++){printf \"%s\t\",\$i} ; printf \"\n\"}}' ../02-postprocessing/variants_fil.vcf > ./vep/variants_fil_mod.vcf" > aux_01_bcftools_query.sh -echo "sed -i 's/\t$//' ./vep/variants_fil_mod.vcf" >> aux_01_bcftools_query.sh +echo "awk 'BEGIN{FS=\"\t\";OFS=\"\t\"} {if( \$0 ~ /^#/ ){print \$0}else{printf \"%s\t%s\t%s\t\", \$1,\$2,\$1\"_\"\$2\"_\"\$4\"_\"\$5 ; for (i=4; i<=NF; i++){printf \"%s\t\",\$i} ; printf \"\n\"}}' ../02-postprocessing/variants_fil.vcf > ./vep/variants_fil_mod.vcf" > _01_bcftools_query.sh +echo "sed -i 's/\t$//' ./vep/variants_fil_mod.vcf" >> _01_bcftools_query.sh # 2. Create variant table. -echo "singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/bcftools:1.12--h45bccc9_1 bcftools query -H -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%FILTER\t[%GT\t%DP\t%AD\t%GQ\t]\n' ${scratch_dir}/vep/variants_fil_mod.vcf > ${scratch_dir}/vep/variants.table" >> aux_01_bcftools_query.sh -echo "sed -i -r 's/(#|\[[0-9]+\])//g' ./vep/variants.table;sed -i 's/:/_/g' ./vep/variants.table;sed -i 's/ //g' ./vep/variants.table;sed -i 's/\t*$//g' ./vep/variants.table " >> aux_01_bcftools_query.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/BCFTOOLSQUERY.log --job-name BCFTOOLSQUERY bash ./aux_01_bcftools_query.sh &" > _01_run_bcftools_query.sh +echo "singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/bcftools:1.12--h45bccc9_1 bcftools query -H -f '%CHROM\t%POS\t%ID\t%REF\t%ALT\t%FILTER\t[%GT\t%DP\t%AD\t%GQ\t]\n' ${scratch_dir}/vep/variants_fil_mod.vcf > ${scratch_dir}/vep/variants.table" >> _01_bcftools_query.sh +echo "sed -i -r 's/(#|\[[0-9]+\])//g' ./vep/variants.table;sed -i 's/:/_/g' ./vep/variants.table;sed -i 's/ //g' ./vep/variants.table;sed -i 's/\t*$//g' ./vep/variants.table " >> _01_bcftools_query.sh +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/BCFTOOLSQUERY.log --job-name BCFTOOLSQUERY bash ./_01_bcftools_query.sh &" > _01_run_bcftools_query.sh #-------------------------------------------------------------------------------------------------------------------- @@ -48,15 +47,15 @@ echo "sed -i 's/#Uploaded_variation/ID/' ./vep/vep_annot_head.txt" >> _03_Vep_pl echo "srun --partition short_idx --mem 200G --time 12:00:00 --chdir ${scratch_dir} --output logs/MERGE_ALL.log --job-name MERGE_ALL Rscript Merge_All.R" >> _03_Vep_plugin_dbNSFP_parse.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/AWK.log --job-name AWK bash ./aux_03_awk.sh &" >> _03_Vep_plugin_dbNSFP_parse.sh +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/AWK.log --job-name AWK bash ./_03_awk.sh &" >> _03_Vep_plugin_dbNSFP_parse.sh # 6. 
Filter variants_annot_all.tab -echo "awk 'NR>1 && \$58 > 0.001 {print \$0}' variants_annot_all.tab > ./variants_annot_filterAF.tab" >> aux_03_awk.sh +echo "awk 'NR>1 && \$58 > 0.001 {print \$0}' variants_annot_all.tab > ./variants_annot_filterAF.tab" >> _03_awk.sh -echo "cat header_vep_final_annot.txt variants_annot_filterAF.tab > variants_annot_filterAF_head.tab" >> aux_03_awk.sh +echo "cat header_vep_final_annot.txt variants_annot_filterAF.tab > variants_annot_filterAF_head.tab" >> _03_awk.sh -echo "rm variants_annot_filterAF.tab" >> aux_03_awk.sh +echo "rm variants_annot_filterAF.tab" >> _03_awk.sh #--------------------------------------------------------------------------------------------------------------------------------- @@ -73,8 +72,8 @@ sed -i "s|PROBAND|${proband}|g" ./exomiser_configfile.yml #sed -i "s|BED_FILE|${bed_file}|g" ./exomiser_configfile.yml sed -i "s|OUTPUT_FOLDER|${output_folder}|g" ./exomiser_configfile.yml -echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile.yml; rm /tmp/spring.log" > aux1_04_exomiser_ALL.sh -echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER.log --job-name EXOMISER bash ${scratch_dir}/aux1_04_exomiser_ALL.sh &" > _04_exomiser_ALL.sh +echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile.yml; rm /tmp/spring.log" > 1_04_exomiser_ALL.sh +echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER.log --job-name EXOMISER bash ${scratch_dir}/1_04_exomiser_ALL.sh &" > _04_exomiser_ALL.sh ## 8. Running exomiser_exome @@ -90,8 +89,8 @@ sed -i "s|PROBAND|${proband}|g" ./exomiser_configfile_exome.yml sed -i "s|BED_FILE|${bed_file}|g" ./exomiser_configfile_exome.yml sed -i "s|OUTPUT_FOLDER|${output_folder}|g" ./exomiser_configfile_exome.yml -echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile_exome.yml; rm /tmp/spring.log" > aux2_04_exomiser_exome.sh -echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER_exome.log --job-name EXOMISER bash ${scratch_dir}/aux2_04_exomiser_exome.sh &" >> _04_exomiser_ALL.sh +echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile_exome.yml; rm /tmp/spring.log" > 2_04_exomiser_exome.sh +echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER_exome.log --job-name EXOMISER bash ${scratch_dir}/2_04_exomiser_exome.sh &" >> _04_exomiser_ALL.sh ## 9. 
Running exomiser_genes @@ -107,35 +106,35 @@ sed -i "s|PROBAND|${proband}|g" ./exomiser_configfile_genes.yml sed -i "s|BED_FILE|${bed_file}|g" ./exomiser_configfile_genes.yml sed -i "s|OUTPUT_FOLDER|${output_folder}|g" ./exomiser_configfile_genes.yml -echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile_genes.yml; rm /tmp/spring.log" > aux3_04_exomiser_genes.sh -echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER_genes.log --job-name EXOMISER bash ${scratch_dir}/aux3_04_exomiser_genes.sh &" >> _04_exomiser_ALL.sh +echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile_genes.yml; rm /tmp/spring.log" > 3_04_exomiser_genes.sh +echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER_genes.log --job-name EXOMISER bash ${scratch_dir}/3_04_exomiser_genes.sh &" >> _04_exomiser_ALL.sh #-------------------------------------------------------------------------------------------------------- #10. Lablog to modify the output reported by exomiser and create a final file with a personalized format. For each exomiser analysis (whole genome, exome, genes) -# Grep variant id for each inheritance model +# Grep variant id for each inheritance model -cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_heritance/filter_heritance/id_%.txt " > _05_filter_heritance_ALL.sh -cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_genes/exomiser_genes_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_heritance/filter_heritance_genes/id_%_genes.txt " >> _05_filter_heritance_ALL.sh -cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_exome/exomiser_exome_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_heritance/filter_heritance_exome/id_%_exome.txt " >> _05_filter_heritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_inheritance/filter_inheritance/id_%.txt " > _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_genes/exomiser_genes_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_inheritance/filter_inheritance_genes/id_%_genes.txt " >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_exome/exomiser_exome_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_inheritance/filter_inheritance_exome/id_%_exome.txt " >> _05_filter_inheritance_ALL.sh -# Grep variants for each inheritance models from the full annotated variants file +# Grep variants for each inheritance model from the full annotated variants file -cat inheritance_types.txt | xargs -I % echo "grep -f ./filter_heritance/filter_heritance/id_%.txt ./variants_annot_all.tab > ./filter_heritance/filter_heritance/vep_annot_%.txt" >> _05_filter_heritance_ALL.sh -cat inheritance_types.txt | xargs -I % echo "grep -f ./filter_heritance/filter_heritance_genes/id_%_genes.txt ./variants_annot_all.tab > ./filter_heritance/filter_heritance_genes/vep_annot_%_genes.txt" >> _05_filter_heritance_ALL.sh -cat inheritance_types.txt | xargs -I %
echo "grep -f ./filter_heritance/filter_heritance_exome/id_%_exome.txt ./variants_annot_all.tab > ./filter_heritance/filter_heritance_exome/vep_annot_%_exome.txt" >> _05_filter_heritance_ALL.sh +cat ininheritance_types.txt | xargs -I % echo "grep -f ./filter_inheritance/filter_inheritance/id_%.txt ./variants_annot_all.tab > ./filter_inheritance/filter_inheritance/vep_annot_%.txt" >> _05_filter_inheritance_ALL.sh +cat ininheritance_types.txt | xargs -I % echo "grep -f ./filter_inheritance/filter_inheritance_genes/id_%_genes.txt ./variants_annot_all.tab > ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes.txt" >> _05_filter_inheritance_ALL.sh +cat ininheritance_types.txt | xargs -I % echo "grep -f ./filter_inheritance/filter_inheritance_exome/id_%_exome.txt ./variants_annot_all.tab > ./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome.txt" >> _05_filter_inheritance_ALL.sh -cat inheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_heritance/filter_heritance/vep_annot_%.txt > ./filter_heritance/filter_heritance/vep_annot_%_final.txt" >> _05_filter_heritance_ALL.sh -cat inheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_heritance/filter_heritance_genes/vep_annot_%_genes.txt > ./filter_heritance/filter_heritance_genes/vep_annot_%_genes_final.txt" >> _05_filter_heritance_ALL.sh -cat inheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_heritance/filter_heritance_exome/vep_annot_%_exome.txt > ./filter_heritance/filter_heritance_exome/vep_annot_%_exome_final.txt" >> _05_filter_heritance_ALL.sh +cat ininheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_inheritance/filter_inheritance/vep_annot_%.txt > ./filter_inheritance/filter_inheritance/vep_annot_%_final.txt" >> _05_filter_inheritance_ALL.sh +cat ininheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes.txt > ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes_final.txt" >> _05_filter_inheritance_ALL.sh +cat ininheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome.txt > ./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome_final.txt" >> _05_filter_inheritance_ALL.sh -echo "rm ./filter_heritance/filter_heritance/id_*" >> _05_filter_heritance_ALL.sh -cat inheritance_types.txt | xargs -I % echo "rm ./filter_heritance/filter_heritance/vep_annot_%.txt" >> _05_filter_heritance_ALL.sh -echo "rm ./filter_heritance/filter_heritance_genes/id_*" >> _05_filter_heritance_ALL.sh -cat inheritance_types.txt | xargs -I % echo "rm ./filter_heritance/filter_heritance_genes/vep_annot_%_genes.txt" >> _05_filter_heritance_ALL.sh -echo "rm ./filter_heritance/filter_heritance_exome/id_*" >> _05_filter_heritance_ALL.sh -cat inheritance_types.txt | xargs -I % echo "rm ./filter_heritance/filter_heritance_exome/vep_annot_%_exome.txt" >> _05_filter_heritance_ALL.sh +echo "rm ./filter_inheritance/filter_inheritance/id_*" >> _05_filter_inheritance_ALL.sh +cat ininheritance_types.txt | xargs -I % echo "rm ./filter_inheritance/filter_inheritance/vep_annot_%.txt" >> _05_filter_inheritance_ALL.sh +echo "rm ./filter_inheritance/filter_inheritance_genes/id_*" >> _05_filter_inheritance_ALL.sh +cat ininheritance_types.txt | xargs -I % echo "rm ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes.txt" >> _05_filter_inheritance_ALL.sh +echo "rm 
./filter_inheritance/filter_inheritance_exome/id_*" >> _05_filter_inheritance_ALL.sh +cat ininheritance_types.txt | xargs -I % echo "rm ./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome.txt" >> _05_filter_inheritance_ALL.sh # annot_all table is huge, lets shrink it a little bit -echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_heritance_ALL.sh +echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_inheritance_ALL.sh From 21806965ebebce0615b3b90ea9d5cafcf0d4c5d1 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:50:22 +0100 Subject: [PATCH 15/25] Created plasmid's results lablog --- .../plasmidid/RESULTS/lablog_plasmidid_results | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 bu_isciii/templates/plasmidid/RESULTS/lablog_plasmidid_results diff --git a/bu_isciii/templates/plasmidid/RESULTS/lablog_plasmidid_results b/bu_isciii/templates/plasmidid/RESULTS/lablog_plasmidid_results new file mode 100644 index 000000000..1cd63cd27 --- /dev/null +++ b/bu_isciii/templates/plasmidid/RESULTS/lablog_plasmidid_results @@ -0,0 +1,10 @@ +DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega01" + +mkdir -p "${DELIVERY_FOLDER}/plasmidid" + +cd ${DELIVERY_FOLDER}/plasmidid + +ln -s ../../../ANALYSIS/*_PLASMIDID/*/*/images/*.png . +ln -s ../../../ANALYSIS/*_PLASMIDID/*/*/*.html . + +cd - From 72b73565f364e8c4a38dda833373fbca24073a32 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:50:58 +0100 Subject: [PATCH 16/25] Updated mtbseq's lablog to remove unnecessary single quotes --- .../mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog index d6adc353c..b1871686d 100644 --- a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog +++ b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog @@ -13,9 +13,9 @@ cat ../samples_id.txt | xargs -I @@ echo -e "srun --job-name MTBSEQ.@@ --output # classification echo "mkdir classification_all" > _03_gather_results.sh -echo 'FIRST_SAMPLE=$(head -n1 samples_id.txt); head -n 1 ${FIRST_SAMPLE}/Classification/Strain_Classification.tab > classification_all/strain_classification_all.tab; grep "^'\'''"$analysis_year"'" */Classification/Strain_Classification.tab | cut -d ":" -f 2 >> classification_all/strain_classification_all.tab' >> _03_gather_results.sh +echo "FIRST_SAMPLE=\$(head -n1 ../samples_id.txt); head -n 1 \${FIRST_SAMPLE}/Classification/Strain_Classification.tab > classification_all/strain_classification_all.tab; grep \"^'\$analysis_year\" */Classification/Strain_Classification.tab | cut -d \":\" -f 2 | sed \"s/'//g\" >> classification_all/strain_classification_all.tab" >> _03_gather_results.sh # resistances echo "mkdir resistances_all" >> _03_gather_results.sh cat ../samples_id.txt | xargs -I % echo "cp %/Amend/NONE_joint_cf4_cr4_fr75_ph4_samples1_amended.tab resistances_all/%_var_res.tab" >> _03_gather_results.sh # stats -echo 'mkdir stats_all; FIRST_SAMPLE=$(head -n1 ../samples_id.txt); head -n 1 ${FIRST_SAMPLE}/Statistics/Mapping_and_Variant_Statistics.tab > stats_all/statistics_all.tab; grep "^'\'''"$analysis_year"'" 
*/Statistics/Mapping_and_Variant_Statistics.tab | cut -d ":" -f 2 >> stats_all/statistics_all.tab' >> _03_gather_results.sh +echo "mkdir stats_all; FIRST_SAMPLE=\$(head -n1 ../samples_id.txt); head -n 1 \${FIRST_SAMPLE}/Statistics/Mapping_and_Variant_Statistics.tab > stats_all/statistics_all.tab; grep \"^'\$analysis_year\" */Statistics/Mapping_and_Variant_Statistics.tab | cut -d ":" -f 2 | sed \"s/'//g\" >> stats_all/statistics_all.tab" >> _03_gather_results.sh From 6253efe634d97643322de8e336b440014fc72b4f Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 17:51:36 +0100 Subject: [PATCH 17/25] Fixed snippy's lablog to be better explained and updated its results' lablog --- .../ANALYSIS01_SNIPPY/04-snippy/lablog | 40 ++++++++++++++----- .../snippy/RESULTS/lablog_snippy_results | 3 +- 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog index e9f837feb..0c0cd03db 100644 --- a/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog +++ b/bu_isciii/templates/snippy/ANALYSIS/ANALYSIS01_SNIPPY/04-snippy/lablog @@ -4,31 +4,49 @@ scratch_dir=$(echo $PWD | sed 's/\/data\/ucct\/bi\/scratch_tmp/\/scratch/g') mkdir logs +# BLOCK 1: CREATION OF INPUT.TAB AND COMMANDS.OUT + +## When creating input.tab, run the following line if you want to use .fastq.gz files for your analysis. If not, comment it. cat ../samples_id.txt | while read in; do echo -e "${in}\t${scratch_dir}/../02-preprocessing/${in}/${in}_R1_filtered.fastq.gz\t${scratch_dir}/../02-preprocessing/${in}/${in}_R2_filtered.fastq.gz"; done >> input.tab -ls ${scratch_dir}/../../../REFERENCES | xargs -I %% singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 snippy-multi ${scratch_dir}/input.tab --mincov 9 --mapqual 10 --basequal 5 --minqual 30 --ref ${scratch_dir}/../../../REFERENCES/%% --cpus 5 > commands.out +## If you are going to include .fasta files in the analysis, run the following line. Run the previous line as well if you also have .fastq.gz files for your analysis. +## Bear in mind this line assumes the .fasta files are inside a certain folder within REFERENCES! Change this line accordingly. +## ls ../../../REFERENCES/*/*.fasta | cut -d '/' -f6 | sed 's/.fasta//g' | while read in; do paste <(echo ${in}) <(echo ${scratch_dir}/../../../REFERENCES/*/${in}.fasta); done >> input.tab + +ls ${scratch_dir}/../../../REFERENCES | xargs -I %% singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 snippy-multi ${scratch_dir}/input.tab --mincov 9 --mapqual 10 --basequal 5 --minqual 30 --ref ${scratch_dir}/../../../REFERENCES/%% --cpus 5 > commands.out + +# BLOCK 2: CREATION OF _00_snippy.sh head -n -1 commands.out | sed -e "s@^@srun --chdir ${scratch_dir} --output logs/SNIPPY.%j.log --job-name SNIPPY --cpus-per-task 5 --mem 49152 --partition short_idx --time 02:00:00 env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 @" | awk '{print $0" &"}' > _00_snippy.sh + +# BLOCK 3: CREATION OF _01_snippy_core.sh + +## A) BY DEFAULT: snippy-core will run without masking any positions.
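+## For orientation: snippy-core normally writes core.aln (the core-SNP alignment), core.full.aln
+## (the whole-genome alignment consumed by snp-sites in BLOCK 4 and by gubbins in BLOCK 5) and
+## core.tab (the per-position variant table used for the pairwise comparison in BLOCK 4).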
tail -n 1 commands.out | sed -e "s@^@srun --chdir ${scratch_dir} --output logs/SNIPPY_CORE.%j.log --job-name SNIPPY --cpus-per-task 5 --mem 49152 --partition short_idx --time 02:00:00 env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 @" | awk '{print $0" &"}' > _01_snippy_core.sh +## B) If you want to mask complex variants, uncomment these lines: +## echo "grep \"complex\" ./*/snps.vcf | cut -f 1,2,4,5 | cut -d \":\" -f 2 | sort -u | awk '{pos1=\$2; len_ref=length(\$3); printf \"%s\t%s\t%s\n\", \$1, pos1-1, pos1+len_ref+1}' | grep -v \"^#\" > mask_complex_variants.bed" > _01_snippy_core.sh +## ls ${scratch_dir}/../../../REFERENCES | xargs -I %% echo "snippy-core --debug --mask ./mask_complex_variants.bed --mask-char 'N' --ref '../../../REFERENCES/%%' $(cat ../samples_id.txt | xargs)" >> _01_snippy_core.sh +## C) If you want to mask low-coverage variants, uncomment these lines: +## echo "awk -F'\t' '/^NZ_/ {split(\$9, format, \":\"); split(\$10, values, \":\"); for (i in format) if (format[i] == \"DP\" && values[i] < 10) print \$1, \$2 - 1, \$2}' OFS='\t' ./*/snps.vcf > mask_low_coverage_variants.bed" > _01_snippy_core.sh +## ls ${scratch_dir}/../../../REFERENCES | xargs -I %% echo "snippy-core --debug --mask ./mask_low_coverage_variants.bed --mask-char 'N' --ref '../../../REFERENCES/%%' $(cat ../samples_id.txt | xargs)" >> _01_snippy_core.sh -# Execute core genome SNIPPY -# CODE CONTEXT: this block was used in the service: AZORHIZOBIOUMOUTBREAK01 on november 2022 -# Comment las line from _00_snippy.sh -# echo "grep \"complex\" ./*/snps.vcf | cut -f 1,2,4,5 | cut -d \":\" -f 2 | sort -u | awk '{pos1=\$2; len_ref=length(\$3); printf \"%s\t%s\t%s\n\", \$1, pos1-1, pos1+len_ref+1}' | grep -v \"^#\" > mask_complex_variants.bed" > _01_snippy_core.sh -# ls ${scratch_dir}/../../../REFERENCES | xargs -I %% echo "snippy-core --debug --mask ./mask_complex_variants.bed --mask-char 'N' --ref '../../../REFERENCES/%%' $(cat ../samples_id.txt | xargs)" >> _01_snippy_core.sh + +# BLOCK 4: CREATION OF _02_phylo_aln.sh echo "srun --chdir ${scratch_dir} --output logs/SNIP-SITES.%j.log --job-name SNIP-SITES --cpus-per-task 5 --mem 49152 --partition short_idx --time 02:00:00 env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 snp-sites -b -c -o phylo.aln core.full.aln &" > _02_phylo_aln.sh -# awk 'BEGIN{FS="[> ]"} /^>/{val=$2;next} {print val,length($0)}' phylo.aln +## Run this line if you want to know the size of phylo.aln +## awk 'BEGIN{FS="[> ]"} /^>/{val=$2;next} {print val,length($0)}' phylo.aln + +## Run this line to compare samples in pairs (CHANGE THIS LINE ACCORDING TO YOUR NEEDS, THIS IS JUST AN EXAMPLE!)
+## awk '$4 != $5 || $4 != $6 || $5 != $6' core.tab > differences.txt -#code to compare samples inpairs -# awk '$4 != $5 || $4 != $6 || $5 != $6' core.tab > differences.txt + +# BLOCK 5: CREATION OF _03_gubbins.sh to run GUBBINS (in order to filter recombinant sites): -## GUBBINS commands echo "env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 snippy-clean_full_aln core.full.aln > clean.full.aln" > _03_gubbins.sh echo "singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/gubbins:3.3.5--py39pl5321he4a0461_0 run_gubbins.py --threads 20 -p gubbins clean.full.aln" >> _03_gubbins.sh echo "env - PATH="$PATH" singularity exec -B ${scratch_dir}/../../../ /data/ucct/bi/pipelines/singularity-images/snippy:4.6.0--hdfd78af_4 snp-sites -c gubbins.filtered_polymorphic_sites.fasta > clean.core.aln" >> _03_gubbins.sh -# Run gubbins + +## Run gubbins echo "srun --chdir ${scratch_dir} --output logs/GUBBINS.%j.log --job-name GUBBINS --cpus-per-task 20 --mem 49152 --partition short_idx --time 02:00:00 bash _03_gubbins.sh &" > _03_run_gubbins.sh diff --git a/bu_isciii/templates/snippy/RESULTS/lablog_snippy_results b/bu_isciii/templates/snippy/RESULTS/lablog_snippy_results index af98615dd..09820f9e7 100644 --- a/bu_isciii/templates/snippy/RESULTS/lablog_snippy_results +++ b/bu_isciii/templates/snippy/RESULTS/lablog_snippy_results @@ -1,11 +1,10 @@ -DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega" +DELIVERY_FOLDER="$(date '+%Y%m%d')_entrega01" mkdir $DELIVERY_FOLDER mkdir "${DELIVERY_FOLDER}/snp" # SNIPPY service cd $DELIVERY_FOLDER/snp -ln -s ../../../ANALYSIS/*SNIPPY/*snippy/*xlsx . ln -s ../../../ANALYSIS/*SNIPPY/*iqtree/phylo.iqtree.bootstrap.treefile phylo.iqtree.bootstrap.nwk ln -s ../../../ANALYSIS/*SNIPPY/99-stats/variants_stats.txt . ln -s ../../../ANALYSIS/*SNIPPY/99-stats/mapping_stats_summary.txt .
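For reference, the complex-variant mask built in BLOCK 3, option B, is easiest to follow on a toy record. The sketch below is illustrative only: the file name snps_example.vcf and the record in it are invented, and the real command additionally pipes through cut -d ":" -f 2 and grep -v "^#" because it greps across every */snps.vcf (multi-file grep prefixes each hit with the file name). The cut/awk arithmetic is the same:

```bash
#!/usr/bin/env bash
# Hypothetical one-record snps.vcf (tab-separated: CHROM POS ID REF ALT QUAL FILTER INFO).
printf 'contig1\t100\t.\tACGT\tTTGA\t.\tPASS\tTYPE=complex\n' > snps_example.vcf

# Keep CHROM, POS, REF, ALT, then emit a BED interval of length len(REF)+2
# starting at POS-1 (0-based); snippy-core later replaces these positions with 'N'.
grep "complex" snps_example.vcf | cut -f 1,2,4,5 | sort -u |
  awk '{pos1=$2; len_ref=length($3); printf "%s\t%s\t%s\n", $1, pos1-1, pos1+len_ref+1}' \
  > mask_complex_variants.bed

cat mask_complex_variants.bed   # -> contig1  99  105
```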
From 45c5a666548372838c98f0dcd3a7a9dcd5943867 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 22:30:17 +0100 Subject: [PATCH 18/25] Updated version in pyproject.toml and __main__.py --- bu_isciii/__main__.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/__main__.py b/bu_isciii/__main__.py index f06da05e5..26aa67b6b 100755 --- a/bu_isciii/__main__.py +++ b/bu_isciii/__main__.py @@ -57,7 +57,7 @@ def run_bu_isciii(): ) # stderr.print("[green] `._,._,'\n", highlight=False) - __version__ = "2.2.4" + __version__ = "2.2.5" stderr.print( "[grey39] BU-ISCIII-tools version {}".format(__version__), highlight=False ) diff --git a/pyproject.toml b/pyproject.toml index ddc428d66..1192212f9 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "buisciii-tools" -version = "2.2.4" +version = "2.2.5" dynamic = ["dependencies"] authors = [ From de4a16d265ac1789104ed537e51d5e5142c5472c Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 22:30:43 +0100 Subject: [PATCH 19/25] Fixed URL from taxprofiler.md --- bu_isciii/assets/reports/results/taxprofiler.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/assets/reports/results/taxprofiler.md b/bu_isciii/assets/reports/results/taxprofiler.md index d16ab8597..d847ab3e5 100644 --- a/bu_isciii/assets/reports/results/taxprofiler.md +++ b/bu_isciii/assets/reports/results/taxprofiler.md @@ -1,6 +1,6 @@ ## Taxprofiler -Here we describe the results from the (nf-core/taxprofiler)[https://nf-co.re/taxprofiler/1.1.8] pipeline for multispecies taxonomic classification and profiling of shorgun short- and long-read. +Here we describe the results from the [nf-core/taxprofiler](https://nf-co.re/taxprofiler/) pipeline for multispecies taxonomic classification and profiling of shotgun short- and long-read data. * `taxprofiler/multiqc_report.html`: Final HTML report collecting numerical stats from each module executed in this pipeline.
* `taxprofiler/krona/database_*.html`: Interactive HTML files generated by Krona, displaying the results of taxonomic classification for supported tools (Kraken2, Centrifuge, Kaiju, and MALT) From abe2afe1966595a95d3da276f388a422aa29863f Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 22:31:08 +0100 Subject: [PATCH 20/25] Fixed singularity bind in lablog from chewbbaca/REFERENCES --- bu_isciii/templates/chewbbaca/REFERENCES/lablog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/chewbbaca/REFERENCES/lablog b/bu_isciii/templates/chewbbaca/REFERENCES/lablog index 6e8f58f47..aa194ab2a 100644 --- a/bu_isciii/templates/chewbbaca/REFERENCES/lablog +++ b/bu_isciii/templates/chewbbaca/REFERENCES/lablog @@ -10,4 +10,4 @@ scratch_dir=$(echo $PWD | sed "s/\/data\ucct/\/bi\/scratch_tmp/\/scratch/g") echo "srun --chdir $scratch_dir --output logs/CHEWBBACA-CREATE-SCHEMA.%j.log --job-name CHEWBBACA-CREATE-SCHEMA --partition middle_idx --time 12:00:00 singularity exec --bind ${scratch_dir}/../ /data/ucct/bi/pipelines/singularity-images/chewbbaca:3.3.3--pyhdfd78af_0 chewBBACA.py CreateSchema -i ${scratch_dir}/%% -o ./created_schema --cpu 4 &" > _01_create_schema.sh # cgMLST extraction -echo "srun --chdir $scratch_dir --output logs/CHEWBBACA-EXTRACT-CGMLST.%j.log --job-name CHEWBBACA-EXTRACT-CGMLST --partition middle_idx --time 12:00:00 singularity exec --bind ${scratch_dir}/../../ /data/ucct/bi/pipelines/singularity-images/chewbbaca:3.3.3--pyhdfd78af_0 chewBBACA.py ExtractCgMLST -i ${scratch_dir}/../ANALYSIS/*/*-chewbbaca/allele_calling/results_alleles.tsv -o ./results_cgmlst &" > _02_extract_cgmlst.sh +echo "srun --chdir $scratch_dir --output logs/CHEWBBACA-EXTRACT-CGMLST.%j.log --job-name CHEWBBACA-EXTRACT-CGMLST --partition middle_idx --time 12:00:00 singularity exec --bind ${scratch_dir}/../ /data/ucct/bi/pipelines/singularity-images/chewbbaca:3.3.3--pyhdfd78af_0 chewBBACA.py ExtractCgMLST -i ${scratch_dir}/../ANALYSIS/*/*-chewbbaca/allele_calling/results_alleles.tsv -o ./results_cgmlst &" > _02_extract_cgmlst.sh From 24ef3127d8ce6abbe7c90b3e2fc66b43f2ea63aa Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 22:31:27 +0100 Subject: [PATCH 21/25] Fixed irma_stats.txt name in create_irma_stats_flu.sh --- .../ANALYSIS01_IRMA/04-irma/create_irma_stats_flu.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_stats_flu.sh b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_stats_flu.sh index 11f075497..f0e1cdbde 100755 --- a/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_stats_flu.sh +++ b/bu_isciii/templates/IRMA/ANALYSIS/ANALYSIS01_IRMA/04-irma/create_irma_stats_flu.sh @@ -1,5 +1,5 @@ -echo -e "sample_ID\tTotalReads\tMappedReads\t%MappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats.txt +echo -e "sample_ID\tTotalReads\tMappedReads\t%MappedReads\tFlu_type\tReads_HA\tReads_MP\tReads_NA\tReads_NP\tReads_NS\tReads_PA\tReads_PB1\tReads_PB2" > irma_stats_flu.txt cat ../samples_id.txt | while read in do @@ -24,11 +24,11 @@ else LINE=$(paste <(echo $SAMPLE_ID) <(echo $TOTAL_READS) <(echo $MAPPEDREADS) <(echo $PCTMAPPED) <(echo $FLU_TYPE) <(echo $HA) <(echo $MP) <(echo $NA) <(echo $NP) <(echo $NS) <(echo $PA) <(echo $PB1) <(echo $PB2)) fi -echo "$LINE" >> irma_stats.txt +echo "$LINE" >> irma_stats_flu.txt done -ANY_C=$(grep "C_" irma_stats.txt) 
+ANY_C=$(grep "C_" irma_stats_flu.txt) if [[ -n "$ANY_C" ]]; then - sed -i 's/Reads_PB2/Reads_PB2\tReads_HE/g' irma_stats.txt + sed -i 's/Reads_PB2/Reads_PB2\tReads_HE/g' irma_stats_flu.txt fi From 5c5cf7b84a142a5b2d9287ef68a8fbec31d1e1dc Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 22:31:50 +0100 Subject: [PATCH 22/25] Fixed minor mistake in mtbseq/lablog --- .../mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog index b1871686d..4ebfbcedb 100644 --- a/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog +++ b/bu_isciii/templates/mtbseq/ANALYSIS/ANALYSIS01_MTBSEQ/03-MTBSeq/lablog @@ -18,4 +18,4 @@ echo "FIRST_SAMPLE=\$(head -n1 ../samples_id.txt); head -n 1 \${FIRST_SAMPLE}/Cl echo "mkdir resistances_all" >> _03_gather_results.sh cat ../samples_id.txt | xargs -I % echo "cp %/Amend/NONE_joint_cf4_cr4_fr75_ph4_samples1_amended.tab resistances_all/%_var_res.tab" >> _03_gather_results.sh # stats -echo "mkdir stats_all; FIRST_SAMPLE=\$(head -n1 ../samples_id.txt); head -n 1 \${FIRST_SAMPLE}/Statistics/Mapping_and_Variant_Statistics.tab > stats_all/statistics_all.tab; grep \"^'\$analysis_year\" */Statistics/Mapping_and_Variant_Statistics.tab | cut -d ":" -f 2 | sed \"s/'//g\" >> stats_all/statistics_all.tab" >> _03_gather_results.sh +echo "mkdir stats_all; FIRST_SAMPLE=\$(head -n1 ../samples_id.txt); head -n 1 \${FIRST_SAMPLE}/Statistics/Mapping_and_Variant_Statistics.tab > stats_all/statistics_all.tab; grep \"^'\$analysis_year\" */Statistics/Mapping_and_Variant_Statistics.tab | cut -d \":\" -f 2 | sed \"s/'//g\" >> stats_all/statistics_all.tab" >> _03_gather_results.sh From 232db7a5b47c81569cdb69b68ab180b2d957d357 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 22:32:12 +0100 Subject: [PATCH 23/25] Fixed minor mistakes in exometrio and wgstrio 03-annotation lablogs --- .../ANALYSIS01_EXOME/03-annotation/lablog | 24 ++++----- .../ANALYSIS01_GENOME/03-annotation/lablog | 53 +++++++++---------- 2 files changed, 37 insertions(+), 40 deletions(-) diff --git a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog index 632e4d676..6d3776289 100644 --- a/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog +++ b/bu_isciii/templates/exometrio/ANALYSIS/ANALYSIS01_EXOME/03-annotation/lablog @@ -45,15 +45,15 @@ echo "sed -i 's/#Uploaded_variation/ID/' ./vep/vep_annot_head.txt" >> _03_Vep_pl echo "srun --partition short_idx --mem 200G --time 12:00:00 --chdir ${scratch_dir} --output logs/MERGE_ALL.log --job-name MERGE_ALL Rscript Merge_All.R" >> _03_Vep_plugin_dbNSFP_parse.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/AWK.log --job-name AWK bash ./_03_awk.sh &" >> _03_Vep_plugin_dbNSFP_parse.sh +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/AWK.log --job-name AWK bash ./aux_03_awk.sh &" >> _03_Vep_plugin_dbNSFP_parse.sh # 6. 
Filter variants_annot_all.tab -echo "awk 'NR>1 && \$58 > 0.001 {print \$0}' variants_annot_all.tab > ./variants_annot_filterAF.tab" >> _03_awk.sh +echo "awk 'NR>1 && \$58 > 0.001 {print \$0}' variants_annot_all.tab > ./variants_annot_filterAF.tab" >> aux_03_awk.sh -echo "cat header_vep_final_annot.txt variants_annot_filterAF.tab > variants_annot_filterAF_head.tab" >> _03_awk.sh +echo "cat header_vep_final_annot.txt variants_annot_filterAF.tab > variants_annot_filterAF_head.tab" >> aux_03_awk.sh -echo "rm variants_annot_filterAF.tab" >> _03_awk.sh +echo "rm variants_annot_filterAF.tab" >> aux_03_awk.sh #------------------------------------------------------------------------------------------------------- @@ -71,23 +71,23 @@ sed -i "s|BED_FILE|${bed_file}|g" ./exomiser_configfile.yml sed -i "s|OUTPUT_FOLDER|${output_folder}|g" ./exomiser_configfile.yml # THE FILE "spring.log" MUST BE DELETED IN THE CORRESPONDING NODE -echo "java -Xms2g -Xmx4g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile.yml; rm /tmp/spring.log" > _04_exomiser_exome.sh -echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER.log --job-name EXOMISER bash ${scratch_dir}/_04_exomiser_exome.sh &" > _04_exomiser_exome.sh +echo "java -Xms2g -Xmx4g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile.yml; rm /tmp/spring.log" > aux_04_exomiser_exome.sh +echo "srun --partition short_idx --mem 100G --time 2:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER.log --job-name EXOMISER bash ${scratch_dir}/aux_04_exomiser_exome.sh &" > _04_exomiser_exome.sh #--------------------------------------------------------------------------------------------------------- ## Lablog to modify the output reported by exomiser and create a final file with a personalized format. 
-# Grep variant id for each ininheritance model -cat ininheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./id_%.txt " >> _05_filter_inheritance.sh +# Grep variant id for each inheritance model +cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./id_%.txt " >> _05_filter_inheritance.sh -# Grep variants for each ininheritance models from the full annotated variants file -cat ininheritance_types.txt | xargs -I % echo "grep -f ./id_%.txt ./variants_annot_all.tab > ./vep_annot_%.txt" >> _05_filter_inheritance.sh +# Grep variants for each inheritance model from the full annotated variants file +cat inheritance_types.txt | xargs -I % echo "grep -f ./id_%.txt ./variants_annot_all.tab > ./vep_annot_%.txt" >> _05_filter_inheritance.sh -cat ininheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./vep_annot_%.txt > ./vep_annot_%_final.txt" >> _05_filter_inheritance.sh +cat inheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./vep_annot_%.txt > ./vep_annot_%_final.txt" >> _05_filter_inheritance.sh echo "rm id_*" >> _05_filter_inheritance.sh -cat ininheritance_types.txt | xargs -I % echo "rm ./vep_annot_%.txt" >> _05_filter_inheritance.sh +cat inheritance_types.txt | xargs -I % echo "rm ./vep_annot_%.txt" >> _05_filter_inheritance.sh # annot_all table is huge, lets shrink it a little bit echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS_ALL.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_inheritance.sh diff --git a/bu_isciii/templates/wgstrio/ANALYSIS/ANALYSIS01_GENOME/03-annotation/lablog b/bu_isciii/templates/wgstrio/ANALYSIS/ANALYSIS01_GENOME/03-annotation/lablog index 966c27fb0..009af2365 100644 --- a/bu_isciii/templates/wgstrio/ANALYSIS/ANALYSIS01_GENOME/03-annotation/lablog +++ b/bu_isciii/templates/wgstrio/ANALYSIS/ANALYSIS01_GENOME/03-annotation/lablog @@ -7,10 +7,7 @@ ln -s /data/ucct/bi/references/eukaria/homo_sapiens/cache_vep/custom_databases/d mkdir -p vep mkdir -p logs mkdir -p exomiser/{exomiser,exomiser_exome,exomiser_genes} -mkdir -p filter_ininheritance/{filter_ininheritance,filter_ininheritance_genes,filter_ininheritance_exome} - - - +mkdir -p filter_inheritance/{filter_inheritance,filter_inheritance_genes,filter_inheritance_exome} #------------------------------------------------------------------------------------------------------------------ @@ -47,15 +44,15 @@ echo "sed -i 's/#Uploaded_variation/ID/' ./vep/vep_annot_head.txt" >> _03_Vep_pl echo "srun --partition short_idx --mem 200G --time 12:00:00 --chdir ${scratch_dir} --output logs/MERGE_ALL.log --job-name MERGE_ALL Rscript Merge_All.R" >> _03_Vep_plugin_dbNSFP_parse.sh -echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/AWK.log --job-name AWK bash ./_03_awk.sh &" >> _03_Vep_plugin_dbNSFP_parse.sh +echo "srun --partition short_idx --time 2:00:00 --chdir ${scratch_dir} --output logs/AWK.log --job-name AWK bash ./aux_03_awk.sh &" >> _03_Vep_plugin_dbNSFP_parse.sh # 6.
Filter variants_annot_all.tab -echo "awk 'NR>1 && \$58 > 0.001 {print \$0}' variants_annot_all.tab > ./variants_annot_filterAF.tab" >> _03_awk.sh +echo "awk 'NR>1 && \$58 > 0.001 {print \$0}' variants_annot_all.tab > ./variants_annot_filterAF.tab" >> aux_03_awk.sh -echo "cat header_vep_final_annot.txt variants_annot_filterAF.tab > variants_annot_filterAF_head.tab" >> _03_awk.sh +echo "cat header_vep_final_annot.txt variants_annot_filterAF.tab > variants_annot_filterAF_head.tab" >> aux_03_awk.sh -echo "rm variants_annot_filterAF.tab" >> _03_awk.sh +echo "rm variants_annot_filterAF.tab" >> aux_03_awk.sh #--------------------------------------------------------------------------------------------------------------------------------- @@ -72,8 +69,8 @@ sed -i "s|PROBAND|${proband}|g" ./exomiser_configfile.yml #sed -i "s|BED_FILE|${bed_file}|g" ./exomiser_configfile.yml sed -i "s|OUTPUT_FOLDER|${output_folder}|g" ./exomiser_configfile.yml -echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile.yml; rm /tmp/spring.log" > 1_04_exomiser_ALL.sh -echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER.log --job-name EXOMISER bash ${scratch_dir}/1_04_exomiser_ALL.sh &" > _04_exomiser_ALL.sh +echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile.yml; rm /tmp/spring.log" > aux1_04_exomiser_ALL.sh +echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER.log --job-name EXOMISER bash ${scratch_dir}/aux1_04_exomiser_ALL.sh &" > _04_exomiser_ALL.sh ## 8. Running exomiser_exome @@ -89,8 +86,8 @@ sed -i "s|PROBAND|${proband}|g" ./exomiser_configfile_exome.yml sed -i "s|BED_FILE|${bed_file}|g" ./exomiser_configfile_exome.yml sed -i "s|OUTPUT_FOLDER|${output_folder}|g" ./exomiser_configfile_exome.yml -echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile_exome.yml; rm /tmp/spring.log" > 2_04_exomiser_exome.sh -echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER_exome.log --job-name EXOMISER bash ${scratch_dir}/2_04_exomiser_exome.sh &" >> _04_exomiser_ALL.sh +echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile_exome.yml; rm /tmp/spring.log" > aux2_04_exomiser_exome.sh +echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER_exome.log --job-name EXOMISER bash ${scratch_dir}/aux2_04_exomiser_exome.sh &" >> _04_exomiser_ALL.sh ## 9. 
Running exomiser_genes @@ -106,35 +103,35 @@ sed -i "s|PROBAND|${proband}|g" ./exomiser_configfile_genes.yml sed -i "s|BED_FILE|${bed_file}|g" ./exomiser_configfile_genes.yml sed -i "s|OUTPUT_FOLDER|${output_folder}|g" ./exomiser_configfile_genes.yml -echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile_genes.yml; rm /tmp/spring.log" > 3_04_exomiser_genes.sh -echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER_genes.log --job-name EXOMISER bash ${scratch_dir}/3_04_exomiser_genes.sh &" >> _04_exomiser_ALL.sh +echo "java -Xms300g -Xmx300g -jar exomiser-cli-13.0.0.jar --analysis ${scratch_dir}/exomiser_configfile_genes.yml; rm /tmp/spring.log" > aux3_04_exomiser_genes.sh +echo "srun --partition short_idx --mem 350G --time 12:00:00 --chdir /data/ucct/bi/pipelines/exomiser/exomiser-cli-13.0.0 --output logs/EXOMISER_genes.log --job-name EXOMISER bash ${scratch_dir}/aux3_04_exomiser_genes.sh &" >> _04_exomiser_ALL.sh #-------------------------------------------------------------------------------------------------------- #10. Lablog to modify the output reported by exomiser and create a final file with a personalized format. For each exomiser analysis (whole genome, exome, genes) -# Grep variant id for each ininheritance model +# Grep variant id for each inheritance model -cat ininheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_inheritance/filter_inheritance/id_%.txt " > _05_filter_inheritance_ALL.sh -cat ininheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_genes/exomiser_genes_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_inheritance/filter_inheritance_genes/id_%_genes.txt " >> _05_filter_inheritance_ALL.sh -cat ininheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_exome/exomiser_exome_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_inheritance/filter_inheritance_exome/id_%_exome.txt " >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser/exomiser_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_inheritance/filter_inheritance/id_%.txt " > _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_genes/exomiser_genes_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_inheritance/filter_inheritance_genes/id_%_genes.txt " >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "grep 'PASS' ./exomiser/exomiser_exome/exomiser_exome_%.variants.tsv | awk '{print \$1\"_\"\$2\"_\"\$3\"_\"\$4}' > ./filter_inheritance/filter_inheritance_exome/id_%_exome.txt " >> _05_filter_inheritance_ALL.sh -# Grep variants for each ininheritance models from the full annotated variants file +# Grep variants for each inheritance model from the full annotated variants file -cat ininheritance_types.txt | xargs -I % echo "grep -f ./filter_inheritance/filter_inheritance/id_%.txt ./variants_annot_all.tab > ./filter_inheritance/filter_inheritance/vep_annot_%.txt" >> _05_filter_inheritance_ALL.sh -cat ininheritance_types.txt | xargs -I % echo "grep -f ./filter_inheritance/filter_inheritance_genes/id_%_genes.txt ./variants_annot_all.tab > ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes.txt" >>
_05_filter_inheritance_ALL.sh -cat ininheritance_types.txt | xargs -I % echo "grep -f ./filter_inheritance/filter_inheritance_exome/id_%_exome.txt ./variants_annot_all.tab > ./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome.txt" >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "grep -f ./filter_inheritance/filter_inheritance/id_%.txt ./variants_annot_all.tab > ./filter_inheritance/filter_inheritance/vep_annot_%.txt" >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "grep -f ./filter_inheritance/filter_inheritance_genes/id_%_genes.txt ./variants_annot_all.tab > ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes.txt" >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "grep -f ./filter_inheritance/filter_inheritance_exome/id_%_exome.txt ./variants_annot_all.tab > ./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome.txt" >> _05_filter_inheritance_ALL.sh -cat ininheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_inheritance/filter_inheritance/vep_annot_%.txt > ./filter_inheritance/filter_inheritance/vep_annot_%_final.txt" >> _05_filter_inheritance_ALL.sh -cat ininheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes.txt > ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes_final.txt" >> _05_filter_inheritance_ALL.sh -cat ininheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome.txt > ./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome_final.txt" >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_inheritance/filter_inheritance/vep_annot_%.txt > ./filter_inheritance/filter_inheritance/vep_annot_%_final.txt" >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes.txt > ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes_final.txt" >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "cat header_vep_final_annot.txt ./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome.txt > ./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome_final.txt" >> _05_filter_inheritance_ALL.sh echo "rm ./filter_inheritance/filter_inheritance/id_*" >> _05_filter_inheritance_ALL.sh -cat ininheritance_types.txt | xargs -I % echo "rm ./filter_inheritance/filter_inheritance/vep_annot_%.txt" >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "rm ./filter_inheritance/filter_inheritance/vep_annot_%.txt" >> _05_filter_inheritance_ALL.sh echo "rm ./filter_inheritance/filter_inheritance_genes/id_*" >> _05_filter_inheritance_ALL.sh -cat ininheritance_types.txt | xargs -I % echo "rm ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes.txt" >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "rm ./filter_inheritance/filter_inheritance_genes/vep_annot_%_genes.txt" >> _05_filter_inheritance_ALL.sh echo "rm ./filter_inheritance/filter_inheritance_exome/id_*" >> _05_filter_inheritance_ALL.sh -cat ininheritance_types.txt | xargs -I % echo "rm ./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome.txt" >> _05_filter_inheritance_ALL.sh +cat inheritance_types.txt | xargs -I % echo "rm 
./filter_inheritance/filter_inheritance_exome/vep_annot_%_exome.txt" >> _05_filter_inheritance_ALL.sh # annot_all table is huge, lets shrink it a little bit echo "srun --partition short_idx --chdir ${scratch_dir} --output logs/COMPRESS.log --job-name COMPRESS_ANNOT_ALL gzip variants_annot_all.tab &" >> _05_filter_inheritance_ALL.sh From 60707ed301d94104d256f3b7e2462747866716b9 Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 22:35:21 +0100 Subject: [PATCH 24/25] Updated irma_stats.txt names in IRMA's results lablog --- bu_isciii/templates/IRMA/RESULTS/lablog_irma_results | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results b/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results index bab5ec1fd..f643ee890 100644 --- a/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results +++ b/bu_isciii/templates/IRMA/RESULTS/lablog_irma_results @@ -16,7 +16,7 @@ while true; do ln -s ../../ANALYSIS/*_IRMA/04-irma/A_H* . ln -s ../../ANALYSIS/*_IRMA/04-irma/B . ln -s ../../ANALYSIS/*_IRMA/04-irma/C . - tail -n +2 ../../ANALYSIS/*_IRMA/04-irma/clean_irma_stats.txt | cut -f4 | sort | uniq -c > flu_type_summary.txt + tail -n +2 ../../ANALYSIS/*_IRMA/04-irma/clean_irma_stats_flu.txt | cut -f4 | sort | uniq -c > flu_type_summary.txt break elif [ "$ORGANISM" == "2" ]; then ORGANISM="RSV" @@ -26,7 +26,7 @@ while true; do ln -s ../../ANALYSIS/*_IRMA/04-irma/B . ln -s ../../ANALYSIS/*_IRMA/04-irma/AD . ln -s ../../ANALYSIS/*_IRMA/04-irma/BD . - tail -n +2 ../../ANALYSIS/*_IRMA/04-irma/irma_stats.txt | cut -f5 | sort | uniq -c > rsv_type_summary.txt + tail -n +2 ../../ANALYSIS/*_IRMA/04-irma/irma_stats_rsv.txt | cut -f5 | sort | uniq -c > rsv_type_summary.txt break else echo "Invalid input. Please enter 1 or 2." From d388dcead0eeb419203afacff26386b42dde2f6a Mon Sep 17 00:00:00 2001 From: victor5lm Date: Wed, 8 Jan 2025 22:54:21 +0100 Subject: [PATCH 25/25] Updated CHANGELOG.md --- CHANGELOG.md | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 97ead2a86..fb358f34a 100755 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,43 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [2.2.5] - 2025-01-09 : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.2.5 + +### Credits + +- [Victor Lopez](https://github.com/victor5lm) + +### Template fixes and updates + +- Replaced mag.md with taxprofiler.md in assets/reports [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). +- Modified irma_output.md to include only taxprofiler [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). +- Updated services.json with taxprofiler [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). +- Created a lablog file for chewbbaca/REFERENCES [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). +- Updated lablog_irma and renamed ANALYSIS01 folders [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). +- Updated IRMA template and its files to include RSV [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). +- Updated chewbbaca results' lablog to include cgMLST_MSA.fasta [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396).
+- Updated exometrio and wgstrio results lablogs so that they no longer include exomiser's html [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). +- Created plasmidid's results lablog [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). +- Updated mtbseq's lablog to remove unnecessary single quotes [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). +- Improved the explanations in snippy's lablog and updated its results lablog [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). +- Updated version in pyproject.toml and __main__.py [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). + +### Modules + +#### Added enhancements + +#### Fixes + +#### Changed + +- Replaced setup.py with pyproject.toml [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). + +#### Removed + +- Removed MAG template and replaced it with taxprofiler [#396](https://github.com/BU-ISCIII/buisciii-tools/pull/396). + +### Requirements + ## [2.2.4] - 2024-12-27 : https://github.com/BU-ISCIII/buisciii-tools/releases/tag/2.2.4 ### Credits