From 9936b9a5e9157d4f5816b05c15842f3635b3458a Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Fri, 19 Jan 2024 16:13:57 +0100 Subject: [PATCH 1/5] add arriba component (#1) * add arriba component * changes based on #2 * update changelog * quote arguments * add command to requirements --- CHANGELOG.md | 2 + src/arriba/config.vsh.yaml | 386 ++++++++++++++++++++++++++++ src/arriba/help.txt | 198 ++++++++++++++ src/arriba/script.sh | 47 ++++ src/arriba/test.sh | 45 ++++ src/arriba/test_data/A.bam | Bin 0 -> 296 bytes src/arriba/test_data/README.md | 8 + src/arriba/test_data/annotation.gtf | 6 + src/arriba/test_data/blacklist.tsv | 0 src/arriba/test_data/genome.fasta | 4 + 10 files changed, 696 insertions(+) create mode 100644 src/arriba/config.vsh.yaml create mode 100644 src/arriba/help.txt create mode 100644 src/arriba/script.sh create mode 100644 src/arriba/test.sh create mode 100644 src/arriba/test_data/A.bam create mode 100644 src/arriba/test_data/README.md create mode 100644 src/arriba/test_data/annotation.gtf create mode 100644 src/arriba/test_data/blacklist.tsv create mode 100644 src/arriba/test_data/genome.fasta diff --git a/CHANGELOG.md b/CHANGELOG.md index d0b30ae0..d2610ff9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,8 @@ ## NEW FEATURES +* `arriba`: Detect gene fusions from RNA-seq data (PR #1). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/arriba/config.vsh.yaml b/src/arriba/config.vsh.yaml new file mode 100644 index 00000000..601ad6c2 --- /dev/null +++ b/src/arriba/config.vsh.yaml @@ -0,0 +1,386 @@ +functionality: + name: arriba + description: Detect gene fusions from RNA-Seq data + info: + keywords: [Gene fusion, RNA-Seq] + homepage: https://arriba.readthedocs.io/en/latest/ + documentation: https://arriba.readthedocs.io/en/latest/ + repository: https://github.com/suhrig/arriba + reference: "doi:10.1101/gr.257246.119" + licence: MIT + requirements: + cpus: 1 + commands: [ arriba ] + argument_groups: + - name: Inputs + arguments: + - name: --bam + alternatives: -x + type: file + description: | + File in SAM/BAM/CRAM format with main alignments as generated by STAR + (Aligned.out.sam). Arriba extracts candidate reads from this file. + required: true + example: Aligned.out.bam + - name: --genome + alternatives: -a + type: file + description: | + FastA file with genome sequence (assembly). The file may be gzip-compressed. An + index with the file extension .fai must exist only if CRAM files are processed. + required: true + example: assembly.fa + - name: --gene_annotation + alternatives: -g + type: file + description: | + GTF file with gene annotation. The file may be gzip-compressed. + required: true + example: annotation.gtf + - name: --known_fusions + alternatives: -k + type: file + description: | + File containing known/recurrent fusions. Some cancer entities are often + characterized by fusions between the same pair of genes. In order to boost + sensitivity, a list of known fusions can be supplied using this parameter. The list + must contain two columns with the names of the fused genes, separated by tabs. + required: false + example: known_fusions.tsv + - name: --blacklist + alternatives: -b + type: file + description: | + File containing blacklisted events (recurrent artifacts and transcripts + observed in healthy tissue). + required: false + example: blacklist.tsv + - name: --structural_variants + alternatives: -d + type: file + description: | + Tab-separated file with coordinates of structural variants found using + whole-genome sequencing data. These coordinates serve to increase sensitivity + towards weakly expressed fusions and to eliminate fusions with low evidence. + required: false + example: structural_variants_from_WGS.tsv + - name: --tags + alternatives: -t + type: file + description: | + Tab-separated file containing fusions to annotate with tags in the 'tags' column. + The first two columns specify the genes; the third column specifies the tag. The + file may be gzip-compressed. + required: false + example: tags.tsv + - name: --protein_domains + alternatives: -p + type: file + description: | + File in GFF3 format containing coordinates of the protein domains of genes. The + protein domains retained in a fusion are listed in the column + 'retained_protein_domains'. The file may be gzip-compressed. + required: false + example: protein_domains.gff3 + - name: Outputs + arguments: + - name: --fusions + alternatives: -o + type: file + direction: output + description: | + Output file with fusions that have passed all filters. + required: true + example: fusions.tsv + - name: --fusions_discarded + alternatives: -O + type: file + direction: output + description: | + Output file with fusions that were discarded due to filtering. + required: false + example: fusions.discarded.tsv + - name: Arguments + arguments: + - name: --max_genomic_breakpoint_distance + alternatives: -D + type: long + description: | + When a file with genomic breakpoints obtained via + whole-genome sequencing is supplied via the --structural_variants + parameter, this parameter determines how far a + genomic breakpoint may be away from a + transcriptomic breakpoint to consider it as a + related event. For events inside genes, the + distance is added to the end of the gene; for + intergenic events, the distance threshold is + applied as is. Default: 100000. + required: false + - name: --strandedness + alternatives: -s + type: string + description: | + Whether a strand-specific protocol was used for library preparation, + and if so, the type of strandedness (auto/yes/no/reverse). When + unstranded data is processed, the strand can sometimes be inferred from + splice-patterns. But in unclear situations, stranded data helps + resolve ambiguities. Default: auto + choices: ["auto", "yes", "no", "reverse"] + required: false + - name: --interesting_contigs + alternatives: -i + type: string + description: | + List of interesting contigs. Fusions between genes + on other contigs are ignored. Cfontigs can be specified with or without the + prefix "chr". Asterisks (*) are treated as wild-cards. + Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_* + required: false + multiple: true + multiple_sep: "," + example: ["1", "2", "AC_*", "NC_*"] + - name: --viral_contigs + alternatives: -v + type: string + description: | + Comma-/space-separated list of viral contigs. Asterisks (*) are treated as + wild-cards. + Default: AC_* NC_* + required: false + multiple: true + multiple_sep: "," + example: ["AC_*", "NC_*"] + - name: --disable_filters + alternatives: -f + type: string + description: | + Comma-/space-separated list of filters to disable. By default all filters are + enabled. + choices: [ homologs, low_entropy, isoforms, + top_expressed_viral_contigs, viral_contigs, uninteresting_contigs, + non_coding_neighbors, mismatches, duplicates, no_genomic_support, + genomic_support, intronic, end_to_end, relative_support, + low_coverage_viral_contigs, merge_adjacent, mismappers, multimappers, + same_gene, long_gap, internal_tandem_duplication, small_insert_size, + read_through, inconsistently_clipped, intragenic_exonic, + marginal_read_through, spliced, hairpin, blacklist, min_support, + select_best, in_vitro, short_anchor, known_fusions, no_coverage, + homopolymer, many_spliced ] + required: false + multiple: true + multiple_sep: "," + - name: --max_e_value + alternatives: -E + type: double + description: | + Arriba estimates the number of fusions with a given number of supporting + reads which one would expect to see by random chance. If the expected number + of fusions (e-value) is higher than this threshold, the fusion is + discarded by the 'relative_support' filter. Note: Increasing this + threshold can dramatically increase the number of false positives and may + increase the runtime of resource-intensive steps. Fractional values are + possible. Default: 0.300000 + required: false + - name: --min_supporting_reads + alternatives: -S + type: integer + description: | + The 'min_support' filter discards all fusions with fewer than + this many supporting reads (split reads and discordant mates + combined). Default: 2 + required: false + example: 2 + - name: --max_mismappers + alternatives: -m + type: double + description: | + When more than this fraction of supporting reads turns out to be + mismappers, the 'mismappers' filter discards the fusion. Default: + 0.800000 + required: false + example: 0.8 + - name: --max_homolog_identity + alternatives: -L + type: double + description: | + Genes with more than the given fraction of sequence identity are + considered homologs and removed by the 'homologs' filter. + Default: 0.300000 + required: false + example: 0.3 + - name: --homopolymer_length + alternatives: -H + type: integer + description: | + The 'homopolymer' filter removes breakpoints adjacent to + homopolymers of the given length or more. Default: 6 + required: false + example: 6 + - name: --read_through_distance + alternatives: -R + type: integer + description: | + The 'read_through' filter removes read-through fusions + where the breakpoints are less than the given distance away + from each other. Default: 10000 + required: false + example: 10000 + - name : --min_anchor_length + alternatives: -A + type: integer + description: | + Alignment artifacts are often characterized by split reads coming + from only one gene and no discordant mates. Moreover, the split + reads only align to a short stretch in one of the genes. The + 'short_anchor' filter removes these fusions. This parameter sets + the threshold in bp for what the filter considers short. Default: 23 + required: false + example: 23 + - name: --many_spliced_events + alternatives: -M + type: integer + description: | + The 'many_spliced' filter recovers fusions between genes that + have at least this many spliced breakpoints. Default: 4 + required: false + example: 4 + - name: --max_kmer_content + alternatives: -K + type: double + description: | + The 'low_entropy' filter removes reads with repetitive 3-mers. If + the 3-mers make up more than the given fraction of the sequence, then + the read is discarded. Default: 0.600000 + required: false + example: 0.6 + - name: --max_mismatch_pvalue + alternatives: -V + type: double + description: | + The 'mismatches' filter uses a binomial model to calculate a + p-value for observing a given number of mismatches in a read. If + the number of mismatches is too high, the read is discarded. + Default: 0.010000 + required: false + example: 0.05 + - name: --fragment_length + alternatives: -F + type: integer + description: | + When paired-end data is given, the fragment length is estimated + automatically and this parameter has no effect. But when single-end + data is given, the mean fragment length should be specified to + effectively filter fusions that arise from hairpin structures. + Default: 200 + required: false + example: 200 + - name: --max_reads + alternatives: -U + type: integer + description: | + Subsample fusions with more than the given number of supporting reads. This + improves performance without compromising sensitivity, as long as the + threshold is high. Counting of supporting reads beyond the threshold is + inaccurate, obviously. Default: 300 + required: false + example: 300 + - name: --quantile + alternatives: -Q + type: double + description: | + Highly expressed genes are prone to produce artifacts during library + preparation. Genes with an expression above the given quantile are eligible + for filtering by the 'in_vitro' filter. Default: 0.998000 + required: false + example: 0.998 + - name: --exonic_fraction + alternatives: -e + type: double + description: | + The breakpoints of false-positive predictions of intragenic events + are often both in exons. True predictions are more likely to have at + least one breakpoint in an intron, because introns are larger. If the + fraction of exonic sequence between two breakpoints is smaller than + the given fraction, the 'intragenic_exonic' filter discards the + event. Default: 0.330000 + required: false + example: 0.33 + - name: --top_n + alternatives: -T + type: integer + description: | + Only report viral integration sites of the top N most highly expressed viral + contigs. Default: 5 + required: false + example: 5 + - name: --covered_fraction + alternatives: -C + type: double + description: | + Ignore virally associated events if the virus is not fully + expressed, i.e., less than the given fraction of the viral contig is + transcribed. Default: 0.050000 + required: false + example: 0.05 + - name: --max_itd_length + alternatives: -l + type: integer + description: | + Maximum length of internal tandem duplications. Note: Increasing + this value beyond the default can impair performance and lead to many + false positives. Default: 100 + required: false + example: 100 + - name: --min_itd_allele_fraction + alternatives: -z + type: double + description: | + Required fraction of supporting reads to report an internal + tandem duplication. Default: 0.070000 + required: false + example: 0.07 + - name: --min_itd_supporting_reads + alternatives: -Z + type: integer + description: | + Required absolute number of supporting reads to report an + internal tandem duplication. Default: 10 + required: false + example: 10 + - name: --skip_duplicate_marking + alternatives: -u + type: boolean_true + description: | + Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a + preceding program using the BAM_FDUP flag. This makes sense when unique molecular + identifiers (UMI) are used. + - name: --extra_information + alternatives: -X + type: boolean_true + description: | + To reduce the runtime and file size, by default, the columns 'fusion_transcript', + 'peptide_sequence', and 'read_identifiers' are left empty in the file containing + discarded fusion candidates (see parameter -O). When this flag is set, this extra + information is reported in the discarded fusions file. + - name: --fill_gaps + alternatives: -I + type: boolean_true + description: | + If assembly of the fusion transcript sequence from the supporting reads is incomplete + (denoted as '...'), fill the gaps using the assembly sequence wherever possible. + resources: + - type: bash_script + path: script.sh + test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +platforms: + - type: docker + image: quay.io/biocontainers/arriba:2.4.0--h0033a41_2 + setup: + - type: docker + run: | + arriba -h | grep 'Version:' 2>&1 | sed 's/Version:\s\(.*\)/arriba: "\1"/' > /var/software_versions.txt + - type: nextflow diff --git a/src/arriba/help.txt b/src/arriba/help.txt new file mode 100644 index 00000000..7d8fc76d --- /dev/null +++ b/src/arriba/help.txt @@ -0,0 +1,198 @@ +```bash +arriba -h +``` + +Arriba gene fusion detector +--------------------------- +Version: 2.4.0 + +Arriba is a fast tool to search for aberrant transcripts such as gene fusions. +It is based on chimeric alignments found by the STAR RNA-Seq aligner. + +Usage: arriba [-c Chimeric.out.sam] -x Aligned.out.bam \ + -g annotation.gtf -a assembly.fa [-b blacklists.tsv] [-k known_fusions.tsv] \ + [-t tags.tsv] [-p protein_domains.gff3] [-d structural_variants_from_WGS.tsv] \ + -o fusions.tsv [-O fusions.discarded.tsv] \ + [OPTIONS] + + -c FILE File in SAM/BAM/CRAM format with chimeric alignments as generated by STAR + (Chimeric.out.sam). This parameter is only required, if STAR was run with the + parameter '--chimOutType SeparateSAMold'. When STAR was run with the parameter + '--chimOutType WithinBAM', it suffices to pass the parameter -x to Arriba and -c + can be omitted. + + -x FILE File in SAM/BAM/CRAM format with main alignments as generated by STAR + (Aligned.out.sam). Arriba extracts candidate reads from this file. + + -g FILE GTF file with gene annotation. The file may be gzip-compressed. + + -G GTF_FEATURES Comma-/space-separated list of names of GTF features. + Default: gene_name=gene_name|gene_id gene_id=gene_id + transcript_id=transcript_id feature_exon=exon feature_CDS=CDS + + -a FILE FastA file with genome sequence (assembly). The file may be gzip-compressed. An + index with the file extension .fai must exist only if CRAM files are processed. + + -b FILE File containing blacklisted events (recurrent artifacts and transcripts + observed in healthy tissue). + + -k FILE File containing known/recurrent fusions. Some cancer entities are often + characterized by fusions between the same pair of genes. In order to boost + sensitivity, a list of known fusions can be supplied using this parameter. The list + must contain two columns with the names of the fused genes, separated by tabs. + + -o FILE Output file with fusions that have passed all filters. + + -O FILE Output file with fusions that were discarded due to filtering. + + -t FILE Tab-separated file containing fusions to annotate with tags in the 'tags' column. + The first two columns specify the genes; the third column specifies the tag. The + file may be gzip-compressed. + + -p FILE File in GFF3 format containing coordinates of the protein domains of genes. The + protein domains retained in a fusion are listed in the column + 'retained_protein_domains'. The file may be gzip-compressed. + + -d FILE Tab-separated file with coordinates of structural variants found using + whole-genome sequencing data. These coordinates serve to increase sensitivity + towards weakly expressed fusions and to eliminate fusions with low evidence. + + -D MAX_GENOMIC_BREAKPOINT_DISTANCE When a file with genomic breakpoints obtained via + whole-genome sequencing is supplied via the -d + parameter, this parameter determines how far a + genomic breakpoint may be away from a + transcriptomic breakpoint to consider it as a + related event. For events inside genes, the + distance is added to the end of the gene; for + intergenic events, the distance threshold is + applied as is. Default: 100000 + + -s STRANDEDNESS Whether a strand-specific protocol was used for library preparation, + and if so, the type of strandedness (auto/yes/no/reverse). When + unstranded data is processed, the strand can sometimes be inferred from + splice-patterns. But in unclear situations, stranded data helps + resolve ambiguities. Default: auto + + -i CONTIGS Comma-/space-separated list of interesting contigs. Fusions between genes + on other contigs are ignored. Cfontigs can be specified with or without the + prefix "chr". Asterisks (*) are treated as wild-cards. + Default: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y AC_* NC_* + + -v CONTIGS Comma-/space-separated list of viral contigs. Asterisks (*) are treated as + wild-cards. + Default: AC_* NC_* + + -f FILTERS Comma-/space-separated list of filters to disable. By default all filters are + enabled. Valid values: homologs, low_entropy, isoforms, + top_expressed_viral_contigs, viral_contigs, uninteresting_contigs, + non_coding_neighbors, mismatches, duplicates, no_genomic_support, + genomic_support, intronic, end_to_end, relative_support, + low_coverage_viral_contigs, merge_adjacent, mismappers, multimappers, + same_gene, long_gap, internal_tandem_duplication, small_insert_size, + read_through, inconsistently_clipped, intragenic_exonic, + marginal_read_through, spliced, hairpin, blacklist, min_support, + select_best, in_vitro, short_anchor, known_fusions, no_coverage, + homopolymer, many_spliced + + -E MAX_E-VALUE Arriba estimates the number of fusions with a given number of supporting + reads which one would expect to see by random chance. If the expected number + of fusions (e-value) is higher than this threshold, the fusion is + discarded by the 'relative_support' filter. Note: Increasing this + threshold can dramatically increase the number of false positives and may + increase the runtime of resource-intensive steps. Fractional values are + possible. Default: 0.300000 + + -S MIN_SUPPORTING_READS The 'min_support' filter discards all fusions with fewer than + this many supporting reads (split reads and discordant mates + combined). Default: 2 + + -m MAX_MISMAPPERS When more than this fraction of supporting reads turns out to be + mismappers, the 'mismappers' filter discards the fusion. Default: + 0.800000 + + -L MAX_HOMOLOG_IDENTITY Genes with more than the given fraction of sequence identity are + considered homologs and removed by the 'homologs' filter. + Default: 0.300000 + + -H HOMOPOLYMER_LENGTH The 'homopolymer' filter removes breakpoints adjacent to + homopolymers of the given length or more. Default: 6 + + -R READ_THROUGH_DISTANCE The 'read_through' filter removes read-through fusions + where the breakpoints are less than the given distance away + from each other. Default: 10000 + + -A MIN_ANCHOR_LENGTH Alignment artifacts are often characterized by split reads coming + from only one gene and no discordant mates. Moreover, the split + reads only align to a short stretch in one of the genes. The + 'short_anchor' filter removes these fusions. This parameter sets + the threshold in bp for what the filter considers short. Default: 23 + + -M MANY_SPLICED_EVENTS The 'many_spliced' filter recovers fusions between genes that + have at least this many spliced breakpoints. Default: 4 + + -K MAX_KMER_CONTENT The 'low_entropy' filter removes reads with repetitive 3-mers. If + the 3-mers make up more than the given fraction of the sequence, then + the read is discarded. Default: 0.600000 + + -V MAX_MISMATCH_PVALUE The 'mismatches' filter uses a binomial model to calculate a + p-value for observing a given number of mismatches in a read. If + the number of mismatches is too high, the read is discarded. + Default: 0.010000 + + -F FRAGMENT_LENGTH When paired-end data is given, the fragment length is estimated + automatically and this parameter has no effect. But when single-end + data is given, the mean fragment length should be specified to + effectively filter fusions that arise from hairpin structures. + Default: 200 + + -U MAX_READS Subsample fusions with more than the given number of supporting reads. This + improves performance without compromising sensitivity, as long as the + threshold is high. Counting of supporting reads beyond the threshold is + inaccurate, obviously. Default: 300 + + -Q QUANTILE Highly expressed genes are prone to produce artifacts during library + preparation. Genes with an expression above the given quantile are eligible + for filtering by the 'in_vitro' filter. Default: 0.998000 + + -e EXONIC_FRACTION The breakpoints of false-positive predictions of intragenic events + are often both in exons. True predictions are more likely to have at + least one breakpoint in an intron, because introns are larger. If the + fraction of exonic sequence between two breakpoints is smaller than + the given fraction, the 'intragenic_exonic' filter discards the + event. Default: 0.330000 + + -T TOP_N Only report viral integration sites of the top N most highly expressed viral + contigs. Default: 5 + + -C COVERED_FRACTION Ignore virally associated events if the virus is not fully + expressed, i.e., less than the given fraction of the viral contig is + transcribed. Default: 0.050000 + + -l MAX_ITD_LENGTH Maximum length of internal tandem duplications. Note: Increasing + this value beyond the default can impair performance and lead to many + false positives. Default: 100 + + -z MIN_ITD_ALLELE_FRACTION Required fraction of supporting reads to report an internal + tandem duplication. Default: 0.070000 + + -Z MIN_ITD_SUPPORTING_READS Required absolute number of supporting reads to report an + internal tandem duplication. Default: 10 + + -u Instead of performing duplicate marking itself, Arriba relies on duplicate marking by a + preceding program using the BAM_FDUP flag. This makes sense when unique molecular + identifiers (UMI) are used. + + -X To reduce the runtime and file size, by default, the columns 'fusion_transcript', + 'peptide_sequence', and 'read_identifiers' are left empty in the file containing + discarded fusion candidates (see parameter -O). When this flag is set, this extra + information is reported in the discarded fusions file. + + -I If assembly of the fusion transcript sequence from the supporting reads is incomplete + (denoted as '...'), fill the gaps using the assembly sequence wherever possible. + + -h Print help and exit. + + Code repository: https://github.com/suhrig/arriba + Get help/report bugs: https://github.com/suhrig/arriba/issues + User manual: https://arriba.readthedocs.io/ + Please cite: https://doi.org/10.1101/gr.257246.119 \ No newline at end of file diff --git a/src/arriba/script.sh b/src/arriba/script.sh new file mode 100644 index 00000000..b0a9af8e --- /dev/null +++ b/src/arriba/script.sh @@ -0,0 +1,47 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +[[ "$par_skip_duplicate_marking" == "false" ]] && unset par_skip_duplicate_marking +[[ "$par_extra_information" == "false" ]] && unset par_extra_information +[[ "$par_fill_gaps" == "false" ]] && unset par_fill_gaps + +arriba \ + -x "$par_bam" \ + -a "$par_genome" \ + -g "$par_gene_annotation" \ + -o "$par_fusions" \ + ${par_known_fusions:+-k "${par_known_fusions}"} \ + ${par_blacklist:+-b "${par_blacklist}"} \ + ${par_structural_variants:+-d "${par_structural_variants}"} \ + ${par_tags:+-t "${par_tags}"} \ + ${par_protein_domains:+-p "${par_protein_domains}"} \ + ${par_fusions_discarded:+-O "${par_fusions_discarded}"} \ + ${par_max_genomic_breakpoint_distance:+-D "${par_max_genomic_breakpoint_distance}"} \ + ${par_strandedness:+-s "${par_strandedness}"} \ + ${par_interesting_contigs:+-i "${par_interesting_contigs}"} \ + ${par_viral_contigs:+-v "${par_viral_contigs}"} \ + ${par_disable_filters:+-f "${par_disable_filters}"} \ + ${par_max_e_value:+-E "${par_max_e_value}"} \ + ${par_min_supporting_reads:+-S "${par_min_supporting_reads}"} \ + ${par_max_mismappers:+-m "${par_max_mismappers}"} \ + ${par_max_homolog_identity:+-L "${par_max_homolog_identity}"} \ + ${par_homopolymer_length:+-H "${par_homopolymer_length}"} \ + ${par_read_through_distance:+-R "${par_read_through_distance}"} \ + ${par_min_anchor_length:+-A "${par_min_anchor_length}"} \ + ${par_many_spliced_events:+-M "${par_many_spliced_events}"} \ + ${par_max_kmer_content:+-K "${par_max_kmer_content}"} \ + ${par_max_mismatch_pvalue:+-V "${par_max_mismatch_pvalue}"} \ + ${par_fragment_length:+-F "${par_fragment_length}"} \ + ${par_max_reads:+-U "${par_max_reads}"} \ + ${par_quantile:+-Q "${par_quantile}"} \ + ${par_exonic_fraction:+-e "${par_exonic_fraction}"} \ + ${par_top_n:+-T "${par_top_n}"} \ + ${par_covered_fraction:+-C "${par_covered_fraction}"} \ + ${par_max_itd_length:+-l "${par_max_itd_length}"} \ + ${par_min_itd_allele_fraction:+-z "${par_min_itd_allele_fraction}"} \ + ${par_min_itd_supporting_reads:+-Z "${par_min_itd_supporting_reads}"} \ + ${par_skip_duplicate_marking:+-u} \ + ${par_extra_information:+-X} \ + ${par_fill_gaps:+-I} diff --git a/src/arriba/test.sh b/src/arriba/test.sh new file mode 100644 index 00000000..7f1243d2 --- /dev/null +++ b/src/arriba/test.sh @@ -0,0 +1,45 @@ +#!/bin/bash + +set -e + +dir_in="$meta_resources_dir/test_data" + +echo "> Run arriba with blacklist" +"$meta_executable" \ + --bam "$dir_in/A.bam" \ + --genome "$dir_in/genome.fasta" \ + --gene_annotation "$dir_in/annotation.gtf" \ + --blacklist "$dir_in/blacklist.tsv" \ + --fusions "fusions.tsv" \ + --fusions_discarded "fusions_discarded.tsv" \ + --interesting_contigs "1,2" + +echo ">> Checking output" +[ ! -f "fusions.tsv" ] && echo "Output file fusions.tsv does not exist" && exit 1 +[ ! -f "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "fusions.tsv" ] && echo "Output file fusions.tsv is empty" && exit 1 +[ ! -s "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv is empty" && exit 1 + +rm fusions.tsv fusions_discarded.tsv + +echo "> Run arriba without blacklist" +"$meta_executable" \ + --bam "$dir_in/A.bam" \ + --genome "$dir_in/genome.fasta" \ + --gene_annotation "$dir_in/annotation.gtf" \ + --fusions "fusions.tsv" \ + --fusions_discarded "fusions_discarded.tsv" \ + --interesting_contigs "1,2" \ + --disable_filters blacklist + +echo ">> Checking output" +[ ! -f "fusions.tsv" ] && echo "Output file fusions.tsv does not exist" && exit 1 +[ ! -f "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv does not exist" && exit 1 + +echo ">> Check if output is empty" +[ ! -s "fusions.tsv" ] && echo "Output file fusions.tsv is empty" && exit 1 +[ ! -s "fusions_discarded.tsv" ] && echo "Output file fusions_discarded.tsv is empty" && exit 1 + +echo "> Test successful" \ No newline at end of file diff --git a/src/arriba/test_data/A.bam b/src/arriba/test_data/A.bam new file mode 100644 index 0000000000000000000000000000000000000000..57511ab3537e519b0b82cc84d30637c43db5801e GIT binary patch literal 296 zcmb2|=3rp}f&Xj_PR>jW4GhIaUsBJcB_tGlD0s;8d9%?KKQQ<5<)c~_M+`fR5AZuNn@YW$`C3w~$m(~4>kgSYe=WVcgS$m0eJ(B*{v0Gy zWb`>`t;m5QABWctHzp`DoHAnC%-}Jhqp=%kuRNLqx)@fcG%)xnq;3Bwl9ZzHWI{{V zO6SacW@csY;x^769v(f91CGMtA3hhPefdz5R>D&zcH(ns+6U%G3CA^*WtEv_b7hr- z<7JJRWtp?vycRrkaF%R&I8EKy!|CXmh9BJaT~C`l92z7}mn%f(tw`?3vSeueGBM~E N7N1 +GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG +>2 +AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA From fb6cf022e35a320e2372d661648ee1f5167eab17 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Fri, 19 Jan 2024 17:03:46 +0100 Subject: [PATCH 2/5] document command --- src/arriba/test_data/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/arriba/test_data/README.md b/src/arriba/test_data/README.md index 853beeb2..8cfb28f6 100644 --- a/src/arriba/test_data/README.md +++ b/src/arriba/test_data/README.md @@ -5,4 +5,9 @@ Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree __author__ = "Jan Forster" __copyright__ = "Copyright 2019, Jan Forster" __email__ = "j.forster@dkfz.de" -__license__ = "MIT" \ No newline at end of file +__license__ = "MIT" + +```bash +git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers +cp -r /tmp/snakemake-wrappers/bio/arriba/test/* src/arriba/test_data +``` \ No newline at end of file From 57e506fa37953850edeec84a059f21d67ff06a4f Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Fri, 26 Jan 2024 13:44:57 +0100 Subject: [PATCH 3/5] add gitignore --- .gitignore | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..ca5262bc --- /dev/null +++ b/.gitignore @@ -0,0 +1,17 @@ +*.DS_Store +*__pycache__ + +# IDE ignores +.idea/ + +# R specific ignores +.Rhistory +.Rproj.user +*.Rproj + +# viash specific ignores +target/ + +# nextflow specific ignores +.nextflow* +work From d0461dbe98b89d871bfb332aa3f592118372ba2e Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Fri, 26 Jan 2024 14:07:47 +0100 Subject: [PATCH 4/5] add arriba test data script --- src/arriba/test_data/README.md | 13 ------------- src/arriba/test_data/script.sh | 10 ++++++++++ 2 files changed, 10 insertions(+), 13 deletions(-) delete mode 100644 src/arriba/test_data/README.md create mode 100755 src/arriba/test_data/script.sh diff --git a/src/arriba/test_data/README.md b/src/arriba/test_data/README.md deleted file mode 100644 index 8cfb28f6..00000000 --- a/src/arriba/test_data/README.md +++ /dev/null @@ -1,13 +0,0 @@ -# arriba test data - -Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/arriba/test. - -__author__ = "Jan Forster" -__copyright__ = "Copyright 2019, Jan Forster" -__email__ = "j.forster@dkfz.de" -__license__ = "MIT" - -```bash -git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers -cp -r /tmp/snakemake-wrappers/bio/arriba/test/* src/arriba/test_data -``` \ No newline at end of file diff --git a/src/arriba/test_data/script.sh b/src/arriba/test_data/script.sh new file mode 100755 index 00000000..e5bbcf2c --- /dev/null +++ b/src/arriba/test_data/script.sh @@ -0,0 +1,10 @@ +# arriba test data + +# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/arriba/test + +if [ ! -d /tmp/snakemake-wrappers ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers +fi + +cp -r /tmp/snakemake-wrappers/bio/arriba/test/* src/arriba/test_data + From b3573a2926bf0a2a5ec78933cf47639909f124dc Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Fri, 26 Jan 2024 16:31:20 +0100 Subject: [PATCH 5/5] Add fastp component (#3) * initialise fastp component * add arguments to config * add test and script * update help * update changelog * fix test data script * address comments * Apply suggestions from code review Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> * extend test --------- Co-authored-by: Dries Schaumont <5946712+DriesSchaumont@users.noreply.github.com> --- CHANGELOG.md | 2 + src/fastp/config.vsh.yaml | 574 +++++++++++++++++++++++++++++++ src/fastp/help.txt | 93 +++++ src/fastp/script.sh | 105 ++++++ src/fastp/test.sh | 74 ++++ src/fastp/test_data/pe/a.1.fastq | 4 + src/fastp/test_data/pe/a.2.fastq | 4 + src/fastp/test_data/script.sh | 10 + src/fastp/test_data/se/a.fastq | 4 + 9 files changed, 870 insertions(+) create mode 100644 src/fastp/config.vsh.yaml create mode 100644 src/fastp/help.txt create mode 100644 src/fastp/script.sh create mode 100644 src/fastp/test.sh create mode 100644 src/fastp/test_data/pe/a.1.fastq create mode 100644 src/fastp/test_data/pe/a.2.fastq create mode 100755 src/fastp/test_data/script.sh create mode 100644 src/fastp/test_data/se/a.fastq diff --git a/CHANGELOG.md b/CHANGELOG.md index d2610ff9..63f69365 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ * `arriba`: Detect gene fusions from RNA-seq data (PR #1). +* `fastp`: An ultra-fast all-in-one FASTQ preprocessor (PR #3). + ## MAJOR CHANGES ## MINOR CHANGES diff --git a/src/fastp/config.vsh.yaml b/src/fastp/config.vsh.yaml new file mode 100644 index 00000000..c513ec92 --- /dev/null +++ b/src/fastp/config.vsh.yaml @@ -0,0 +1,574 @@ +functionality: + name: fastp + description: | + An ultra-fast all-in-one FASTQ preprocessor (QC/adapters/trimming/filtering/splitting/merging...). + + Features: + + - comprehensive quality profiling for both before and after filtering data (quality curves, base contents, KMER, Q20/Q30, GC Ratio, duplication, adapter contents...) + - filter out bad reads (too low quality, too short, or too many N...) + - cut low quality bases for per read in its 5' and 3' by evaluating the mean quality from a sliding window (like Trimmomatic but faster). + - trim all reads in front and tail + - cut adapters. Adapter sequences can be automatically detected, which means you don't have to input the adapter sequences to trim them. + - correct mismatched base pairs in overlapped regions of paired end reads, if one base is with high quality while the other is with ultra low quality + - trim polyG in 3' ends, which is commonly seen in NovaSeq/NextSeq data. Trim polyX in 3' ends to remove unwanted polyX tailing (i.e. polyA tailing for mRNA-Seq data) + - preprocess unique molecular identifier (UMI) enabled data, shift UMI to sequence name. + - report JSON format result for further interpreting. + - visualize quality control and filtering results on a single HTML page (like FASTQC but faster and more informative). + - split the output to multiple files (0001.R1.gz, 0002.R1.gz...) to support parallel processing. Two modes can be used, limiting the total split file number, or limitting the lines of each split file. + - support long reads (data from PacBio / Nanopore devices). + - support reading from STDIN and writing to STDOUT + - support interleaved input + - support ultra-fast FASTQ-level deduplication + info: + keywords: [RNA-Seq, Trimming, Quality control] + repository: https://github.com/OpenGene/fastp + documentation: https://github.com/OpenGene/fastp/blob/master/README.md + reference: "doi:10.1093/bioinformatics/bty560" + licence: MIT + argument_groups: + - name: Inputs + description: | + `fastp` supports both single-end (SE) and paired-end (PE) input. + + - for SE data, you only have to specify read1 input by `-i` or `--in1`. + - for PE data, you should also specify read2 input by `-I` or `--in2`. + arguments: + - name: --in1 + alternatives: [-i] + type: file + description: Input FastQ file. Must be single-end or paired-end R1. Can be gzipped. + required: true + example: in.R1.fq.gz + - name: --in2 + alternatives: [-I] + type: file + description: Input FastQ file. Must be paired-end R2. Can be gzipped. + required: false + example: in.R2.fq.gz + - name: Outputs + description: | + + - for SE data, you only have to specify read1 output by `-o` or `--out1`. + - for PE data, you should also specify read2 output by `-O` or `--out2`. + - if you don't specify the output file names, no output files will be written, but the QC will still be done for both data before and after filtering. + - the output will be gzip-compressed if its file name ends with `.gz` + arguments: + - name: --out1 + alternatives: [-o] + type: file + description: The single-end or paired-end R1 reads that pass QC. Will be gzipped if its file name ends with `.gz`. + required: true + example: out.R1.fq.gz + direction: output + - name: --out2 + alternatives: [-O] + type: file + description: The paired-end R2 reads that pass QC. Will be gzipped if its file name ends with `.gz`. + required: false + example: out.R2.fq.gz + direction: output + - name: --unpaired1 + type: file + description: Store the reads that `read1` passes filters but its paired `read2` doesn't. + required: false + example: unpaired.R1.fq.gz + direction: output + - name: --unpaired2 + type: file + description: Store the reads that `read2` passes filters but its paired `read1` doesn't. + required: false + example: unpaired.R2.fq.gz + direction: output + - name: --failed_out + type: file + description: | + Store the reads that fail filters. + + If one read failed and is written to --failed_out, its failure reason will be appended to its read name. For example, failed_quality_filter, failed_too_short etc. + For PE data, if unpaired reads are not stored (by giving --unpaired1 or --unpaired2), the failed pair of reads will be put together. If one read passes the filters but its pair doesn't, the failure reason will be paired_read_is_failing. + required: false + example: failed.fq.gz + direction: output + - name: --overlapped_out + type: file + description: | + For each read pair, output the overlapped region if it has no any mismatched base. + direction: output + - name: Report output arguments + arguments: + - name: --json + alternatives: [-j] + type: file + description: | + The json format report file name + example: out.json + direction: output + - name: --html + type: file + description: | + The html format report file name + example: out.html + direction: output + - name: --report_title + type: string + description: | + The title of the html report, default is "fastp report". + example: fastp report + - name: Adapter trimming + description: | + Adapter trimming is enabled by default, but you can disable it by `-A` or `--disable_adapter_trimming`. Adapter sequences can be automatically detected for both PE/SE data. + + - For SE data, the adapters are evaluated by analyzing the tails of first ~1M reads. This evaluation may be inacurrate, and you can specify the adapter sequence by `-a` or `--adapter_sequence` option. If adapter sequence is specified, the auto detection for SE data will be disabled. + - For PE data, the adapters can be detected by per-read overlap analysis, which seeks for the overlap of each pair of reads. This method is robust and fast, so normally you don't have to input the adapter sequence even you know it. But you can still specify the adapter sequences for read1 by `--adapter_sequence`, and for read2 by `--adapter_sequence_r2`. If `fastp` fails to find an overlap (i.e. due to low quality bases), it will use these sequences to trim adapters for read1 and read2 respectively. + - For PE data, the adapter sequence auto-detection is disabled by default since the adapters can be trimmed by overlap analysis. However, you can specify `--detect_adapter_for_pe` to enable it. + - For PE data, `fastp` will run a little slower if you specify the sequence adapters or enable adapter auto-detection, but usually result in a slightly cleaner output, since the overlap analysis may fail due to sequencing errors or adapter dimers. + - The most widely used adapter is the Illumina TruSeq adapters. If your data is from the TruSeq library, you can add `--adapter_sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA --adapter_sequence_r2=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT` to your command lines, or enable auto detection for PE data by specifing `detect_adapter_for_pe`. + - `fastp` contains some built-in known adapter sequences for better auto-detection. If you want to make some adapters to be a part of the built-in adapters, please file an issue. + + You can also specify --adapter_fasta to give a FASTA file to tell fastp to trim multiple adapters in this FASTA file. Here is a sample of such adapter FASTA file: + + ``` + >Illumina TruSeq Adapter Read 1 + AGATCGGAAGAGCACACGTCTGAACTCCAGTCA + >Illumina TruSeq Adapter Read 2 + AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT + >polyA + AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA + ``` + + The adapter sequence in this file should be at least 6bp long, otherwise it will be skipped. And you can give whatever you want to trim, rather than regular sequencing adapters (i.e. polyA). + + `fastp` first trims the auto-detected adapter or the adapter sequences given by `--adapter_sequence | --adapter_sequence_r2`, then trims the adapters given by `--adapter_fasta` one by one. + + The sequence distribution of trimmed adapters can be found at the HTML/JSON reports. + arguments: + - name: --disable_adapter_trimming + alternatives: [-A] + type: boolean_true + description: | + Disable adapter trimming. + - name: --detect_adapter_for_pe + type: boolean_true + description: | + By default, the auto-detection for adapter is for SE data input only, turn on this option to enable it for PE data. + - name: --adapter_sequence + alternatives: [-a] + type: string + description: | + The adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped + - name: --adapter_sequence_r2 + type: string + description: | + The adapter sequences to be trimmed for R2. This is used for PE data if R1/R2 are found overlapped. + - name: --adapter_fasta + type: file + description: | + A FASTA file containing all the adapter sequences to be trimmed. For SE data, if not specified, the adapters will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. + - name: Base trimming + arguments: + - name: --trim_front1 + alternatives: [-f] + type: integer + description: | + Trimming how many bases in front for read1, default is 0. + example: 0 + - name: --trim_tail1 + alternatives: [-t] + type: integer + description: | + Trimming how many bases in tail for read1, default is 0. + example: 0 + - name: --max_len1 + alternatives: [-b] + type: integer + min: 0 + description: | + If read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. Default 0 means no limitation. + - name: --trim_front2 + alternatives: [-F] + type: integer + description: | + Trimming how many bases in front for read2, default is 0. + example: 0 + - name: --trim_tail2 + alternatives: [-T] + type: integer + description: | + Trimming how many bases in tail for read2, default is 0. + example: 0 + - name: --max_len2 + alternatives: [-B] + type: integer + min: 0 + description: | + If read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. Default 0 means no limitation. + - name: Merging mode + description: Allows merging paired-end reads into a single longer read if they are overlapping. + arguments: + - name: --merge + alternatives: [-m] + type: boolean_true + description: | + For paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default. + - name: --merged_out + type: file + description: | + In the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output. + direction: output + example: merged.fq.gz + - name: --include_unmerged + type: boolean_true + description: | + In the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default. + - name: Additional input arguments + description: Affects how the input is read. + arguments: + - name: --interleaved_in + type: boolean_true + description: | + Indicate that is an interleaved FASTQ which contains both read1 and read2. Disabled by default. + - name: --fix_mgi_id + type: boolean_true + description: | + The MGI FASTQ ID format is not compatible with many BAM operation tools, enable this option to fix it. + - name: --phred64 + alternatives: ["-6"] + type: boolean_true + description: | + Indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33) + - name: Additional output arguments + description: Affects how the output is written. + arguments: + - name: --compression + alternatives: ["-z"] + type: integer + description: | + Compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4. + example: 4 + min: 1 + max: 9 + - name: --dont_overwrite + type: boolean_true + description: | + Don't overwrite existing files. Overwritting is allowed by default. + - name: Logging arguments + arguments: + - name: --verbose + alternatives: [-V] + type: boolean_true + description: Output verbose log information (i.e. when every 1M reads are processed). + - name: Processing arguments + arguments: + - name: --reads_to_process + type: long + description: | + Specify how many reads/pairs to be processed. Default 0 means process all reads. + example: 1000000 + min: 0 + - name: Deduplication arguments + arguments: + - name: --dedup + type: boolean_true + description: | + Enable deduplication to drop the duplicated reads/pairs + - name: --dup_calc_accuracy + type: integer + description: | + Accuracy level to calculate duplication (1~6). Higher level uses more memory (1G, 2G, 4G, 8G, 16G, 24G). Default 1 for no-dedup mode, and 3 for dedup mode. + example: 3 + min: 1 + max: 6 + - name: --dont_eval_duplication + type: boolean_true + description: | + Don't evaluate duplication rate to save time and use less memory. + - name: PolyG tail trimming arguments + arguments: + - name: --trim_poly_g + alternatives: [-g] + type: boolean_true + description: | + Force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data + - name: --poly_g_min_len + type: integer + description: | + The minimum length to detect polyG in the read tail. 10 by default. + example: 10 + min: 1 + - name: --disable_trim_poly_g + alternatives: [-G] + type: boolean_true + description: | + Disable polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data + - name: PolyX tail trimming arguments + arguments: + - name: --trim_poly_x + alternatives: [-x] + type: boolean_true + description: | + Enable polyX trimming in 3' ends. + - name: --poly_x_min_len + type: integer + description: | + The minimum length to detect polyX in the read tail. 10 by default. + example: 10 + min: 1 + - name: Cut arguments + arguments: + - name: --cut_front + alternatives: ["-5"] + type: integer + description: | + Move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise. + - name: --cut_tail + alternatives: ["-3"] + type: integer + description: | + Move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise. + - name: --cut_right + alternatives: ["-r"] + type: integer + description: | + Move a sliding window from front to tail, if meet one window with mean quality < threshold, drop the bases in the window and the right part, and then stop. + - name: --cut_window_size + alternatives: ["-W"] + type: integer + description: | + The window size option shared by cut_front, cut_tail or cut_sliding. Range: 1~1000, default: 4. + example: 4 + min: 1 + - name: --cut_mean_quality + alternatives: ["-M"] + type: integer + description: | + The mean quality requirement option shared by cut_front, cut_tail or cut_sliding. Range: 1~36 default: 20 (Q20) + example: 20 + min: 0 + - name: --cut_front_window_size + type: integer + description: | + The window size option of cut_front, default to cut_window_size if not specified. + example: 4 + min: 1 + - name: --cut_front_mean_quality + type: integer + description: | + The mean quality requirement option of cut_front, default to cut_mean_quality if not specified. + example: 20 + min: 0 + - name: --cut_tail_window_size + type: integer + description: | + The window size option of cut_tail, default to cut_window_size if not specified. + example: 4 + min: 1 + - name: --cut_tail_mean_quality + type: integer + description: | + The mean quality requirement option of cut_tail, default to cut_mean_quality if not specified. + example: 20 + min: 0 + - name: --cut_right_window_size + type: integer + description: | + The window size option of cut_right, default to cut_window_size if not specified. + example: 4 + min: 1 + - name: --cut_right_mean_quality + type: integer + description: | + The mean quality requirement option of cut_right, default to cut_mean_quality if not specified. + example: 20 + min: 0 + - name: Quality filtering arguments + arguments: + - name: --disable_quality_filtering + alternatives: [-Q] + type: boolean_true + description: | + Quality filtering is enabled by default. If this option is specified, quality filtering is disabled. + - name: --qualified_quality_phred + alternatives: [-q] + type: integer + description: | + The quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. + example: 15 + min: 0 + - name: --unqualified_percent_limit + alternatives: [-u] + type: integer + description: | + How many percents of bases are allowed to be unqualified (0~100). Default 40 means 40%. + example: 40 + min: 0 + max: 100 + - name: --n_base_limit + alternatives: [-n] + type: integer + description: | + If one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5. + example: 5 + min: 0 + - name: --average_qual + alternatives: [-e] + type: integer + description: | + If one read's average quality score =1000), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default. + # - name: --split_prefix_digits + # type: integer + # description: | + # The digits for the sequential number padding (1~10), default is 4, so the filename will be padded as 0001.xxx, 0 to disable padding. + # example: 4 + resources: + - type: bash_script + path: script.sh + test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data +platforms: + - type: docker + image: quay.io/biocontainers/fastp:0.23.4--hadf994f_2 + setup: + - type: docker + run: | + fastp --version 2>&1 | sed 's# #: "#;s#$#"#' > /var/software_versions.txt + - type: nextflow diff --git a/src/fastp/help.txt b/src/fastp/help.txt new file mode 100644 index 00000000..d34917b2 --- /dev/null +++ b/src/fastp/help.txt @@ -0,0 +1,93 @@ +```bash +fastp --help +``` + +usage: fastp [options] ... +options: + -i, --in1 read1 input file name (string [=]) + -o, --out1 read1 output file name (string [=]) + -I, --in2 read2 input file name (string [=]) + -O, --out2 read2 output file name (string [=]) + --unpaired1 for PE input, if read1 passed QC but read2 not, it will be written to unpaired1. Default is to discard it. (string [=]) + --unpaired2 for PE input, if read2 passed QC but read1 not, it will be written to unpaired2. If --unpaired2 is same as --unpaired1 (default mode), both unpaired reads will be written to this same file. (string [=]) + --overlapped_out for each read pair, output the overlapped region if it has no any mismatched base. (string [=]) + --failed_out specify the file to store reads that cannot pass the filters. (string [=]) + -m, --merge for paired-end input, merge each pair of reads into a single read if they are overlapped. The merged reads will be written to the file given by --merged_out, the unmerged reads will be written to the files specified by --out1 and --out2. The merging mode is disabled by default. + --merged_out in the merging mode, specify the file name to store merged output, or specify --stdout to stream the merged output (string [=]) + --include_unmerged in the merging mode, write the unmerged or unpaired reads to the file specified by --merge. Disabled by default. + -6, --phred64 indicate the input is using phred64 scoring (it'll be converted to phred33, so the output will still be phred33) + -z, --compression compression level for gzip output (1 ~ 9). 1 is fastest, 9 is smallest, default is 4. (int [=4]) + --stdin input from STDIN. If the STDIN is interleaved paired-end FASTQ, please also add --interleaved_in. + --stdout stream passing-filters reads to STDOUT. This option will result in interleaved FASTQ output for paired-end output. Disabled by default. + --interleaved_in indicate that is an interleaved FASTQ which contains both read1 and read2. Disabled by default. + --reads_to_process specify how many reads/pairs to be processed. Default 0 means process all reads. (int [=0]) + --dont_overwrite don't overwrite existing files. Overwritting is allowed by default. + --fix_mgi_id the MGI FASTQ ID format is not compatible with many BAM operation tools, enable this option to fix it. + -V, --verbose output verbose log information (i.e. when every 1M reads are processed). + -A, --disable_adapter_trimming adapter trimming is enabled by default. If this option is specified, adapter trimming is disabled + -a, --adapter_sequence the adapter for read1. For SE data, if not specified, the adapter will be auto-detected. For PE data, this is used if R1/R2 are found not overlapped. (string [=auto]) + --adapter_sequence_r2 the adapter for read2 (PE data only). This is used if R1/R2 are found not overlapped. If not specified, it will be the same as (string [=auto]) + --adapter_fasta specify a FASTA file to trim both read1 and read2 (if PE) by all the sequences in this FASTA file (string [=]) + --detect_adapter_for_pe by default, the auto-detection for adapter is for SE data input only, turn on this option to enable it for PE data. + -f, --trim_front1 trimming how many bases in front for read1, default is 0 (int [=0]) + -t, --trim_tail1 trimming how many bases in tail for read1, default is 0 (int [=0]) + -b, --max_len1 if read1 is longer than max_len1, then trim read1 at its tail to make it as long as max_len1. Default 0 means no limitation (int [=0]) + -F, --trim_front2 trimming how many bases in front for read2. If it's not specified, it will follow read1's settings (int [=0]) + -T, --trim_tail2 trimming how many bases in tail for read2. If it's not specified, it will follow read1's settings (int [=0]) + -B, --max_len2 if read2 is longer than max_len2, then trim read2 at its tail to make it as long as max_len2. Default 0 means no limitation. If it's not specified, it will follow read1's settings (int [=0]) + -D, --dedup enable deduplication to drop the duplicated reads/pairs + --dup_calc_accuracy accuracy level to calculate duplication (1~6), higher level uses more memory (1G, 2G, 4G, 8G, 16G, 24G). Default 1 for no-dedup mode, and 3 for dedup mode. (int [=0]) + --dont_eval_duplication don't evaluate duplication rate to save time and use less memory. + -g, --trim_poly_g force polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data + --poly_g_min_len the minimum length to detect polyG in the read tail. 10 by default. (int [=10]) + -G, --disable_trim_poly_g disable polyG tail trimming, by default trimming is automatically enabled for Illumina NextSeq/NovaSeq data + -x, --trim_poly_x enable polyX trimming in 3' ends. + --poly_x_min_len the minimum length to detect polyX in the read tail. 10 by default. (int [=10]) + -5, --cut_front move a sliding window from front (5') to tail, drop the bases in the window if its mean quality < threshold, stop otherwise. + -3, --cut_tail move a sliding window from tail (3') to front, drop the bases in the window if its mean quality < threshold, stop otherwise. + -r, --cut_right move a sliding window from front to tail, if meet one window with mean quality < threshold, drop the bases in the window and the right part, and then stop. + -W, --cut_window_size the window size option shared by cut_front, cut_tail or cut_sliding. Range: 1~1000, default: 4 (int [=4]) + -M, --cut_mean_quality the mean quality requirement option shared by cut_front, cut_tail or cut_sliding. Range: 1~36 default: 20 (Q20) (int [=20]) + --cut_front_window_size the window size option of cut_front, default to cut_window_size if not specified (int [=4]) + --cut_front_mean_quality the mean quality requirement option for cut_front, default to cut_mean_quality if not specified (int [=20]) + --cut_tail_window_size the window size option of cut_tail, default to cut_window_size if not specified (int [=4]) + --cut_tail_mean_quality the mean quality requirement option for cut_tail, default to cut_mean_quality if not specified (int [=20]) + --cut_right_window_size the window size option of cut_right, default to cut_window_size if not specified (int [=4]) + --cut_right_mean_quality the mean quality requirement option for cut_right, default to cut_mean_quality if not specified (int [=20]) + -Q, --disable_quality_filtering quality filtering is enabled by default. If this option is specified, quality filtering is disabled + -q, --qualified_quality_phred the quality value that a base is qualified. Default 15 means phred quality >=Q15 is qualified. (int [=15]) + -u, --unqualified_percent_limit how many percents of bases are allowed to be unqualified (0~100). Default 40 means 40% (int [=40]) + -n, --n_base_limit if one read's number of N base is >n_base_limit, then this read/pair is discarded. Default is 5 (int [=5]) + -e, --average_qual if one read's average quality score =1000), a sequential number prefix will be added to output name ( 0001.out.fq, 0002.out.fq...), disabled by default (long [=0]) + -d, --split_prefix_digits the digits for the sequential number padding (1~10), default is 4, so the filename will be padded as 0001.xxx, 0 to disable padding (int [=4]) + --cut_by_quality5 DEPRECATED, use --cut_front instead. + --cut_by_quality3 DEPRECATED, use --cut_tail instead. + --cut_by_quality_aggressive DEPRECATED, use --cut_right instead. + --discard_unmerged DEPRECATED, no effect now, see the introduction for merging. + -?, --help print this message diff --git a/src/fastp/script.sh b/src/fastp/script.sh new file mode 100644 index 00000000..4bb37c87 --- /dev/null +++ b/src/fastp/script.sh @@ -0,0 +1,105 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +# disable flags +[[ "$par_disable_adapter_trimming" == "false" ]] && unset par_disable_adapter_trimming +[[ "$par_detect_adapter_for_pe" == "false" ]] && unset par_detect_adapter_for_pe +[[ "$par_merge" == "false" ]] && unset par_merge +[[ "$par_include_unmerged" == "false" ]] && unset par_include_unmerged +[[ "$par_interleaved_in" == "false" ]] && unset par_interleaved_in +[[ "$par_fix_mgi_id" == "false" ]] && unset par_fix_mgi_id +[[ "$par_phred64" == "false" ]] && unset par_phred64 +[[ "$par_dont_overwrite" == "false" ]] && unset par_dont_overwrite +[[ "$par_verbose" == "false" ]] && unset par_verbose +[[ "$par_dedup" == "false" ]] && unset par_dedup +[[ "$par_dont_eval_duplication" == "false" ]] && unset par_dont_eval_duplication +[[ "$par_trim_poly_g" == "false" ]] && unset par_trim_poly_g +[[ "$par_disable_trim_poly_g" == "false" ]] && unset par_disable_trim_poly_g +[[ "$par_trim_poly_x" == "false" ]] && unset par_trim_poly_x +[[ "$par_disable_quality_filtering" == "false" ]] && unset par_disable_quality_filtering +[[ "$par_disable_length_filtering" == "false" ]] && unset par_disable_length_filtering +[[ "$par_low_complexity_filter" == "false" ]] && unset par_low_complexity_filter +[[ "$par_umi" == "false" ]] && unset par_umi +[[ "$par_overrepresentation_analysis" == "false" ]] && unset par_overrepresentation_analysis + +# run command +fastp \ + -i "$par_in1" \ + -o "$par_out1" \ + ${par_in2:+--in2 "${par_in2}"} \ + ${par_out2:+--out2 "${par_out2}"} \ + ${par_unpaired1:+--unpaired1 "${par_unpaired1}"} \ + ${par_unpaired2:+--unpaired2 "${par_unpaired2}"} \ + ${par_failed_out:+--failed_out "${par_failed_out}"} \ + ${par_overlapped_out:+--overlapped_out "${par_overlapped_out}"} \ + ${par_json:+--json "${par_json}"} \ + ${par_html:+--html "${par_html}"} \ + ${par_report_title:+--report_title "${par_report_title}"} \ + ${par_disable_adapter_trimming:+--disable_adapter_trimming} \ + ${par_detect_adapter_for_pe:+--detect_adapter_for_pe} \ + ${par_adapter_sequence:+--adapter_sequence "${par_adapter_sequence}"} \ + ${par_adapter_sequence_r2:+--adapter_sequence_r2 "${par_adapter_sequence_r2}"} \ + ${par_adapter_fasta:+--adapter_fasta "${par_adapter_fasta}"} \ + ${par_trim_front1:+--trim_front1 "${par_trim_front1}"} \ + ${par_trim_tail1:+--trim_tail1 "${par_trim_tail1}"} \ + ${par_max_len1:+--max_len1 "${par_max_len1}"} \ + ${par_trim_front2:+--trim_front2 "${par_trim_front2}"} \ + ${par_trim_tail2:+--trim_tail2 "${par_trim_tail2}"} \ + ${par_max_len2:+--max_len2 "${par_max_len2}"} \ + ${par_merge:+--merge} \ + ${par_merged_out:+--merged_out "${par_merged_out}"} \ + ${par_include_unmerged:+--include_unmerged} \ + ${par_interleaved_in:+--interleaved_in} \ + ${par_fix_mgi_id:+--fix_mgi_id} \ + ${par_phred64:+--phred64} \ + ${par_compression:+--compression "${par_compression}"} \ + ${par_dont_overwrite:+--dont_overwrite} \ + ${par_verbose:+--verbose} \ + ${par_reads_to_process:+--reads_to_process "${par_reads_to_process}"} \ + ${par_dedup:+--dedup} \ + ${par_dup_calc_accuracy:+--dup_calc_accuracy "${par_dup_calc_accuracy}"} \ + ${par_dont_eval_duplication:+--dont_eval_duplication} \ + ${par_trim_poly_g:+--trim_poly_g} \ + ${par_poly_g_min_len:+--poly_g_min_len "${par_poly_g_min_len}"} \ + ${par_disable_trim_poly_g:+--disable_trim_poly_g} \ + ${par_trim_poly_x:+--trim_poly_x} \ + ${par_poly_x_min_len:+--poly_x_min_len "${par_poly_x_min_len}"} \ + ${par_cut_front:+--cut_front "${par_cut_front}"} \ + ${par_cut_tail:+--cut_tail "${par_cut_tail}"} \ + ${par_cut_right:+--cut_right "${par_cut_right}"} \ + ${par_cut_window_size:+--cut_window_size "${par_cut_window_size}"} \ + ${par_cut_mean_quality:+--cut_mean_quality "${par_cut_mean_quality}"} \ + ${par_cut_front_window_size:+--cut_front_window_size "${par_cut_front_window_size}"} \ + ${par_cut_front_mean_quality:+--cut_front_mean_quality "${par_cut_front_mean_quality}"} \ + ${par_cut_tail_window_size:+--cut_tail_window_size "${par_cut_tail_window_size}"} \ + ${par_cut_tail_mean_quality:+--cut_tail_mean_quality "${par_cut_tail_mean_quality}"} \ + ${par_cut_right_window_size:+--cut_right_window_size "${par_cut_right_window_size}"} \ + ${par_cut_right_mean_quality:+--cut_right_mean_quality "${par_cut_right_mean_quality}"} \ + ${par_disable_quality_filtering:+--disable_quality_filtering} \ + ${par_qualified_quality_phred:+--qualified_quality_phred "${par_qualified_quality_phred}"} \ + ${par_unqualified_percent_limit:+--unqualified_percent_limit "${par_unqualified_percent_limit}"} \ + ${par_n_base_limit:+--n_base_limit "${par_n_base_limit}"} \ + ${par_average_qual:+--average_qual "${par_average_qual}"} \ + ${par_disable_length_filtering:+--disable_length_filtering} \ + ${par_length_required:+--length_required "${par_length_required}"} \ + ${par_length_limit:+--length_limit "${par_length_limit}"} \ + ${par_low_complexity_filter:+--low_complexity_filter} \ + ${par_complexity_threshold:+--complexity_threshold "${par_complexity_threshold}"} \ + ${par_filter_by_index1:+--filter_by_index1 "${par_filter_by_index1}"} \ + ${par_filter_by_index2:+--filter_by_index2 "${par_filter_by_index2}"} \ + ${par_filter_by_index_threshold:+--filter_by_index_threshold "${par_filter_by_index_threshold}"} \ + ${par_correction:+--correction} \ + ${par_overlap_len_require:+--overlap_len_require "${par_overlap_len_require}"} \ + ${par_overlap_diff_limit:+--overlap_diff_limit "${par_overlap_diff_limit}"} \ + ${par_overlap_diff_percent_limit:+--overlap_diff_percent_limit "${par_overlap_diff_percent_limit}"} \ + ${par_umi:+--umi} \ + ${par_umi_loc:+--umi_loc "${par_umi_loc}"} \ + ${par_umi_len:+--umi_len "${par_umi_len}"} \ + ${par_umi_prefix:+--umi_prefix "${par_umi_prefix}"} \ + ${par_umi_skip:+--umi_skip "${par_umi_skip}"} \ + ${par_umi_delim:+--umi_delim "${par_umi_delim}"} \ + ${par_overrepresentation_analysis:+--overrepresentation_analysis} \ + ${par_overrepresentation_sampling:+--overrepresentation_sampling "${par_overrepresentation_sampling}"} \ + ${meta_cpus:+--thread "${meta_cpus}"} diff --git a/src/fastp/test.sh b/src/fastp/test.sh new file mode 100644 index 00000000..1b1f6f0c --- /dev/null +++ b/src/fastp/test.sh @@ -0,0 +1,74 @@ +#!/bin/bash + +set -e + +## VIASH START +meta_executable="target/docker/fastp/fastp" +meta_resources_dir="src/fastp" +## VIASH END + +######################################################################################### +mkdir fastp_se +cd fastp_se + +echo "> Run fastp on SE" +"$meta_executable" \ + --in1 "$meta_resources_dir/test_data/se/a.fastq" \ + --out1 "trimmed.fastq" \ + --failed_out "failed.fastq" \ + --json "report.json" \ + --html "report.html" \ + --adapter_sequence ACGGCTAGCTA + +echo ">> Check if output exists" +[ ! -f "trimmed.fastq" ] && echo ">> trimmed.fastq does not exist" && exit 1 +[ ! -f "failed.fastq" ] && echo ">> failed.fastq does not exist" && exit 1 +[ ! -f "report.json" ] && echo ">> report.json does not exist" && exit 1 +[ ! -f "report.html" ] && echo ">> report.html does not exist" && exit 1 + +######################################################################################### +cd .. +mkdir fastp_pe_minimal +cd fastp_pe_minimal + +echo ">> Run fastp on PE with minimal parameters" +"$meta_executable" \ + --in1 "$meta_resources_dir/test_data/pe/a.1.fastq" \ + --in2 "$meta_resources_dir/test_data/pe/a.2.fastq" \ + --out1 "trimmed_1.fastq" \ + --out2 "trimmed_2.fastq" + +echo ">> Check if output exists" +[ ! -f "trimmed_1.fastq" ] && echo ">> trimmed_1.fastq does not exist" && exit 1 +[ ! -f "trimmed_2.fastq" ] && echo ">> trimmed_2.fastq does not exist" && exit 1 + +######################################################################################### +cd .. +mkdir fastp_pe_many +cd fastp_pe_many + +echo ">> Run fastp on PE with many parameters" +"$meta_executable" \ + --in1 "$meta_resources_dir/test_data/pe/a.1.fastq" \ + --in2 "$meta_resources_dir/test_data/pe/a.2.fastq" \ + --out1 "trimmed_1.fastq" \ + --out2 "trimmed_2.fastq" \ + --failed_out "failed.fastq" \ + --json "report.json" \ + --html "report.html" \ + --adapter_sequence ACGGCTAGCTA \ + --adapter_sequence_r2 AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ + --merge \ + --merged_out "merged.fastq" + +echo ">> Check if output exists" +[ ! -f "trimmed_1.fastq" ] && echo ">> trimmed_1.fastq does not exist" && exit 1 +[ ! -f "trimmed_2.fastq" ] && echo ">> trimmed_2.fastq does not exist" && exit 1 +[ ! -f "failed.fastq" ] && echo ">> failed.fastq does not exist" && exit 1 +[ ! -f "report.json" ] && echo ">> report.json does not exist" && exit 1 +[ ! -f "report.html" ] && echo ">> report.html does not exist" && exit 1 +[ ! -f "merged.fastq" ] && echo ">> merged.fastq does not exist" && exit 1 + +######################################################################################### + +echo "> Test successful" \ No newline at end of file diff --git a/src/fastp/test_data/pe/a.1.fastq b/src/fastp/test_data/pe/a.1.fastq new file mode 100644 index 00000000..42735560 --- /dev/null +++ b/src/fastp/test_data/pe/a.1.fastq @@ -0,0 +1,4 @@ +@1 +ACGGCAT ++ +!!!!!!! diff --git a/src/fastp/test_data/pe/a.2.fastq b/src/fastp/test_data/pe/a.2.fastq new file mode 100644 index 00000000..42735560 --- /dev/null +++ b/src/fastp/test_data/pe/a.2.fastq @@ -0,0 +1,4 @@ +@1 +ACGGCAT ++ +!!!!!!! diff --git a/src/fastp/test_data/script.sh b/src/fastp/test_data/script.sh new file mode 100755 index 00000000..725eef6d --- /dev/null +++ b/src/fastp/test_data/script.sh @@ -0,0 +1,10 @@ +# fastp test data + +# Test data was obtained from https://github.com/snakemake/snakemake-wrappers/tree/master/bio/fastp/test + +if [ ! -d /tmp/snakemake-wrappers ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers +fi + +cp -r /tmp/snakemake-wrappers/bio/fastp/test/reads/* src/fastp/test_data + diff --git a/src/fastp/test_data/se/a.fastq b/src/fastp/test_data/se/a.fastq new file mode 100644 index 00000000..42735560 --- /dev/null +++ b/src/fastp/test_data/se/a.fastq @@ -0,0 +1,4 @@ +@1 +ACGGCAT ++ +!!!!!!!