From 2682d39dbfa971ad23c0df927cb45b5734ece1cc Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 27 Jun 2024 11:13:22 +0200 Subject: [PATCH 01/10] tests added --- src/seqtk/seqtk_sample/config.vsh.yaml | 54 ++++++++++++++++ src/seqtk/seqtk_sample/help.txt | 7 +++ src/seqtk/seqtk_sample/script.sh | 11 ++++ src/seqtk/seqtk_sample/test.sh | 59 ++++++++++++++++++ .../seqtk_sample/test_data/reads/a.1.fastq.gz | Bin 0 -> 100 bytes .../seqtk_sample/test_data/reads/a.2.fastq.gz | Bin 0 -> 100 bytes .../seqtk_sample/test_data/reads/a.fastq | 4 ++ .../seqtk_sample/test_data/reads/a.fastq.gz | Bin 0 -> 44 bytes .../seqtk_sample/test_data/reads/id.list | 1 + src/seqtk/seqtk_sample/test_data/script.sh | 9 +++ 10 files changed, 145 insertions(+) create mode 100644 src/seqtk/seqtk_sample/config.vsh.yaml create mode 100644 src/seqtk/seqtk_sample/help.txt create mode 100644 src/seqtk/seqtk_sample/script.sh create mode 100644 src/seqtk/seqtk_sample/test.sh create mode 100644 src/seqtk/seqtk_sample/test_data/reads/a.1.fastq.gz create mode 100644 src/seqtk/seqtk_sample/test_data/reads/a.2.fastq.gz create mode 100644 src/seqtk/seqtk_sample/test_data/reads/a.fastq create mode 100644 src/seqtk/seqtk_sample/test_data/reads/a.fastq.gz create mode 100644 src/seqtk/seqtk_sample/test_data/reads/id.list create mode 100755 src/seqtk/seqtk_sample/test_data/script.sh diff --git a/src/seqtk/seqtk_sample/config.vsh.yaml b/src/seqtk/seqtk_sample/config.vsh.yaml new file mode 100644 index 00000000..9f82c03a --- /dev/null +++ b/src/seqtk/seqtk_sample/config.vsh.yaml @@ -0,0 +1,54 @@ +name: seqtk_sample +namespace: seqtk +description: Subsamples sequences from FASTA/Q files. +keywords: [tag1, tag2] +links: + repository: https://github.com/lh3/seqtk/tree/v1.4 +license: MIT + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: The input FASTA/Q file. + required: true + + - name: Outputs + arguments: + - name: --output + type: file + description: The output FASTA/Q file. + required: true + direction: output + + - name: Options + arguments: + - name: --seed + type: integer + description: Seed for random generator. + default: 42 + - name: --fraction_number + type: double + description: Fraction or number of sequences to sample. + default: 0.1 + - name: --two_pass_mode + type: boolean + description: twice as slow but with much reduced memory + default: false + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: test_data + +engines: + - type: docker + image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/help.txt b/src/seqtk/seqtk_sample/help.txt new file mode 100644 index 00000000..1ca78811 --- /dev/null +++ b/src/seqtk/seqtk_sample/help.txt @@ -0,0 +1,7 @@ +``` +seqtk_sample +``` +Usage: seqtk sample [-2] [-s seed=11] | > + +Options: -s INT RNG seed [11] + -2 2-pass mode: twice as slow but with much reduced memory diff --git a/src/seqtk/seqtk_sample/script.sh b/src/seqtk/seqtk_sample/script.sh new file mode 100644 index 00000000..3ac9ca71 --- /dev/null +++ b/src/seqtk/seqtk_sample/script.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +seqtk sample \ + ${par_2_pass_mode:+-2} \ + ${par_seed:+-s "$par_seed"} \ + "$par_input" \ + "$par_fraction_number" \ + > "$par_output" \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/test.sh b/src/seqtk/seqtk_sample/test.sh new file mode 100644 index 00000000..d8594724 --- /dev/null +++ b/src/seqtk/seqtk_sample/test.sh @@ -0,0 +1,59 @@ +#!/bin/bash + +set -e + +## VIASH START +meta_executable="target/executable/seqtk/seqtk_sample" +meta_resources_dir="src/seqtk/seqtk_sample" +## VIASH END + +######################################################################################### +mkdir seqtk_sample_se +cd seqtk_sample_se + +echo "> Run seqtk_sample on SE with fastq" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.fastq" \ + --seed 42 \ + --fraction_number 3 \ + --output "sampled.fastq" + +echo ">> Check if output exists" +if [ ! -f "sampled.fastq" ]; then + echo ">> sampled.fastq.gz does not exist" + exit 1 +fi + +cat sampled.fastq + +######################################################################################### +cd .. +mkdir seqtk_sample_pe +cd seqtk_sample_pe + +echo ">> Run seqtk_sample on PE with fastq.gz" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.1.fastq.gz" \ + --seed 42 \ + --fraction_number 3 \ + --output "sampled_1.fastq" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.2.fastq.gz" \ + --seed 42 \ + --fraction_number 3 \ + --output "sampled_2.fastq" + +echo ">> Check if output exists" +if [ ! -f "sampled_1.fastq" ] || [ ! -f "sampled_2.fastq" ]; then + echo ">> One or both output files do not exist" + exit 1 +fi + +echo ">> Compare reads" +# Extract headers +headers1=$(grep '^@' sampled_1.fastq | sed -e's/ 1$//' | sort) +headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) + +# Compare headers +diff <(echo "$headers1") <(echo "$headers2") || echo "Mismatch detected" && exit 1 diff --git a/src/seqtk/seqtk_sample/test_data/reads/a.1.fastq.gz b/src/seqtk/seqtk_sample/test_data/reads/a.1.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..97a72ce5d48317556a145f93c32c87f0e9e5500f GIT binary patch literal 100 zcmV-q0Gt0GiwFRnrn+7N10Bw(6~jOf1wpPTJlJGMx7b%C&OZykT2i1C{p;n6 zLRM|nP{^ij8VcF9T|*&&2 literal 0 HcmV?d00001 diff --git a/src/seqtk/seqtk_sample/test_data/reads/a.2.fastq.gz b/src/seqtk/seqtk_sample/test_data/reads/a.2.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..038bc976ac32e8f26be16949bf5632c7090e635b GIT binary patch literal 100 zcmV-q0Gt0GiwFRnrn+7N10Bw*5yUVM1wrm8Zt)RG{ Date: Thu, 27 Jun 2024 11:20:58 +0200 Subject: [PATCH 02/10] tests extended --- src/seqtk/seqtk_sample/test.sh | 45 ++++++++++++++++++++++++++++------ 1 file changed, 38 insertions(+), 7 deletions(-) diff --git a/src/seqtk/seqtk_sample/test.sh b/src/seqtk/seqtk_sample/test.sh index d8594724..f9d942eb 100644 --- a/src/seqtk/seqtk_sample/test.sh +++ b/src/seqtk/seqtk_sample/test.sh @@ -11,7 +11,7 @@ meta_resources_dir="src/seqtk/seqtk_sample" mkdir seqtk_sample_se cd seqtk_sample_se -echo "> Run seqtk_sample on SE with fastq" +echo "> Run seqtk_sample on fastq SE" "$meta_executable" \ --input "$meta_resources_dir/test_data/reads/a.fastq" \ --seed 42 \ @@ -24,14 +24,12 @@ if [ ! -f "sampled.fastq" ]; then exit 1 fi -cat sampled.fastq - ######################################################################################### cd .. -mkdir seqtk_sample_pe -cd seqtk_sample_pe +mkdir seqtk_sample_pe_number +cd seqtk_sample_pe_number -echo ">> Run seqtk_sample on PE with fastq.gz" +echo ">> Run seqtk_sample on fastq.gz PE with number of reads" "$meta_executable" \ --input "$meta_resources_dir/test_data/reads/a.1.fastq.gz" \ --seed 42 \ @@ -56,4 +54,37 @@ headers1=$(grep '^@' sampled_1.fastq | sed -e's/ 1$//' | sort) headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) # Compare headers -diff <(echo "$headers1") <(echo "$headers2") || echo "Mismatch detected" && exit 1 +diff <(echo "$headers1") <(echo "$headers2") || { echo "Mismatch detected"; exit 1; } + +######################################################################################### +cd .. +mkdir seqtk_sample_pe_fraction +cd seqtk_sample_pe_fraction + +echo ">> Run seqtk_sample on fastq.gz PE with fraction of reads" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.1.fastq.gz" \ + --seed 42 \ + --fraction_number 0.5 \ + --output "sampled_1.fastq" + +"$meta_executable" \ + --input "$meta_resources_dir/test_data/reads/a.2.fastq.gz" \ + --seed 42 \ + --fraction_number 0.5 \ + --output "sampled_2.fastq" + +echo ">> Check if output exists" +if [ ! -f "sampled_1.fastq" ] || [ ! -f "sampled_2.fastq" ]; then + echo ">> One or both output files do not exist" + exit 1 +fi + +echo ">> Compare reads" +# Extract headers +headers1=$(grep '^@' sampled_1.fastq | sed -e's/ 1$//' | sort) +headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) + +# Compare headers +diff <(echo "$headers1") <(echo "$headers2") || { echo "Mismatch detected"; exit 1; } + From 796c3bcdbb5f5fa578686cd33c28817d83f04427 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 27 Jun 2024 11:25:46 +0200 Subject: [PATCH 03/10] changelog entry added --- CHANGELOG.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99c859d2..dc37f826 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,6 +50,9 @@ * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). +* `seqtk/seqtk_sample`: Sample sequences from FASTA/Q(.gz) files +to FASTA/Q (PR #68). + ## MAJOR CHANGES From 51c69e9d1d5a6233e3d9e8f954b2a4d792d19646 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Fri, 5 Jul 2024 14:56:54 +0200 Subject: [PATCH 04/10] reorganized seqtk namespace + added seqtk subseq config and script --- src/seqtk/seqtk_sample/config.vsh.yaml | 2 +- src/seqtk/seqtk_sample/script.sh | 2 +- src/seqtk/seqtk_sample/test.sh | 2 +- src/seqtk/seqtk_subseq/config.vsh.yaml | 60 ++++++++++++++++++ src/seqtk/seqtk_subseq/help.txt | 7 ++ src/seqtk/seqtk_subseq/script.sh | 11 ++++ .../test_data/reads/a.1.fastq.gz | Bin .../test_data/reads/a.2.fastq.gz | Bin .../test_data/reads/a.fastq | 0 .../test_data/reads/a.fastq.gz | Bin .../test_data/reads/id.list | 0 .../{seqtk_sample => }/test_data/script.sh | 4 +- 12 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 src/seqtk/seqtk_subseq/config.vsh.yaml create mode 100644 src/seqtk/seqtk_subseq/help.txt create mode 100644 src/seqtk/seqtk_subseq/script.sh rename src/seqtk/{seqtk_sample => }/test_data/reads/a.1.fastq.gz (100%) rename src/seqtk/{seqtk_sample => }/test_data/reads/a.2.fastq.gz (100%) rename src/seqtk/{seqtk_sample => }/test_data/reads/a.fastq (100%) rename src/seqtk/{seqtk_sample => }/test_data/reads/a.fastq.gz (100%) rename src/seqtk/{seqtk_sample => }/test_data/reads/id.list (100%) rename src/seqtk/{seqtk_sample => }/test_data/script.sh (61%) diff --git a/src/seqtk/seqtk_sample/config.vsh.yaml b/src/seqtk/seqtk_sample/config.vsh.yaml index 9f82c03a..63c3271a 100644 --- a/src/seqtk/seqtk_sample/config.vsh.yaml +++ b/src/seqtk/seqtk_sample/config.vsh.yaml @@ -44,7 +44,7 @@ test_resources: - type: bash_script path: test.sh - type: file - path: test_data + path: ../test_data engines: - type: docker diff --git a/src/seqtk/seqtk_sample/script.sh b/src/seqtk/seqtk_sample/script.sh index 3ac9ca71..01d981b3 100644 --- a/src/seqtk/seqtk_sample/script.sh +++ b/src/seqtk/seqtk_sample/script.sh @@ -4,7 +4,7 @@ ## VIASH END seqtk sample \ - ${par_2_pass_mode:+-2} \ + ${par_two_pass_mode:+-2} \ ${par_seed:+-s "$par_seed"} \ "$par_input" \ "$par_fraction_number" \ diff --git a/src/seqtk/seqtk_sample/test.sh b/src/seqtk/seqtk_sample/test.sh index f9d942eb..304c2f31 100644 --- a/src/seqtk/seqtk_sample/test.sh +++ b/src/seqtk/seqtk_sample/test.sh @@ -4,7 +4,7 @@ set -e ## VIASH START meta_executable="target/executable/seqtk/seqtk_sample" -meta_resources_dir="src/seqtk/seqtk_sample" +meta_resources_dir="src/seqtk" ## VIASH END ######################################################################################### diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml new file mode 100644 index 00000000..0cec1761 --- /dev/null +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -0,0 +1,60 @@ +name: seqtk_subseq +namespace: seqtk +description: +keywords: [tag1, tag2] +links: + repository: https://github.com/lh3/seqtk/tree/v1.4 +license: MIT + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: The input FASTA/Q file. + required: true + - name: "--regions_file" + type: file + description: | + File with regions to extract. Can be either a list file + with one sequence name per line or a bed file. + required: true + + - name: Outputs + arguments: + - name: --output + type: file + description: The output FASTA/Q file. + required: true + direction: output + + - name: Options + arguments: + - name: "--tab" + type: boolean + description: Output in tab-delimited format. + default: false + - name: "--strand_aware" + type: boolean + description: Strand-aware mode. + default: false + - name: "--line_length" + type: integer + description: Number of bases per line. + default: 60 + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: ../test_data + +engines: + - type: docker + image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/help.txt b/src/seqtk/seqtk_subseq/help.txt new file mode 100644 index 00000000..1ca78811 --- /dev/null +++ b/src/seqtk/seqtk_subseq/help.txt @@ -0,0 +1,7 @@ +``` +seqtk_sample +``` +Usage: seqtk sample [-2] [-s seed=11] | > + +Options: -s INT RNG seed [11] + -2 2-pass mode: twice as slow but with much reduced memory diff --git a/src/seqtk/seqtk_subseq/script.sh b/src/seqtk/seqtk_subseq/script.sh new file mode 100644 index 00000000..a04ebbaa --- /dev/null +++ b/src/seqtk/seqtk_subseq/script.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +seqtk sample \ + ${par_tab:+-t} \ + ${par_strand_aware:+-s} \ + ${par_line_length:+-l "$par_line_length"} \ + "$par_input" \ + > "$par_output" \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/test_data/reads/a.1.fastq.gz b/src/seqtk/test_data/reads/a.1.fastq.gz similarity index 100% rename from src/seqtk/seqtk_sample/test_data/reads/a.1.fastq.gz rename to src/seqtk/test_data/reads/a.1.fastq.gz diff --git a/src/seqtk/seqtk_sample/test_data/reads/a.2.fastq.gz b/src/seqtk/test_data/reads/a.2.fastq.gz similarity index 100% rename from src/seqtk/seqtk_sample/test_data/reads/a.2.fastq.gz rename to src/seqtk/test_data/reads/a.2.fastq.gz diff --git a/src/seqtk/seqtk_sample/test_data/reads/a.fastq b/src/seqtk/test_data/reads/a.fastq similarity index 100% rename from src/seqtk/seqtk_sample/test_data/reads/a.fastq rename to src/seqtk/test_data/reads/a.fastq diff --git a/src/seqtk/seqtk_sample/test_data/reads/a.fastq.gz b/src/seqtk/test_data/reads/a.fastq.gz similarity index 100% rename from src/seqtk/seqtk_sample/test_data/reads/a.fastq.gz rename to src/seqtk/test_data/reads/a.fastq.gz diff --git a/src/seqtk/seqtk_sample/test_data/reads/id.list b/src/seqtk/test_data/reads/id.list similarity index 100% rename from src/seqtk/seqtk_sample/test_data/reads/id.list rename to src/seqtk/test_data/reads/id.list diff --git a/src/seqtk/seqtk_sample/test_data/script.sh b/src/seqtk/test_data/script.sh similarity index 61% rename from src/seqtk/seqtk_sample/test_data/script.sh rename to src/seqtk/test_data/script.sh index cd3f2360..049093cd 100755 --- a/src/seqtk/seqtk_sample/test_data/script.sh +++ b/src/seqtk/test_data/script.sh @@ -4,6 +4,6 @@ if [ ! -d /tmp/snakemake-wrappers ]; then fi # copy test data -cp -r /tmp/snakemake-wrappers/bio/seqtk/test/* src/seqtk/seqtk_sample/test_data +cp -r /tmp/snakemake-wrappers/bio/seqtk/test/* src/seqtk/test_data -rm src/seqtk/seqtk_sample/test_data/Snakefile +rm src/seqtk/test_data/Snakefile From e2adfd660abd4695518549aa33e4a0be8ee25cd7 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Fri, 5 Jul 2024 15:02:50 +0200 Subject: [PATCH 05/10] added subseq help.txt --- src/seqtk/seqtk_sample/help.txt | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/seqtk/seqtk_sample/help.txt b/src/seqtk/seqtk_sample/help.txt index 1ca78811..49f8001b 100644 --- a/src/seqtk/seqtk_sample/help.txt +++ b/src/seqtk/seqtk_sample/help.txt @@ -1,7 +1,9 @@ ``` -seqtk_sample +seqtk_subseq ``` -Usage: seqtk sample [-2] [-s seed=11] | > - -Options: -s INT RNG seed [11] - -2 2-pass mode: twice as slow but with much reduced memory +Usage: seqtk subseq [options] | +Options: + -t TAB delimited output + -s strand aware + -l INT sequence line length [0] +Note: Use 'samtools faidx' if only a few regions are intended. \ No newline at end of file From 85231b406f8140e48bd7ddf7b14b217b302153c1 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 11 Jul 2024 10:00:01 +0200 Subject: [PATCH 06/10] revert to seqtk sample only --- src/seqtk/seqtk_sample/config.vsh.yaml | 2 +- src/seqtk/seqtk_sample/script.sh | 2 +- src/seqtk/seqtk_sample/test.sh | 2 +- .../{seqtk_sample => }/test_data/reads/a.1.fastq.gz | Bin .../{seqtk_sample => }/test_data/reads/a.2.fastq.gz | Bin .../{seqtk_sample => }/test_data/reads/a.fastq | 0 .../{seqtk_sample => }/test_data/reads/a.fastq.gz | Bin .../{seqtk_sample => }/test_data/reads/id.list | 0 src/seqtk/{seqtk_sample => }/test_data/script.sh | 4 ++-- 9 files changed, 5 insertions(+), 5 deletions(-) rename src/seqtk/{seqtk_sample => }/test_data/reads/a.1.fastq.gz (100%) rename src/seqtk/{seqtk_sample => }/test_data/reads/a.2.fastq.gz (100%) rename src/seqtk/{seqtk_sample => }/test_data/reads/a.fastq (100%) rename src/seqtk/{seqtk_sample => }/test_data/reads/a.fastq.gz (100%) rename src/seqtk/{seqtk_sample => }/test_data/reads/id.list (100%) rename src/seqtk/{seqtk_sample => }/test_data/script.sh (61%) diff --git a/src/seqtk/seqtk_sample/config.vsh.yaml b/src/seqtk/seqtk_sample/config.vsh.yaml index 9f82c03a..63c3271a 100644 --- a/src/seqtk/seqtk_sample/config.vsh.yaml +++ b/src/seqtk/seqtk_sample/config.vsh.yaml @@ -44,7 +44,7 @@ test_resources: - type: bash_script path: test.sh - type: file - path: test_data + path: ../test_data engines: - type: docker diff --git a/src/seqtk/seqtk_sample/script.sh b/src/seqtk/seqtk_sample/script.sh index 3ac9ca71..01d981b3 100644 --- a/src/seqtk/seqtk_sample/script.sh +++ b/src/seqtk/seqtk_sample/script.sh @@ -4,7 +4,7 @@ ## VIASH END seqtk sample \ - ${par_2_pass_mode:+-2} \ + ${par_two_pass_mode:+-2} \ ${par_seed:+-s "$par_seed"} \ "$par_input" \ "$par_fraction_number" \ diff --git a/src/seqtk/seqtk_sample/test.sh b/src/seqtk/seqtk_sample/test.sh index f9d942eb..304c2f31 100644 --- a/src/seqtk/seqtk_sample/test.sh +++ b/src/seqtk/seqtk_sample/test.sh @@ -4,7 +4,7 @@ set -e ## VIASH START meta_executable="target/executable/seqtk/seqtk_sample" -meta_resources_dir="src/seqtk/seqtk_sample" +meta_resources_dir="src/seqtk" ## VIASH END ######################################################################################### diff --git a/src/seqtk/seqtk_sample/test_data/reads/a.1.fastq.gz b/src/seqtk/test_data/reads/a.1.fastq.gz similarity index 100% rename from src/seqtk/seqtk_sample/test_data/reads/a.1.fastq.gz rename to src/seqtk/test_data/reads/a.1.fastq.gz diff --git a/src/seqtk/seqtk_sample/test_data/reads/a.2.fastq.gz b/src/seqtk/test_data/reads/a.2.fastq.gz similarity index 100% rename from src/seqtk/seqtk_sample/test_data/reads/a.2.fastq.gz rename to src/seqtk/test_data/reads/a.2.fastq.gz diff --git a/src/seqtk/seqtk_sample/test_data/reads/a.fastq b/src/seqtk/test_data/reads/a.fastq similarity index 100% rename from src/seqtk/seqtk_sample/test_data/reads/a.fastq rename to src/seqtk/test_data/reads/a.fastq diff --git a/src/seqtk/seqtk_sample/test_data/reads/a.fastq.gz b/src/seqtk/test_data/reads/a.fastq.gz similarity index 100% rename from src/seqtk/seqtk_sample/test_data/reads/a.fastq.gz rename to src/seqtk/test_data/reads/a.fastq.gz diff --git a/src/seqtk/seqtk_sample/test_data/reads/id.list b/src/seqtk/test_data/reads/id.list similarity index 100% rename from src/seqtk/seqtk_sample/test_data/reads/id.list rename to src/seqtk/test_data/reads/id.list diff --git a/src/seqtk/seqtk_sample/test_data/script.sh b/src/seqtk/test_data/script.sh similarity index 61% rename from src/seqtk/seqtk_sample/test_data/script.sh rename to src/seqtk/test_data/script.sh index cd3f2360..049093cd 100755 --- a/src/seqtk/seqtk_sample/test_data/script.sh +++ b/src/seqtk/test_data/script.sh @@ -4,6 +4,6 @@ if [ ! -d /tmp/snakemake-wrappers ]; then fi # copy test data -cp -r /tmp/snakemake-wrappers/bio/seqtk/test/* src/seqtk/seqtk_sample/test_data +cp -r /tmp/snakemake-wrappers/bio/seqtk/test/* src/seqtk/test_data -rm src/seqtk/seqtk_sample/test_data/Snakefile +rm src/seqtk/test_data/Snakefile From dd9828ec0673ff31e96454bfb79f573e16b05bb8 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 11 Jul 2024 10:04:53 +0200 Subject: [PATCH 07/10] remove subseq --- src/seqtk/seqtk_subseq/config.vsh.yaml | 60 -------------------------- src/seqtk/seqtk_subseq/help.txt | 7 --- src/seqtk/seqtk_subseq/script.sh | 11 ----- 3 files changed, 78 deletions(-) delete mode 100644 src/seqtk/seqtk_subseq/config.vsh.yaml delete mode 100644 src/seqtk/seqtk_subseq/help.txt delete mode 100644 src/seqtk/seqtk_subseq/script.sh diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml deleted file mode 100644 index 0cec1761..00000000 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ /dev/null @@ -1,60 +0,0 @@ -name: seqtk_subseq -namespace: seqtk -description: -keywords: [tag1, tag2] -links: - repository: https://github.com/lh3/seqtk/tree/v1.4 -license: MIT - -argument_groups: - - name: Inputs - arguments: - - name: --input - type: file - description: The input FASTA/Q file. - required: true - - name: "--regions_file" - type: file - description: | - File with regions to extract. Can be either a list file - with one sequence name per line or a bed file. - required: true - - - name: Outputs - arguments: - - name: --output - type: file - description: The output FASTA/Q file. - required: true - direction: output - - - name: Options - arguments: - - name: "--tab" - type: boolean - description: Output in tab-delimited format. - default: false - - name: "--strand_aware" - type: boolean - description: Strand-aware mode. - default: false - - name: "--line_length" - type: integer - description: Number of bases per line. - default: 60 - -resources: - - type: bash_script - path: script.sh -test_resources: - - type: bash_script - path: test.sh - - type: file - path: ../test_data - -engines: - - type: docker - image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 -runners: - - type: executable - - type: nextflow \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/help.txt b/src/seqtk/seqtk_subseq/help.txt deleted file mode 100644 index 1ca78811..00000000 --- a/src/seqtk/seqtk_subseq/help.txt +++ /dev/null @@ -1,7 +0,0 @@ -``` -seqtk_sample -``` -Usage: seqtk sample [-2] [-s seed=11] | > - -Options: -s INT RNG seed [11] - -2 2-pass mode: twice as slow but with much reduced memory diff --git a/src/seqtk/seqtk_subseq/script.sh b/src/seqtk/seqtk_subseq/script.sh deleted file mode 100644 index a04ebbaa..00000000 --- a/src/seqtk/seqtk_subseq/script.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -## VIASH START -## VIASH END - -seqtk sample \ - ${par_tab:+-t} \ - ${par_strand_aware:+-s} \ - ${par_line_length:+-l "$par_line_length"} \ - "$par_input" \ - > "$par_output" \ No newline at end of file From 56b0dbe3bf73a87aba19a4f94cd5dae90971cf45 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Thu, 11 Jul 2024 10:39:44 +0200 Subject: [PATCH 08/10] updated tests, added tags --- CHANGELOG.md | 3 +-- src/seqtk/seqtk_sample/config.vsh.yaml | 9 +++++---- src/seqtk/seqtk_sample/test.sh | 18 ++++++++++++++++-- 3 files changed, 22 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dc37f826..8faaf0f2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -50,8 +50,7 @@ * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). -* `seqtk/seqtk_sample`: Sample sequences from FASTA/Q(.gz) files -to FASTA/Q (PR #68). +* `seqtk/seqtk_sample`: Sample sequences from FASTA/Q(.gz) files to FASTA/Q (PR #68). ## MAJOR CHANGES diff --git a/src/seqtk/seqtk_sample/config.vsh.yaml b/src/seqtk/seqtk_sample/config.vsh.yaml index 63c3271a..b95afa61 100644 --- a/src/seqtk/seqtk_sample/config.vsh.yaml +++ b/src/seqtk/seqtk_sample/config.vsh.yaml @@ -1,7 +1,7 @@ name: seqtk_sample namespace: seqtk description: Subsamples sequences from FASTA/Q files. -keywords: [tag1, tag2] +keywords: [sample, FASTA, FASTQ] links: repository: https://github.com/lh3/seqtk/tree/v1.4 license: MIT @@ -27,14 +27,15 @@ argument_groups: - name: --seed type: integer description: Seed for random generator. - default: 42 + example: 42 - name: --fraction_number type: double description: Fraction or number of sequences to sample. - default: 0.1 + required: true + example: 0.1 - name: --two_pass_mode type: boolean - description: twice as slow but with much reduced memory + description: Twice as slow but with much reduced memory default: false resources: diff --git a/src/seqtk/seqtk_sample/test.sh b/src/seqtk/seqtk_sample/test.sh index 304c2f31..cba5f613 100644 --- a/src/seqtk/seqtk_sample/test.sh +++ b/src/seqtk/seqtk_sample/test.sh @@ -13,14 +13,21 @@ cd seqtk_sample_se echo "> Run seqtk_sample on fastq SE" "$meta_executable" \ - --input "$meta_resources_dir/test_data/reads/a.fastq" \ + --input "$meta_resources_dir/test_data/reads/a.1.fastq.gz" \ --seed 42 \ --fraction_number 3 \ --output "sampled.fastq" echo ">> Check if output exists" if [ ! -f "sampled.fastq" ]; then - echo ">> sampled.fastq.gz does not exist" + echo ">> sampled.fastq does not exist" + exit 1 +fi + +echo ">> Count number of samples" +num_samples=$(grep -c '^@' sampled.fastq) +if [ "$num_samples" -ne 3 ]; then + echo ">> sampled.fastq does not contain 3 samples" exit 1 fi @@ -56,6 +63,13 @@ headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) # Compare headers diff <(echo "$headers1") <(echo "$headers2") || { echo "Mismatch detected"; exit 1; } +echo ">> Count number of samples" +num_headers=$(echo "$headers1" | wc -l) +if [ "$num_headers" -ne 3 ]; then + echo ">> sampled_1.fastq does not contain 3 headers" + exit 1 +fi + ######################################################################################### cd .. mkdir seqtk_sample_pe_fraction From cca29999584edd1f2071fb6028a80857dec8840d Mon Sep 17 00:00:00 2001 From: Jakub Majercik <57993790+jakubmajercik@users.noreply.github.com> Date: Wed, 17 Jul 2024 18:58:36 +0200 Subject: [PATCH 09/10] Update two_pass_mode Co-authored-by: Robrecht Cannoodt --- src/seqtk/seqtk_sample/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seqtk/seqtk_sample/config.vsh.yaml b/src/seqtk/seqtk_sample/config.vsh.yaml index b95afa61..3e4e9b81 100644 --- a/src/seqtk/seqtk_sample/config.vsh.yaml +++ b/src/seqtk/seqtk_sample/config.vsh.yaml @@ -34,7 +34,7 @@ argument_groups: required: true example: 0.1 - name: --two_pass_mode - type: boolean + type: boolean_true description: Twice as slow but with much reduced memory default: false From a351469bf74435d2924e93d67853e078a8b4e412 Mon Sep 17 00:00:00 2001 From: jakubmajercik Date: Wed, 17 Jul 2024 19:16:11 +0200 Subject: [PATCH 10/10] author added to config --- src/_authors/jakub_majercik.yaml | 10 ++++++++++ src/seqtk/seqtk_sample/config.vsh.yaml | 4 +++- 2 files changed, 13 insertions(+), 1 deletion(-) create mode 100644 src/_authors/jakub_majercik.yaml diff --git a/src/_authors/jakub_majercik.yaml b/src/_authors/jakub_majercik.yaml new file mode 100644 index 00000000..3b75fffe --- /dev/null +++ b/src/_authors/jakub_majercik.yaml @@ -0,0 +1,10 @@ +name: Jakub Majercik +info: + links: + email: jakub@data-intuitive.com + github: jakubmajercik + linkedin: jakubmajercik + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Bioinformatics Engineer \ No newline at end of file diff --git a/src/seqtk/seqtk_sample/config.vsh.yaml b/src/seqtk/seqtk_sample/config.vsh.yaml index 3e4e9b81..0cd369e7 100644 --- a/src/seqtk/seqtk_sample/config.vsh.yaml +++ b/src/seqtk/seqtk_sample/config.vsh.yaml @@ -5,6 +5,9 @@ keywords: [sample, FASTA, FASTQ] links: repository: https://github.com/lh3/seqtk/tree/v1.4 license: MIT +authors: + - __merge__: /src/_authors/jakub_majercik.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs @@ -36,7 +39,6 @@ argument_groups: - name: --two_pass_mode type: boolean_true description: Twice as slow but with much reduced memory - default: false resources: - type: bash_script