From 1b5d696bcd668dcfa9dfe35a0ad01530f3e91a59 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Mon, 15 Jul 2024 15:16:40 -0300 Subject: [PATCH 01/28] Config file and help.txt file Created: - config.vsh.yaml - help.txt --- src/seqtk/seqtk_subseq/config.vsh.yaml | 62 ++++++++++++++++++++++++++ src/seqtk/seqtk_subseq/help.txt | 9 ++++ 2 files changed, 71 insertions(+) create mode 100644 src/seqtk/seqtk_subseq/config.vsh.yaml create mode 100644 src/seqtk/seqtk_subseq/help.txt diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml new file mode 100644 index 00000000..fccabe3a --- /dev/null +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -0,0 +1,62 @@ +name: seqtk_subseq +namespace: seqtk +description: Extract subsequences from FASTA/Q files. Takes as input a FASTA/Q file and a name.lst (sequence ids file) or a reg.bed (genomic regions file). +keywords: [subseq, FASTA, FASTQ] +links: + repository: https://github.com/lh3/seqtk/tree/v1.4 +license: MIT + +argument_groups: + - name: Inputs + arguments: + - name: --input + type: file + description: The input FASTA/Q file. + required: true + - name: --name_list + type: file + description: List of sequence names or genomic regions to extract. + required: true + + - name: Outputs + arguments: + - name: --output + type: file + description: The output FASTA/Q file. + required: true + direction: output + + - name: Options + arguments: + - name: --tab + alternatives: -t + type: boolean_true + description: TAB delimited output. + required: false + - name: --strand_aware + alternatives: -s + type: boolean_true + description: Strand aware. + required: false + - name: --sequence_line_length + alternatives: -l + type: integer + description: sequence line length [0]. 
+ required: false + example: 60 + +resources: + - type: bash_script + path: script.sh +test_resources: + - type: bash_script + path: test.sh + - type: file + path: ../test_data + +engines: + - type: docker + image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 +runners: + - type: executable + - type: nextflow \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/help.txt b/src/seqtk/seqtk_subseq/help.txt new file mode 100644 index 00000000..5768e4ff --- /dev/null +++ b/src/seqtk/seqtk_subseq/help.txt @@ -0,0 +1,9 @@ +```bash +seqtk subseq +``` +Usage: seqtk subseq [options] | +Options: + -t TAB delimited output + -s strand aware + -l INT sequence line length [0] +Note: Use 'samtools faidx' if only a few regions are intended. \ No newline at end of file From 54b23ff7858ffad0b648b28ae8136d40fe601763 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Mon, 15 Jul 2024 15:43:17 -0300 Subject: [PATCH 02/28] Added script.sh File added: - script.sh --- src/seqtk/seqtk_subseq/script.sh | 12 ++++++++++++ src/seqtk/seqtk_subseq/test.sh | 0 2 files changed, 12 insertions(+) create mode 100644 src/seqtk/seqtk_subseq/script.sh create mode 100644 src/seqtk/seqtk_subseq/test.sh diff --git a/src/seqtk/seqtk_subseq/script.sh b/src/seqtk/seqtk_subseq/script.sh new file mode 100644 index 00000000..32053316 --- /dev/null +++ b/src/seqtk/seqtk_subseq/script.sh @@ -0,0 +1,12 @@ +#!/bin/bash + +## VIASH START +## VIASH END + +seqtk subseq \ + ${par_tab:+-t} \ + ${par_strand_aware:+-s} \ + ${par_sequence_line_length:+-l "$par_sequence_line_length"} \ + "$par_input" \ + "$par_name_list" \ + > "$par_output" \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh new file mode 100644 index 00000000..e69de29b From 4157f4b4f6412906327179fe01f532f82cee571b Mon Sep 17 00:00:00 2001 From: tgaspe Date: Mon, 15 Jul 2024 18:02:47 -0300 Subject: [PATCH 03/28] Created test.sh updates: - changes to config.vsh.yaml - created test.sh - created some test 
files Problems: - there is some error in config file that is preventing me from running the component and testing --- src/seqtk/seqtk_subseq/config.vsh.yaml | 19 ++++++++---- src/seqtk/seqtk_subseq/test.sh | 36 +++++++++++++++++++++++ src/seqtk/seqtk_subseq/test_data/input.fa | 6 ++++ src/seqtk/seqtk_subseq/test_data/input.fq | 12 ++++++++ src/seqtk/seqtk_subseq/test_data/list.lst | 2 ++ src/seqtk/seqtk_subseq/test_data/reg.bed | 3 ++ 6 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 src/seqtk/seqtk_subseq/test_data/input.fa create mode 100644 src/seqtk/seqtk_subseq/test_data/input.fq create mode 100644 src/seqtk/seqtk_subseq/test_data/list.lst create mode 100644 src/seqtk/seqtk_subseq/test_data/reg.bed diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index fccabe3a..49dc8b7f 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -1,6 +1,7 @@ name: seqtk_subseq namespace: seqtk -description: Extract subsequences from FASTA/Q files. Takes as input a FASTA/Q file and a name.lst (sequence ids file) or a reg.bed (genomic regions file). +description: | + Extract subsequences from FASTA/Q files. Takes as input a FASTA/Q file and a name.lst (sequence ids file) or a reg.bed (genomic regions file). keywords: [subseq, FASTA, FASTQ] links: repository: https://github.com/lh3/seqtk/tree/v1.4 @@ -11,20 +12,28 @@ argument_groups: arguments: - name: --input type: file + direction: input description: The input FASTA/Q file. required: true + example: input.fa + - name: --name_list type: file - description: List of sequence names or genomic regions to extract. + direction: input + description: | + List of sequence names (name.lst) or genomic regions (reg.bed) to extract. required: true + example: list.lst - name: Outputs arguments: - name: --output + alternatives: -o type: file + direction: output description: The output FASTA/Q file. 
required: true - direction: output + default: output.fa - name: Options arguments: @@ -41,9 +50,9 @@ argument_groups: - name: --sequence_line_length alternatives: -l type: integer - description: sequence line length [0]. + description: Sequence line length of input fasta file. + example: 16 required: false - example: 60 resources: - type: bash_script diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index e69de29b..6c0bb325 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +# exit on error +set -e + +## VIASH START +meta_executable="target/executable/seqtk/seqtk_subseq" +meta_resources_dir="src/seqtk" +## VIASH END + +######################################################################################### +mkdir seqtk_subseq_test +cd seqtk_subseq_test + +echo "> Run seqtk_subseq on FASTA/Q file" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/input.fa" \ + --name_list "$meta_resources_dir/test_data/list.lst" \ + --output "sub_sampled.fa" + +echo ">> Check if output exists" +if [ ! -f "sub_sampled.fa" ]; then + echo ">> sub_sampled.fa does not exist" + exit 1 +fi + +echo ">> Count number of subsamples in output" +num_samples=$(grep -c '^@' sub_sampled.fa) +if [ "$num_samples" -ne 2 ]; then + echo ">> sub_sampled.fa does not contain the 2 sub-samples" + exit 1 +fi + +######################################################################################### +# ... add more tests here ... 
+ diff --git a/src/seqtk/seqtk_subseq/test_data/input.fa b/src/seqtk/seqtk_subseq/test_data/input.fa new file mode 100644 index 00000000..72d4b4f3 --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/input.fa @@ -0,0 +1,6 @@ +>1 +ATCGATCGATCGATCG +>2 +GCTAGCTAGCTAGCTA +>3 +TTAGGCTAATCGATCG \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test_data/input.fq b/src/seqtk/seqtk_subseq/test_data/input.fq new file mode 100644 index 00000000..a127ee94 --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/input.fq @@ -0,0 +1,12 @@ +@1 +ATCGATCGATCGATCG ++ +HHHHHHHHHHHHHHHH +@2 +GCTAGCTAGCTAGCTA ++ +HHHHHHHHHHHHHHHH +@3 +TTAGGCTAATCGATCG ++ +HHHHHHHHHHHHHHHH diff --git a/src/seqtk/seqtk_subseq/test_data/list.lst b/src/seqtk/seqtk_subseq/test_data/list.lst new file mode 100644 index 00000000..2b2f2e1b --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/list.lst @@ -0,0 +1,2 @@ +1 +3 diff --git a/src/seqtk/seqtk_subseq/test_data/reg.bed b/src/seqtk/seqtk_subseq/test_data/reg.bed new file mode 100644 index 00000000..81099956 --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/reg.bed @@ -0,0 +1,3 @@ +chr1 1000 5000 feature1 0 + +chr2 2000 6000 feature2 0 - +chr3 3000 7000 feature3 0 + From 9ca70717c6094ad23d93a4abaa39700acb837b42 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Mon, 15 Jul 2024 22:56:42 -0300 Subject: [PATCH 04/28] Update on test.sh --- src/seqtk/seqtk_subseq/test.sh | 37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index 6c0bb325..dece837f 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -31,6 +31,43 @@ if [ "$num_samples" -ne 2 ]; then exit 1 fi +# echo ">> Compare reads" +# # Extract headers +# headers1=$(grep '^@' sampled_1.fastq | sed -e's/ 1$//' | sort) +# headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) + +# # Compare headers +# diff <(echo "$headers1") <(echo "$headers2") || { echo "Mismatch 
detected"; exit 1; } + ######################################################################################### # ... add more tests here ... +# test fq file + +# TODO: Figure out how the test fq file should look like and how the reg.bed file should look like + +######################################################################################### +# test tab option +echo "> Run seqtk_subseq with TAB option" +"$meta_executable" \ + --tab \ + --input "$meta_resources_dir/test_data/input.fa" \ + --name_list "$meta_resources_dir/test_data/list.lst" \ + --output "sub_sampled.fa" + +######################################################################################### +# test strand aware option +echo "> Run seqtk_subseq with Strand Aware option" +"$meta_executable" \ + --strand_aware \ + --input "$meta_resources_dir/test_data/input.fa" \ + --name_list "$meta_resources_dir/test_data/list.lst" \ + --output "sub_sampled.fa" +######################################################################################### +# test sequence line length option +echo "> Run seqtk_subseq with line length option" +"$meta_executable" \ + --sequence_line_length 16 \ + --input "$meta_resources_dir/test_data/input.fa" \ + --name_list "$meta_resources_dir/test_data/list.lst" \ + --output "sub_sampled.fa" From 999c582c3bbc629d63951da3fad2f401944e11df Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 16 Jul 2024 10:11:10 -0300 Subject: [PATCH 05/28] update --- src/seqtk/seqtk_subseq/config.vsh.yaml | 12 ++++++------ src/seqtk/seqtk_subseq/test.sh | 15 +++++++++------ 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index 49dc8b7f..575d5d3d 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -10,14 +10,14 @@ license: MIT argument_groups: - name: Inputs arguments: - - name: --input + - name: "--input" type: file direction: input description: The 
input FASTA/Q file. required: true example: input.fa - - name: --name_list + - name: "--name_list" type: file direction: input description: | @@ -27,7 +27,7 @@ argument_groups: - name: Outputs arguments: - - name: --output + - name: "--output" alternatives: -o type: file direction: output @@ -37,17 +37,17 @@ argument_groups: - name: Options arguments: - - name: --tab + - name: "--tab" alternatives: -t type: boolean_true description: TAB delimited output. required: false - - name: --strand_aware + - name: "--strand_aware" alternatives: -s type: boolean_true description: Strand aware. required: false - - name: --sequence_line_length + - name: "--sequence_line_length" alternatives: -l type: integer description: Sequence line length of input fasta file. diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index dece837f..21876e6b 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -31,13 +31,7 @@ if [ "$num_samples" -ne 2 ]; then exit 1 fi -# echo ">> Compare reads" -# # Extract headers -# headers1=$(grep '^@' sampled_1.fastq | sed -e's/ 1$//' | sort) -# headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) -# # Compare headers -# diff <(echo "$headers1") <(echo "$headers2") || { echo "Mismatch detected"; exit 1; } ######################################################################################### # ... add more tests here ... 
@@ -71,3 +65,12 @@ echo "> Run seqtk_subseq with line length option" --input "$meta_resources_dir/test_data/input.fa" \ --name_list "$meta_resources_dir/test_data/list.lst" \ --output "sub_sampled.fa" + + +# echo ">> Compare reads" +# # Extract headers +# headers1=$(grep '^@' sampled_1.fastq | sed -e's/ 1$//' | sort) +# headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) + +# # Compare headers +# diff <(echo "$headers1") <(echo "$headers2") || { echo "Mismatch detected"; exit 1; } \ No newline at end of file From 628a335700cbdba81f48daecb46406d62626bb4e Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 16 Jul 2024 10:21:06 -0300 Subject: [PATCH 06/28] Bug fixes - config required: false bug --- src/seqtk/seqtk_subseq/config.vsh.yaml | 8 ++++---- src/seqtk/seqtk_subseq/test.sh | 1 - 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index 575d5d3d..b6c2ab53 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -41,18 +41,18 @@ argument_groups: alternatives: -t type: boolean_true description: TAB delimited output. - required: false + - name: "--strand_aware" alternatives: -s type: boolean_true description: Strand aware. - required: false + - name: "--sequence_line_length" alternatives: -l type: integer description: Sequence line length of input fasta file. example: 16 - required: false + resources: - type: bash_script @@ -61,7 +61,7 @@ test_resources: - type: bash_script path: test.sh - type: file - path: ../test_data + path: test_data engines: - type: docker diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index 21876e6b..8e4c36f4 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -32,7 +32,6 @@ if [ "$num_samples" -ne 2 ]; then fi - ######################################################################################### # ... add more tests here ... 
# test fq file From 1337b0faffa96321e609ca20f9e2904d995e7c8d Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 16 Jul 2024 10:39:26 -0300 Subject: [PATCH 07/28] Update test --- src/seqtk/seqtk_subseq/test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index 8e4c36f4..87f815dd 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -34,6 +34,7 @@ fi ######################################################################################### # ... add more tests here ... +# # test fq file # TODO: Figure out how the test fq file should look like and how the reg.bed file should look like From 5b7f207caa27085d9654d0b022263905e66c81f0 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 16 Jul 2024 10:41:27 -0300 Subject: [PATCH 08/28] Update CHANGELOG.md --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 80b8b9f3..0c88fec1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -80,6 +80,8 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). +* `seqtk subseq`: (PR #) + ## MINOR CHANGES * Uniformize component metadata (PR #23). 
From 84f714e8e67e2febaf644eccf2956c7b5dab54c5 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 16 Jul 2024 15:45:23 -0300 Subject: [PATCH 09/28] Improvement on test.sh --- src/seqtk/seqtk_subseq/test.sh | 80 +++++++++---------- src/seqtk/seqtk_subseq/test_data/a.1.fastq | 40 ++++++++++ src/seqtk/seqtk_subseq/test_data/a.2.fastq | 40 ++++++++++ src/seqtk/seqtk_subseq/test_data/a.fastq | 4 + .../test_data/{list.lst => id.list} | 1 - src/seqtk/seqtk_subseq/test_data/input.fa | 6 -- src/seqtk/seqtk_subseq/test_data/input.fq | 12 --- src/seqtk/seqtk_subseq/test_data/reg.bed | 3 - src/seqtk/seqtk_subseq/test_data/script.sh | 13 +++ 9 files changed, 135 insertions(+), 64 deletions(-) create mode 100644 src/seqtk/seqtk_subseq/test_data/a.1.fastq create mode 100644 src/seqtk/seqtk_subseq/test_data/a.2.fastq create mode 100644 src/seqtk/seqtk_subseq/test_data/a.fastq rename src/seqtk/seqtk_subseq/test_data/{list.lst => id.list} (50%) delete mode 100644 src/seqtk/seqtk_subseq/test_data/input.fa delete mode 100644 src/seqtk/seqtk_subseq/test_data/input.fq delete mode 100644 src/seqtk/seqtk_subseq/test_data/reg.bed create mode 100644 src/seqtk/seqtk_subseq/test_data/script.sh diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index 87f815dd..28b08360 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -12,65 +12,61 @@ meta_resources_dir="src/seqtk" mkdir seqtk_subseq_test cd seqtk_subseq_test -echo "> Run seqtk_subseq on FASTA/Q file" +echo "> Run seqtk_subseq on FASTA file" "$meta_executable" \ - --input "$meta_resources_dir/test_data/input.fa" \ - --name_list "$meta_resources_dir/test_data/list.lst" \ - --output "sub_sampled.fa" + --input "$meta_resources_dir/test_data/a.1.fastq.gz" \ + --name_list "$meta_resources_dir/test_data/id.list" \ + --output "sub_sample.fq" echo ">> Check if output exists" -if [ ! -f "sub_sampled.fa" ]; then - echo ">> sub_sampled.fa does not exist" +if [ ! 
-f "sub_sample.fq" ]; then + echo ">> sub_sample.fq does not exist" exit 1 fi -echo ">> Count number of subsamples in output" -num_samples=$(grep -c '^@' sub_sampled.fa) -if [ "$num_samples" -ne 2 ]; then - echo ">> sub_sampled.fa does not contain the 2 sub-samples" +echo ">> Check number of lines in output" +n_lines=$(wc -l < sub_sample.fq) +n_lines=$(echo "$n_lines" | awk '{print $1}') + +if [ "$n_lines" -ne 2 ]; then + echo ">> sub_sample.fq does not contain exactly two lines" exit 1 fi - -######################################################################################### -# ... add more tests here ... -# -# test fq file - -# TODO: Figure out how the test fq file should look like and how the reg.bed file should look like +echo ">> Check content in output" +result=$(sed -n '2p' sub_sample.fq) +expected=$(sed -n '2p' "$meta_resources_dir/test_data/a.1.fastq") +if [ "$result" == "$expected" ]; then + echo "--> content are equal" +else + echo "--> content are not equal" +fi ######################################################################################### # test tab option -echo "> Run seqtk_subseq with TAB option" -"$meta_executable" \ - --tab \ - --input "$meta_resources_dir/test_data/input.fa" \ - --name_list "$meta_resources_dir/test_data/list.lst" \ - --output "sub_sampled.fa" +# echo "> Run seqtk_subseq with TAB option" +# "$meta_executable" \ +# --tab \ +# --input "$meta_resources_dir/test_data/input.fa" \ +# --name_list "$meta_resources_dir/test_data/list.lst" \ +# --output "sub_sampled.fa" ######################################################################################### # test strand aware option -echo "> Run seqtk_subseq with Strand Aware option" -"$meta_executable" \ - --strand_aware \ - --input "$meta_resources_dir/test_data/input.fa" \ - --name_list "$meta_resources_dir/test_data/list.lst" \ - --output "sub_sampled.fa" +# echo "> Run seqtk_subseq with Strand Aware option" +# "$meta_executable" \ +# --strand_aware \ +# --input 
"$meta_resources_dir/test_data/input.fa" \ +# --name_list "$meta_resources_dir/test_data/list.lst" \ +# --output "sub_sampled.fa" ######################################################################################### # test sequence line length option -echo "> Run seqtk_subseq with line length option" -"$meta_executable" \ - --sequence_line_length 16 \ - --input "$meta_resources_dir/test_data/input.fa" \ - --name_list "$meta_resources_dir/test_data/list.lst" \ - --output "sub_sampled.fa" - +# echo "> Run seqtk_subseq with line length option" +# "$meta_executable" \ +# --sequence_line_length 16 \ +# --input "$meta_resources_dir/test_data/input.fa" \ +# --name_list "$meta_resources_dir/test_data/list.lst" \ +# --output "sub_sampled.fa" -# echo ">> Compare reads" -# # Extract headers -# headers1=$(grep '^@' sampled_1.fastq | sed -e's/ 1$//' | sort) -# headers2=$(grep '^@' sampled_2.fastq | sed -e 's/ 2$//' | sort) -# # Compare headers -# diff <(echo "$headers1") <(echo "$headers2") || { echo "Mismatch detected"; exit 1; } \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test_data/a.1.fastq b/src/seqtk/seqtk_subseq/test_data/a.1.fastq new file mode 100644 index 00000000..458b2a9c --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/a.1.fastq @@ -0,0 +1,40 @@ +@1 1 +A1_1 ++1_1 +!1_1 +@2 1 +A2_1 ++2_1 +!2_1 +@3 1 +A3_1 ++3_1 +!3_1 +@4 1 +A4_1 ++4_1 +!4_1 +@5 1 +A5_1 ++5_1 +!5_1 +@6 1 +A6_1 ++6_1 +!6_1 +@7 1 +A7_1 ++7_1 +!7_1 +@8 1 +A8_1 ++8_1 +!8_1 +@9 1 +A9_1 ++9_1 +!9_1 +@10 1 +A10_1 ++10_1 +!10_1 diff --git a/src/seqtk/seqtk_subseq/test_data/a.2.fastq b/src/seqtk/seqtk_subseq/test_data/a.2.fastq new file mode 100644 index 00000000..50d3ce80 --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/a.2.fastq @@ -0,0 +1,40 @@ +@1 2 +A1_2 ++1_2 +!1_2 +@2 2 +A2_2 ++2_2 +!2_2 +@3 2 +A3_2 ++3_2 +!3_2 +@4 2 +A4_2 ++4_2 +!4_2 +@5 2 +A5_2 ++5_2 +!5_2 +@6 2 +A6_2 ++6_2 +!6_2 +@7 2 +A7_2 ++7_2 +!7_2 +@8 2 +A8_2 ++8_2 +!8_2 +@9 2 +A9_2 ++9_2 +!9_2 +@10 2 +A10_2 
++10_2 +!10_2 diff --git a/src/seqtk/seqtk_subseq/test_data/a.fastq b/src/seqtk/seqtk_subseq/test_data/a.fastq new file mode 100644 index 00000000..42735560 --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/a.fastq @@ -0,0 +1,4 @@ +@1 +ACGGCAT ++ +!!!!!!! diff --git a/src/seqtk/seqtk_subseq/test_data/list.lst b/src/seqtk/seqtk_subseq/test_data/id.list similarity index 50% rename from src/seqtk/seqtk_subseq/test_data/list.lst rename to src/seqtk/seqtk_subseq/test_data/id.list index 2b2f2e1b..d00491fd 100644 --- a/src/seqtk/seqtk_subseq/test_data/list.lst +++ b/src/seqtk/seqtk_subseq/test_data/id.list @@ -1,2 +1 @@ 1 -3 diff --git a/src/seqtk/seqtk_subseq/test_data/input.fa b/src/seqtk/seqtk_subseq/test_data/input.fa deleted file mode 100644 index 72d4b4f3..00000000 --- a/src/seqtk/seqtk_subseq/test_data/input.fa +++ /dev/null @@ -1,6 +0,0 @@ ->1 -ATCGATCGATCGATCG ->2 -GCTAGCTAGCTAGCTA ->3 -TTAGGCTAATCGATCG \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test_data/input.fq b/src/seqtk/seqtk_subseq/test_data/input.fq deleted file mode 100644 index a127ee94..00000000 --- a/src/seqtk/seqtk_subseq/test_data/input.fq +++ /dev/null @@ -1,12 +0,0 @@ -@1 -ATCGATCGATCGATCG -+ -HHHHHHHHHHHHHHHH -@2 -GCTAGCTAGCTAGCTA -+ -HHHHHHHHHHHHHHHH -@3 -TTAGGCTAATCGATCG -+ -HHHHHHHHHHHHHHHH diff --git a/src/seqtk/seqtk_subseq/test_data/reg.bed b/src/seqtk/seqtk_subseq/test_data/reg.bed deleted file mode 100644 index 81099956..00000000 --- a/src/seqtk/seqtk_subseq/test_data/reg.bed +++ /dev/null @@ -1,3 +0,0 @@ -chr1 1000 5000 feature1 0 + -chr2 2000 6000 feature2 0 - -chr3 3000 7000 feature3 0 + diff --git a/src/seqtk/seqtk_subseq/test_data/script.sh b/src/seqtk/seqtk_subseq/test_data/script.sh new file mode 100644 index 00000000..b2d825f5 --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/script.sh @@ -0,0 +1,13 @@ +# clone repo +if [ ! 
-d /tmp/snakemake-wrappers ]; then + git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers +fi + +# copy test data +cp -r /tmp/snakemake-wrappers/bio/seqtk/test/reads/* src/seqtk/seqtk_subseq/test_data + +# remove a.fastq file +rm src/seqtk/seqtk_subseq/test_data/a.fastq + +# unzip fastq files +gunzip src/seqtk/seqtk_subseq/test_data/*.gz From 88cf6d74af5a198003e6909c368ccd147c62e7eb Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 16 Jul 2024 17:56:15 -0300 Subject: [PATCH 10/28] Added more test I tried out different option of the command with different fasta and fastq files and different list, but the output does not seem to change. --- src/seqtk/seqtk_subseq/config.vsh.yaml | 2 +- src/seqtk/seqtk_subseq/test.sh | 132 +++++++++++++++---- src/seqtk/seqtk_subseq/test_data/id2.list | 2 + src/seqtk/seqtk_subseq/test_data/ids.txt | 2 + src/seqtk/seqtk_subseq/test_data/input.fasta | 15 +++ 5 files changed, 127 insertions(+), 26 deletions(-) create mode 100644 src/seqtk/seqtk_subseq/test_data/id2.list create mode 100644 src/seqtk/seqtk_subseq/test_data/ids.txt create mode 100644 src/seqtk/seqtk_subseq/test_data/input.fasta diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index b6c2ab53..99aa92bb 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -68,4 +68,4 @@ engines: image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 runners: - type: executable - - type: nextflow \ No newline at end of file + - type: nextflow diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index 28b08360..ca89fa12 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -9,12 +9,12 @@ meta_resources_dir="src/seqtk" ## VIASH END ######################################################################################### -mkdir seqtk_subseq_test -cd seqtk_subseq_test +mkdir test1 +cd test1 
-echo "> Run seqtk_subseq on FASTA file" +echo "> Run seqtk_subseq on FASTA/Q file" "$meta_executable" \ - --input "$meta_resources_dir/test_data/a.1.fastq.gz" \ + --input "$meta_resources_dir/test_data/a.1.fastq" \ --name_list "$meta_resources_dir/test_data/id.list" \ --output "sub_sample.fq" @@ -43,30 +43,112 @@ else fi ######################################################################################### -# test tab option -# echo "> Run seqtk_subseq with TAB option" -# "$meta_executable" \ -# --tab \ -# --input "$meta_resources_dir/test_data/input.fa" \ -# --name_list "$meta_resources_dir/test_data/list.lst" \ -# --output "sub_sampled.fa" +# -- tab option -- +cd .. +mkdir test2 +cd test2 + +echo "> Run seqtk_subseq with TAB option" +"$meta_executable" \ + --tab \ + --input "$meta_resources_dir/test_data/a.1.fastq" \ + --name_list "$meta_resources_dir/test_data/id.list" \ + --output "sub_sample.fq" + +echo ">> Check if output exists" +if [ ! -f "sub_sample.fq" ]; then + echo ">> sub_sample.fq does not exist" + exit 1 +fi + +echo ">> Check number of lines in output" +n_lines=$(wc -l < sub_sample.fq) +n_lines=$(echo "$n_lines" | awk '{print $1}') + +if [ "$n_lines" -ne 2 ]; then + echo ">> sub_sample.fq does not contain exactly two lines" + exit 1 +fi + +echo ">> Check content in output" +result=$(sed -n '2p' sub_sample.fq) +expected=$(sed -n '2p' "$meta_resources_dir/test_data/a.1.fastq") +if [ "$result" == "$expected" ]; then + echo "--> content are equal" +else + echo "--> content are not equal" +fi ######################################################################################### -# test strand aware option -# echo "> Run seqtk_subseq with Strand Aware option" -# "$meta_executable" \ -# --strand_aware \ -# --input "$meta_resources_dir/test_data/input.fa" \ -# --name_list "$meta_resources_dir/test_data/list.lst" \ -# --output "sub_sampled.fa" +# -- strand aware option -- +cd .. 
+mkdir test3 +cd test3 +echo "> Run seqtk_subseq with Strand Aware option" + +"$meta_executable" \ + --strand_aware \ + --input "$meta_resources_dir/test_data/a.1.fastq" \ + --name_list "$meta_resources_dir/test_data/id.list" \ + --output "sub_sample.fq" + +echo ">> Check if output exists" +if [ ! -f "sub_sample.fq" ]; then + echo ">> sub_sample.fq does not exist" + exit 1 +fi + +echo ">> Check number of lines in output" +n_lines=$(wc -l < sub_sample.fq) +n_lines=$(echo "$n_lines" | awk '{print $1}') + +if [ "$n_lines" -ne 2 ]; then + echo ">> sub_sample.fq does not contain exactly two lines" + exit 1 +fi + +echo ">> Check content in output" +result=$(sed -n '2p' sub_sample.fq) +expected=$(sed -n '2p' "$meta_resources_dir/test_data/a.1.fastq") +if [ "$result" == "$expected" ]; then + echo "--> content are equal" +else + echo "--> content are not equal" +fi ######################################################################################### -# test sequence line length option -# echo "> Run seqtk_subseq with line length option" -# "$meta_executable" \ -# --sequence_line_length 16 \ -# --input "$meta_resources_dir/test_data/input.fa" \ -# --name_list "$meta_resources_dir/test_data/list.lst" \ -# --output "sub_sampled.fa" +# -- sequence line length option -- +cd .. +mkdir test4 +cd test4 +echo "> Run seqtk_subseq with line length option" +"$meta_executable" \ + --sequence_line_length 10 \ + --input "$meta_resources_dir/test_data/a.1.fastq" \ + --name_list "$meta_resources_dir/test_data/id.list" \ + --output "sub_sample.fq" +echo ">> Check if output exists" +if [ ! 
-f "sub_sample.fq" ]; then + echo ">> sub_sample.fq does not exist" + exit 1 +fi + +echo ">> Check number of lines in output" +n_lines=$(wc -l < sub_sample.fq) +n_lines=$(echo "$n_lines" | awk '{print $1}') + +if [ "$n_lines" -ne 2 ]; then + echo ">> sub_sample.fq does not contain exactly two lines" + exit 1 +fi + +echo ">> Check content in output" +result=$(sed -n '2p' sub_sample.fq) +expected=$(sed -n '2p' "$meta_resources_dir/test_data/a.1.fastq") +if [ "$result" == "$expected" ]; then + echo "--> content are equal" +else + echo "--> content are not equal" +fi diff --git a/src/seqtk/seqtk_subseq/test_data/id2.list b/src/seqtk/seqtk_subseq/test_data/id2.list new file mode 100644 index 00000000..7a754f41 --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/id2.list @@ -0,0 +1,2 @@ +1 +2 \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test_data/ids.txt b/src/seqtk/seqtk_subseq/test_data/ids.txt new file mode 100644 index 00000000..56d77f0e --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/ids.txt @@ -0,0 +1,2 @@ +a 10 20 +c 10 20 \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test_data/input.fasta b/src/seqtk/seqtk_subseq/test_data/input.fasta new file mode 100644 index 00000000..d6554922 --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/input.fasta @@ -0,0 +1,15 @@ +>a +GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCA +AGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA +>b +CTAATTTTATTTTTTTATAATAATTATTGGAGGAACTAAAACATTAATGAAATAATAATTATCATAATTA +TTAATTACATATTTATTAGGTATAATATTTAAGGAAAAATATATTTTATGTTAATTGTAATAATTAGAAC +>c +CGATTTAGATCGGTGTAGTCAACACACATCCTCCACTTCCATTAGGCTTCTTGACGAGGACTACATTGAC +AGCCACCGAGGGAACCGACCTCCTCAATGAAGTCAGACGCCAAGAGCCTATCAACTTCCTTCTGCACAGC +>d +CCTAAACCCTAAACCCTAAACCCCCTACAAACCTTACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA +ACCCGAAACCCTATACCCTAAACCCTAAACCCTAAACCCTAAACCCTAACCCAAACCTAATCCCTAAACC +>e +TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTC 
+AAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG \ No newline at end of file From bd5de0a5cbcbc09d23cdfa4cedc10c88f94d2da6 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Tue, 16 Jul 2024 23:21:44 -0300 Subject: [PATCH 11/28] Update on tests - got unstuck - I need to create a docker image with the latest version of seqtk - --- myout.fa | 0 src/seqtk/seqtk_subseq/config.vsh.yaml | 6 +- src/seqtk/seqtk_subseq/test.sh | 18 +-- src/seqtk/seqtk_subseq/test1.sh | 122 +++++++++++++++++++ src/seqtk/seqtk_subseq/test_data/id.list | 3 +- src/seqtk/seqtk_subseq/test_data/input.fasta | 10 +- src/seqtk/seqtk_subseq/test_data/reg.bed | 3 + 7 files changed, 147 insertions(+), 15 deletions(-) create mode 100644 myout.fa create mode 100644 src/seqtk/seqtk_subseq/test1.sh create mode 100644 src/seqtk/seqtk_subseq/test_data/reg.bed diff --git a/myout.fa b/myout.fa new file mode 100644 index 00000000..e69de29b diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index 99aa92bb..f27311bc 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -59,13 +59,17 @@ resources: path: script.sh test_resources: - type: bash_script - path: test.sh + path: test1.sh - type: file path: test_data engines: - type: docker image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 + setup: + - type: docker + run: | + echo "xxx: \"0.1.0\"" > /var/software_versions.txt runners: - type: executable - type: nextflow diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index ca89fa12..7df736f6 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -42,8 +42,8 @@ else echo "--> content are not equal" fi -######################################################################################### -# -- tab option -- +######################################################################################## +#-- tab option -- cd ..
mkdir test2 cd test2 @@ -51,8 +51,8 @@ cd test2 echo "> Run seqtk_subseq with TAB option" "$meta_executable" \ --tab \ - --input "$meta_resources_dir/test_data/a.1.fastq" \ - --name_list "$meta_resources_dir/test_data/id.list" \ + --input "$meta_resources_dir/test_data/input.fasta" \ + --name_list "$meta_resources_dir/test_data/ids.txt" \ --output "sub_sample.fq" echo ">> Check if output exists" @@ -79,8 +79,10 @@ else echo "--> content are not equal" fi -######################################################################################### -# -- strand aware option -- +cat sub_sample.fq + +######################################################################################## +-- strand aware option -- cd .. mkdir test3 cd test3 @@ -116,8 +118,8 @@ else echo "--> content are not equal" fi -######################################################################################### -# -- sequence line length option -- +######################################################################################## +-- sequence line length option -- cd .. 
mkdir test4 cd test4 diff --git a/src/seqtk/seqtk_subseq/test1.sh b/src/seqtk/seqtk_subseq/test1.sh new file mode 100644 index 00000000..42b9f005 --- /dev/null +++ b/src/seqtk/seqtk_subseq/test1.sh @@ -0,0 +1,122 @@ +#!/bin/bash + +# exit on error +set -e + +## VIASH START +meta_executable="target/executable/seqtk/seqtk_subseq" +meta_resources_dir="src/seqtk" +## VIASH END + +######################################################################################### +# Run basic test +mkdir test1 +cd test1 + +echo "> Run seqtk_subseq on FASTA/Q file" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/input.fasta" \ + --name_list "$meta_resources_dir/test_data/id.list" \ + --output "sub_sample.fq" + +expected_output_basic=">KU562861.1 +GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCAAGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA +>MH150936.1 +TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTCAAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG" +output_basic=$(cat sub_sample.fq) + +if [ "$output_basic" == "$expected_output_basic" ]; then + echo "Basic test passed" +else + echo "Basic test failed" + echo "Expected:" + echo "$expected_output_basic" + echo "Got:" + echo "$output_basic" +fi + +######################################################################################### +# Run reg.bed as name list input test +cd .. +mkdir test2 +cd test2 + +echo "> Run seqtk_subseq on FASTA/Q file with BED file as name list" +"$meta_executable" \ + --input "$meta_resources_dir/test_data/input.fasta" \ + --name_list "$meta_resources_dir/test_data/reg.bed" \ + --output "sub_sample.fq" + +expected_output_basic=">KU562861.1:11-20 +AGTGTTCGAG +>MH150936.1:11-20 +TGAAAACTTT" +output_basic=$(cat sub_sample.fq) + +if [ "$output_basic" == "$expected_output_basic" ]; then + echo "Test passed!" +else + echo "Test failed!" 
+ echo "Expected:" + echo "$expected_output_basic" + echo "Got:" + echo "$output_basic" +fi + +######################################################################################### +# Run tab option output test +cd .. +mkdir test3 +cd test3 + +echo "> Run seqtk_subseq with TAB option" +"$meta_executable" \ + --tab \ + --input "$meta_resources_dir/test_data/input.fasta" \ + --name_list "$meta_resources_dir/test_data/reg.bed" \ + --output "sub_sample.fq" + +expected_output_tabular="KU562861.1\t11\tAGTGTTCGAG +MH150936.1\t11\tTGAAAACTTT" +output_tabular=$(cat sub_sample.fq) + +if [ "$output_tabular" == "$expected_output_tabular" ]; then + echo "Tabular output test passed" +else + echo "Tabular output test failed" + echo "Expected:" + echo "$expected_output_tabular" + echo "Got:" + echo "$output_tabular" +fi + +######################################################################################### +# Run line option output test +cd .. +mkdir test4 +cd test4 + +echo "> Run seqtk_subseq with line length option" +"$meta_executable" \ + --sequence_line_length 5 \ + --input "$meta_resources_dir/test_data/input.fasta" \ + --name_list "$meta_resources_dir/test_data/reg.bed" \ + --output "sub_sample.fq" + +expected_output_wrapped=">KU562861.1:11-20 +AGTGT +TCGAG +>MH150936.1:11-20 +TGAAA +ACTTT" +output_wrapped=$(cat sub_sample.fq) + +if [ "$output_wrapped" == "$expected_output_wrapped" ]; then + echo "Line-wrapped output test passed" +else + echo "Line-wrapped output test failed" + echo "Expected:" + echo "$expected_output_wrapped" + echo "Got:" + echo "$output_wrapped" +fi diff --git a/src/seqtk/seqtk_subseq/test_data/id.list b/src/seqtk/seqtk_subseq/test_data/id.list index d00491fd..3071abfc 100644 --- a/src/seqtk/seqtk_subseq/test_data/id.list +++ b/src/seqtk/seqtk_subseq/test_data/id.list @@ -1 +1,2 @@ -1 +KU562861.1 +MH150936.1 \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test_data/input.fasta b/src/seqtk/seqtk_subseq/test_data/input.fasta index 
d6554922..ea35c1f3 100644 --- a/src/seqtk/seqtk_subseq/test_data/input.fasta +++ b/src/seqtk/seqtk_subseq/test_data/input.fasta @@ -1,15 +1,15 @@ ->a +>KU562861.1 GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCA AGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA ->b +>GU056837.1 CTAATTTTATTTTTTTATAATAATTATTGGAGGAACTAAAACATTAATGAAATAATAATTATCATAATTA TTAATTACATATTTATTAGGTATAATATTTAAGGAAAAATATATTTTATGTTAATTGTAATAATTAGAAC ->c +>CP097510.1 CGATTTAGATCGGTGTAGTCAACACACATCCTCCACTTCCATTAGGCTTCTTGACGAGGACTACATTGAC AGCCACCGAGGGAACCGACCTCCTCAATGAAGTCAGACGCCAAGAGCCTATCAACTTCCTTCTGCACAGC ->d +>JAMFTS010000002.1 CCTAAACCCTAAACCCTAAACCCCCTACAAACCTTACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA ACCCGAAACCCTATACCCTAAACCCTAAACCCTAAACCCTAAACCCTAACCCAAACCTAATCCCTAAACC ->e +>MH150936.1 TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTC AAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test_data/reg.bed b/src/seqtk/seqtk_subseq/test_data/reg.bed new file mode 100644 index 00000000..137ed6c1 --- /dev/null +++ b/src/seqtk/seqtk_subseq/test_data/reg.bed @@ -0,0 +1,3 @@ +KU562861.1 10 20 +MH150936.1 10 20 + From cc6746eaebff4e2b537d3eab865692c3d03ef37c Mon Sep 17 00:00:00 2001 From: tgaspe Date: Wed, 17 Jul 2024 13:57:29 -0300 Subject: [PATCH 12/28] Bug fixed - removed some test files - fixed bug with the help of Toni - added correct software_versions.txt to config Still Needs: - add one more test to strand aware - fix tab test --- myout.fa | 0 src/seqtk/seqtk_subseq/config.vsh.yaml | 8 +- src/seqtk/seqtk_subseq/script.sh | 2 + src/seqtk/seqtk_subseq/test.sh | 168 +++++++++------------ src/seqtk/seqtk_subseq/test1.sh | 122 --------------- src/seqtk/seqtk_subseq/test_data/a.1.fastq | 40 ----- src/seqtk/seqtk_subseq/test_data/a.2.fastq | 40 ----- src/seqtk/seqtk_subseq/test_data/a.fastq | 4 - src/seqtk/seqtk_subseq/test_data/id2.list | 2 - 
src/seqtk/seqtk_subseq/test_data/ids.txt | 2 - 10 files changed, 75 insertions(+), 313 deletions(-) delete mode 100644 myout.fa delete mode 100644 src/seqtk/seqtk_subseq/test1.sh delete mode 100644 src/seqtk/seqtk_subseq/test_data/a.1.fastq delete mode 100644 src/seqtk/seqtk_subseq/test_data/a.2.fastq delete mode 100644 src/seqtk/seqtk_subseq/test_data/a.fastq delete mode 100644 src/seqtk/seqtk_subseq/test_data/id2.list delete mode 100644 src/seqtk/seqtk_subseq/test_data/ids.txt diff --git a/myout.fa b/myout.fa deleted file mode 100644 index e69de29b..00000000 diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index f27311bc..d98cd839 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -59,7 +59,7 @@ resources: path: script.sh test_resources: - type: bash_script - path: test1.sh + path: test.sh - type: file path: test_data @@ -67,9 +67,9 @@ engines: - type: docker image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 setup: - - type: docker - run: | - echo "xxx: \"0.1.0\"" > /var/software_versions.txt ++ - type: docker ++ run: | ++ echo "seqtk version: 1.4 (r122)" > /var/software_versions.txt runners: - type: executable - type: nextflow diff --git a/src/seqtk/seqtk_subseq/script.sh b/src/seqtk/seqtk_subseq/script.sh index 32053316..140663ca 100644 --- a/src/seqtk/seqtk_subseq/script.sh +++ b/src/seqtk/seqtk_subseq/script.sh @@ -2,6 +2,8 @@ ## VIASH START ## VIASH END +[[ "$par_tab" == "false" ]] && unset par_tab +[[ "$par_strand_aware" == "false" ]] && unset par_strand_aware seqtk subseq \ ${par_tab:+-t} \ diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index 7df736f6..5081b40f 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -8,149 +8,119 @@ meta_executable="target/executable/seqtk/seqtk_subseq" meta_resources_dir="src/seqtk" ## VIASH END +# TODO: +# - Fix Tab option test +# - Add strand aware test (create new fasta 
file with right configuration) + ######################################################################################### +# Run basic test mkdir test1 cd test1 echo "> Run seqtk_subseq on FASTA/Q file" "$meta_executable" \ - --input "$meta_resources_dir/test_data/a.1.fastq" \ + --input "$meta_resources_dir/test_data/input.fasta" \ --name_list "$meta_resources_dir/test_data/id.list" \ --output "sub_sample.fq" -echo ">> Check if output exists" -if [ ! -f "sub_sample.fq" ]; then - echo ">> sub_sample.fq does not exist" - exit 1 -fi - -echo ">> Check number of lines in output" -n_lines=$(wc -l < sub_sample.fq) -n_lines=$(echo "$n_lines" | awk '{print $1}') +expected_output_basic=">KU562861.1 +GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCAAGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA +>MH150936.1 +TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTCAAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG" +output_basic=$(cat sub_sample.fq) -if [ "$n_lines" -ne 2 ]; then - echo ">> sub_sample.fq does not contain exactly two lines" - exit 1 -fi - -echo ">> Check content in output" -result=$(sed -n '2p' sub_sample.fq) -expected=$(sed -n '2p' "$meta_resources_dir/test_data/a.1.fastq") -if [ "$result" == "$expected" ]; then - echo "--> content are equal" +if [ "$output_basic" == "$expected_output_basic" ]; then + echo "Basic test passed" else - echo "--> content are not equal" + echo "Basic test failed" + echo "Expected:" + echo "$expected_output_basic" + echo "Got:" + echo "$output_basic" fi -######################################################################################## -#-- tab option -- +######################################################################################### +# Run reg.bed as name list input test cd .. 
mkdir test2 cd test2 -echo "> Run seqtk_subseq with TAB option" +echo "> Run seqtk_subseq on FASTA/Q file with BED file as name list" "$meta_executable" \ - --tab \ --input "$meta_resources_dir/test_data/input.fasta" \ - --name_list "$meta_resources_dir/test_data/ids.txt" \ + --name_list "$meta_resources_dir/test_data/reg.bed" \ --output "sub_sample.fq" -echo ">> Check if output exists" -if [ ! -f "sub_sample.fq" ]; then - echo ">> sub_sample.fq does not exist" - exit 1 -fi - -echo ">> Check number of lines in output" -n_lines=$(wc -l < sub_sample.fq) -n_lines=$(echo "$n_lines" | awk '{print $1}') +expected_output_basic=">KU562861.1:11-20 +AGTGTTCGAG +>MH150936.1:11-20 +TGAAAACTTT" +output_basic=$(cat sub_sample.fq) -if [ "$n_lines" -ne 2 ]; then - echo ">> sub_sample.fq does not contain exactly two lines" - exit 1 -fi - -echo ">> Check content in output" -result=$(sed -n '2p' sub_sample.fq) -expected=$(sed -n '2p' "$meta_resources_dir/test_data/a.1.fastq") -if [ "$result" == "$expected" ]; then - echo "--> content are equal" +if [ "$output_basic" == "$expected_output_basic" ]; then + echo "Test passed!" else - echo "--> content are not equal" + echo "Test failed!" + echo "Expected:" + echo "$expected_output_basic" + echo "Got:" + echo "$output_basic" fi -cat sub_sample.fq - -######################################################################################## --- strand aware option -- +######################################################################################### +# Run tab option output test cd .. mkdir test3 cd test3 -echo "> Run seqtk_subseq with Strand Aware option" +echo "> Run seqtk_subseq with TAB option" "$meta_executable" \ - --strand_aware \ - --input "$meta_resources_dir/test_data/a.1.fastq" \ - --name_list "$meta_resources_dir/test_data/id.list" \ + --tab \ + --input "$meta_resources_dir/test_data/input.fasta" \ + --name_list "$meta_resources_dir/test_data/reg.bed" \ --output "sub_sample.fq" -echo ">> Check if output exists" -if [ ! 
-f "sub_sample.fq" ]; then - echo ">> sub_sample.fq does not exist" - exit 1 -fi - -echo ">> Check number of lines in output" -n_lines=$(wc -l < sub_sample.fq) -n_lines=$(echo "$n_lines" | awk '{print $1}') - -if [ "$n_lines" -ne 2 ]; then - echo ">> sub_sample.fq does not contain exactly two lines" - exit 1 -fi +expected_output_tabular=$(echo "KU562861.1\t11\tAGTGTTCGAG +MH150936.1\t11\tTGAAAACTTT") +output_tabular=$(cat sub_sample.fq) -echo ">> Check content in output" -result=$(sed -n '2p' sub_sample.fq) -expected=$(sed -n '2p' "$meta_resources_dir/test_data/a.1.fastq") -if [ "$result" == "$expected" ]; then - echo "--> content are equal" +if [ "$output_tabular" == "$expected_output_tabular" ]; then + echo "Tabular output test passed" else - echo "--> content are not equal" + echo "Tabular output test failed" + echo "Expected:" + echo "$expected_output_tabular" + echo "Got:" + echo "$output_tabular" fi -######################################################################################## --- sequence line length option -- +######################################################################################### +# Run line option output test cd .. mkdir test4 cd test4 echo "> Run seqtk_subseq with line length option" "$meta_executable" \ - --sequence_line_length 10 \ - --input "$meta_resources_dir/test_data/a.1.fastq" \ - --name_list "$meta_resources_dir/test_data/id.list" \ + --sequence_line_length 5 \ + --input "$meta_resources_dir/test_data/input.fasta" \ + --name_list "$meta_resources_dir/test_data/reg.bed" \ --output "sub_sample.fq" -echo ">> Check if output exists" -if [ ! 
-f "sub_sample.fq" ]; then - echo ">> sub_sample.fq does not exist" - exit 1 -fi - -echo ">> Check number of lines in output" -n_lines=$(wc -l < sub_sample.fq) -n_lines=$(echo "$n_lines" | awk '{print $1}') - -if [ "$n_lines" -ne 2 ]; then - echo ">> sub_sample.fq does not contain exactly two lines" - exit 1 -fi +expected_output_wrapped=">KU562861.1:11-20 +AGTGT +TCGAG +>MH150936.1:11-20 +TGAAA +ACTTT" +output_wrapped=$(cat sub_sample.fq) -echo ">> Check content in output" -result=$(sed -n '2p' sub_sample.fq) -expected=$(sed -n '2p' "$meta_resources_dir/test_data/a.1.fastq") -if [ "$result" == "$expected" ]; then - echo "--> content are equal" +if [ "$output_wrapped" == "$expected_output_wrapped" ]; then + echo "Line-wrapped output test passed" else - echo "--> content are not equal" + echo "Line-wrapped output test failed" + echo "Expected:" + echo "$expected_output_wrapped" + echo "Got:" + echo "$output_wrapped" fi diff --git a/src/seqtk/seqtk_subseq/test1.sh b/src/seqtk/seqtk_subseq/test1.sh deleted file mode 100644 index 42b9f005..00000000 --- a/src/seqtk/seqtk_subseq/test1.sh +++ /dev/null @@ -1,122 +0,0 @@ -#!/bin/bash - -# exit on error -set -e - -## VIASH START -meta_executable="target/executable/seqtk/seqtk_subseq" -meta_resources_dir="src/seqtk" -## VIASH END - -######################################################################################### -# Run basic test -mkdir test1 -cd test1 - -echo "> Run seqtk_subseq on FASTA/Q file" -"$meta_executable" \ - --input "$meta_resources_dir/test_data/input.fasta" \ - --name_list "$meta_resources_dir/test_data/id.list" \ - --output "sub_sample.fq" - -expected_output_basic=">KU562861.1 -GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCAAGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA ->MH150936.1 -TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTCAAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG" -output_basic=$(cat 
sub_sample.fq) - -if [ "$output_basic" == "$expected_output_basic" ]; then - echo "Basic test passed" -else - echo "Basic test failed" - echo "Expected:" - echo "$expected_output_basic" - echo "Got:" - echo "$output_basic" -fi - -######################################################################################### -# Run reg.bed as name list input test -cd .. -mkdir test2 -cd test2 - -echo "> Run seqtk_subseq on FASTA/Q file with BED file as name list" -"$meta_executable" \ - --input "$meta_resources_dir/test_data/input.fasta" \ - --name_list "$meta_resources_dir/test_data/reg.bed" \ - --output "sub_sample.fq" - -expected_output_basic=">KU562861.1:11-20 -AGTGTTCGAG ->MH150936.1:11-20 -TGAAAACTTT" -output_basic=$(cat sub_sample.fq) - -if [ "$output_basic" == "$expected_output_basic" ]; then - echo "Test passed!" -else - echo "Test failed!" - echo "Expected:" - echo "$expected_output_basic" - echo "Got:" - echo "$output_basic" -fi - -######################################################################################### -# Run tab option output test -cd .. -mkdir test3 -cd test3 - -echo "> Run seqtk_subseq with TAB option" -"$meta_executable" \ - --tab \ - --input "$meta_resources_dir/test_data/input.fasta" \ - --name_list "$meta_resources_dir/test_data/reg.bed" \ - --output "sub_sample.fq" - -expected_output_tabular="KU562861.1\t11\tAGTGTTCGAG -MH150936.1\t11\tTGAAAACTTT" -output_tabular=$(cat sub_sample.fq) - -if [ "$output_tabular" == "$expected_output_tabular" ]; then - echo "Tabular output test passed" -else - echo "Tabular output test failed" - echo "Expected:" - echo "$expected_output_tabular" - echo "Got:" - echo "$output_tabular" -fi - -######################################################################################### -# Run line option output test -cd .. 
-mkdir test4 -cd test4 - -echo "> Run seqtk_subseq with line length option" -"$meta_executable" \ - --sequence_line_length 5 \ - --input "$meta_resources_dir/test_data/input.fasta" \ - --name_list "$meta_resources_dir/test_data/reg.bed" \ - --output "sub_sample.fq" - -expected_output_wrapped=">KU562861.1:11-20 -AGTGT -TCGAG ->MH150936.1:11-20 -TGAAA -ACTTT" -output_wrapped=$(cat sub_sample.fq) - -if [ "$output_wrapped" == "$expected_output_wrapped" ]; then - echo "Line-wrapped output test passed" -else - echo "Line-wrapped output test failed" - echo "Expected:" - echo "$expected_output_wrapped" - echo "Got:" - echo "$output_wrapped" -fi diff --git a/src/seqtk/seqtk_subseq/test_data/a.1.fastq b/src/seqtk/seqtk_subseq/test_data/a.1.fastq deleted file mode 100644 index 458b2a9c..00000000 --- a/src/seqtk/seqtk_subseq/test_data/a.1.fastq +++ /dev/null @@ -1,40 +0,0 @@ -@1 1 -A1_1 -+1_1 -!1_1 -@2 1 -A2_1 -+2_1 -!2_1 -@3 1 -A3_1 -+3_1 -!3_1 -@4 1 -A4_1 -+4_1 -!4_1 -@5 1 -A5_1 -+5_1 -!5_1 -@6 1 -A6_1 -+6_1 -!6_1 -@7 1 -A7_1 -+7_1 -!7_1 -@8 1 -A8_1 -+8_1 -!8_1 -@9 1 -A9_1 -+9_1 -!9_1 -@10 1 -A10_1 -+10_1 -!10_1 diff --git a/src/seqtk/seqtk_subseq/test_data/a.2.fastq b/src/seqtk/seqtk_subseq/test_data/a.2.fastq deleted file mode 100644 index 50d3ce80..00000000 --- a/src/seqtk/seqtk_subseq/test_data/a.2.fastq +++ /dev/null @@ -1,40 +0,0 @@ -@1 2 -A1_2 -+1_2 -!1_2 -@2 2 -A2_2 -+2_2 -!2_2 -@3 2 -A3_2 -+3_2 -!3_2 -@4 2 -A4_2 -+4_2 -!4_2 -@5 2 -A5_2 -+5_2 -!5_2 -@6 2 -A6_2 -+6_2 -!6_2 -@7 2 -A7_2 -+7_2 -!7_2 -@8 2 -A8_2 -+8_2 -!8_2 -@9 2 -A9_2 -+9_2 -!9_2 -@10 2 -A10_2 -+10_2 -!10_2 diff --git a/src/seqtk/seqtk_subseq/test_data/a.fastq b/src/seqtk/seqtk_subseq/test_data/a.fastq deleted file mode 100644 index 42735560..00000000 --- a/src/seqtk/seqtk_subseq/test_data/a.fastq +++ /dev/null @@ -1,4 +0,0 @@ -@1 -ACGGCAT -+ -!!!!!!! 
diff --git a/src/seqtk/seqtk_subseq/test_data/id2.list b/src/seqtk/seqtk_subseq/test_data/id2.list deleted file mode 100644 index 7a754f41..00000000 --- a/src/seqtk/seqtk_subseq/test_data/id2.list +++ /dev/null @@ -1,2 +0,0 @@ -1 -2 \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test_data/ids.txt b/src/seqtk/seqtk_subseq/test_data/ids.txt deleted file mode 100644 index 56d77f0e..00000000 --- a/src/seqtk/seqtk_subseq/test_data/ids.txt +++ /dev/null @@ -1,2 +0,0 @@ -a 10 20 -c 10 20 \ No newline at end of file From c782e2a5504e8a4f960526cac9a569646ce27d83 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Wed, 17 Jul 2024 14:11:42 -0300 Subject: [PATCH 13/28] Update CHANGELOG.md --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0c88fec1..8f54dcd0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -80,7 +80,9 @@ - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). -* `seqtk subseq`: (PR #) +* `seqtk`: + - `subseq`: extract the sequences (complete or subsequence) from the FASTA/FASTQ files + based on provided sequence IDs or region coordinates file (PR #85). 
## MINOR CHANGES From 06e1fe881d642ab4ede0462bfede7eaf75afdf20 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Wed, 17 Jul 2024 14:39:18 -0300 Subject: [PATCH 14/28] Fixed Tabular test bug --- src/seqtk/seqtk_subseq/config.vsh.yaml | 7 ++-- src/seqtk/seqtk_subseq/test.sh | 50 ++++++++++++++++++++------ 2 files changed, 44 insertions(+), 13 deletions(-) diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index d98cd839..a732af2b 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -67,9 +67,10 @@ engines: - type: docker image: quay.io/biocontainers/seqtk:1.4--he4a0461_2 setup: -+ - type: docker -+ run: | -+ echo "seqtk version: 1.4 (r122)" > /var/software_versions.txt + - type: docker + run: | + echo "seqtk version: 1.4 (r122)" > /var/software_versions.txt + runners: - type: executable - type: nextflow diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index 5081b40f..0959db36 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -30,9 +30,9 @@ TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTCAAAAGCAATC output_basic=$(cat sub_sample.fq) if [ "$output_basic" == "$expected_output_basic" ]; then - echo "Basic test passed" + echo "Test passed" else - echo "Basic test failed" + echo "Test failed" echo "Expected:" echo "$expected_output_basic" echo "Got:" @@ -58,9 +58,9 @@ TGAAAACTTT" output_basic=$(cat sub_sample.fq) if [ "$output_basic" == "$expected_output_basic" ]; then - echo "Test passed!" + echo "Test passed" else - echo "Test failed!" 
+ echo "Test failed" echo "Expected:" echo "$expected_output_basic" echo "Got:" @@ -80,14 +80,13 @@ echo "> Run seqtk_subseq with TAB option" --name_list "$meta_resources_dir/test_data/reg.bed" \ --output "sub_sample.fq" -expected_output_tabular=$(echo "KU562861.1\t11\tAGTGTTCGAG -MH150936.1\t11\tTGAAAACTTT") +expected_output_tabular=$'KU562861.1\t11\tAGTGTTCGAG\nMH150936.1\t11\tTGAAAACTTT' output_tabular=$(cat sub_sample.fq) if [ "$output_tabular" == "$expected_output_tabular" ]; then - echo "Tabular output test passed" + echo "Test passed" else - echo "Tabular output test failed" + echo "Test failed" echo "Expected:" echo "$expected_output_tabular" echo "Got:" @@ -116,11 +115,42 @@ ACTTT" output_wrapped=$(cat sub_sample.fq) if [ "$output_wrapped" == "$expected_output_wrapped" ]; then - echo "Line-wrapped output test passed" + echo "Test passed" else - echo "Line-wrapped output test failed" + echo "Test failed" echo "Expected:" echo "$expected_output_wrapped" echo "Got:" echo "$output_wrapped" fi + +######################################################################################### +# Run Strand Aware option output test +cd .. 
+mkdir test5 +cd test5 + +echo "> Run seqtk_subseq with strand aware option" +"$meta_executable" \ + --strand_aware \ + --input "$meta_resources_dir/test_data/input.fasta" \ + --name_list "$meta_resources_dir/test_data/reg.bed" \ + --output "sub_sample.fq" + +# expected_output_wrapped=">KU562861.1:11-20 +# AGTGT +# TCGAG +# >MH150936.1:11-20 +# TGAAA +# ACTTT" +# output_wrapped=$(cat sub_sample.fq) + +# if [ "$output_wrapped" == "$expected_output_wrapped" ]; then +# echo "Line-wrapped output test passed" +# else +# echo "Line-wrapped output test failed" +# echo "Expected:" +# echo "$expected_output_wrapped" +# echo "Got:" +# echo "$output_wrapped" +# fi \ No newline at end of file From 20ac10a9a13a852a77f94c165c62df867308b03d Mon Sep 17 00:00:00 2001 From: tgaspe Date: Wed, 17 Jul 2024 15:33:37 -0300 Subject: [PATCH 15/28] Strand Aware Test - implementation of strand aware test - change of format for reg.bed file --- CHANGELOG.md | 4 +-- src/seqtk/seqtk_subseq/script.sh | 2 ++ src/seqtk/seqtk_subseq/test.sh | 32 +++++++++++------------- src/seqtk/seqtk_subseq/test_data/reg.bed | 4 +-- 4 files changed, 21 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8f54dcd0..d80f18f3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -81,8 +81,8 @@ intervals defined in a BED/GFF/VCF file (PR #59). * `seqtk`: - - `subseq`: extract the sequences (complete or subsequence) from the FASTA/FASTQ files - based on provided sequence IDs or region coordinates file (PR #85). + - `subseq`: Extract the sequences (complete or subsequence) from the FASTA/FASTQ files + based on a provided sequence IDs or region coordinates file (PR #85). 
## MINOR CHANGES diff --git a/src/seqtk/seqtk_subseq/script.sh b/src/seqtk/seqtk_subseq/script.sh index 140663ca..3abf5f2d 100644 --- a/src/seqtk/seqtk_subseq/script.sh +++ b/src/seqtk/seqtk_subseq/script.sh @@ -5,6 +5,8 @@ [[ "$par_tab" == "false" ]] && unset par_tab [[ "$par_strand_aware" == "false" ]] && unset par_strand_aware +# Todo: if a bed file is given, check if format is valid! + - should be in six column + seqtk subseq \ ${par_tab:+-t} \ ${par_strand_aware:+-s} \ diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index 0959db36..95c8a19f 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -137,20 +137,18 @@ echo "> Run seqtk_subseq with strand aware option" --name_list "$meta_resources_dir/test_data/reg.bed" \ --output "sub_sample.fq" -# expected_output_wrapped=">KU562861.1:11-20 -# AGTGT -# TCGAG -# >MH150936.1:11-20 -# TGAAA -# ACTTT" -# output_wrapped=$(cat sub_sample.fq) - -# if [ "$output_wrapped" == "$expected_output_wrapped" ]; then -# echo "Line-wrapped output test passed" -# else -# echo "Line-wrapped output test failed" -# echo "Expected:" -# echo "$expected_output_wrapped" -# echo "Got:" -# echo "$output_wrapped" -# fi \ No newline at end of file +expected_output_wrapped=">KU562861.1:11-20 +AGTGTTCGAG +>MH150936.1:11-20 +AAAGTTTTCA" +output_wrapped=$(cat sub_sample.fq) + +if [ "$output_wrapped" == "$expected_output_wrapped" ]; then + echo "Test passed" +else + echo "Test failed" + echo "Expected:" + echo "$expected_output_wrapped" + echo "Got:" + echo "$output_wrapped" +fi \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test_data/reg.bed b/src/seqtk/seqtk_subseq/test_data/reg.bed index 137ed6c1..af33d0a5 100644 --- a/src/seqtk/seqtk_subseq/test_data/reg.bed +++ b/src/seqtk/seqtk_subseq/test_data/reg.bed @@ -1,3 +1,3 @@ -KU562861.1 10 20 -MH150936.1 10 20 +KU562861.1 10 20 region 0 + +MH150936.1 10 20 region 0 - From 0059ede599f2dc8a0995789a532bf6e79bd4f6ec Mon Sep 17 
00:00:00 2001 From: tgaspe Date: Wed, 17 Jul 2024 17:11:24 -0300 Subject: [PATCH 16/28] Input validation for list file - input validation for name_list parameter --- src/seqtk/seqtk_subseq/script.sh | 79 +++++++++++++++++++++- src/seqtk/seqtk_subseq/test_data/script.sh | 13 ---- 2 files changed, 77 insertions(+), 15 deletions(-) delete mode 100644 src/seqtk/seqtk_subseq/test_data/script.sh diff --git a/src/seqtk/seqtk_subseq/script.sh b/src/seqtk/seqtk_subseq/script.sh index 3abf5f2d..d07e1a02 100644 --- a/src/seqtk/seqtk_subseq/script.sh +++ b/src/seqtk/seqtk_subseq/script.sh @@ -2,11 +2,86 @@ ## VIASH START ## VIASH END + +# Function to check if a file is a valid BED file +check_bed_file() { + local file="$1" + num_columns=$(head -n 1 "$file" | cut -f 1- | tr '\t' '\n' | wc -l) + + # Check if the file exists + if [[ ! -f "$file" ]]; then + echo "Error: The specified file does not exist." + return 1 + fi + + # Check if the file is non-empty + if [[ ! -s "$file" ]]; then + echo "Error: The specified file is empty." + return 1 + fi + + # Check if the file is a valid BED file (minimum 3 tab-separated columns) + if [[ $num_columns -lt 3 ]]; then + echo "The specified file is not a valid BED file. It should have at least three tab-separated columns." + return 1 + fi + + # Additional check: Ensure that the 6th column (if present) is either + or - + if [[ $num_columns -eq 6 ]]; then + while IFS=$'\t' read -r -a columns; do + columns[5]=${columns[5]%$'\n'} + if [[ ${columns[5]} == "+" || ${columns[5]} == "-" ]]; then + return 0 + else + echo "Error: The 6th column of the specified BED file should be either + or -." + echo "Offending line: ${columns[5]}" + return 1 + fi + done < "$file" + fi + + return 0 +} + +# Function to check if a file is a valid list of FASTA file IDs +check_fasta_id_list() { + local file="$1" + + # Check if the file exists + if [[ ! -f "$file" ]]; then + echo "Error: The specified file does not exist." 
+ return 1 + fi + + # Check if the file is non-empty + if [[ ! -s "$file" ]]; then + echo "Error: The specified file is empty." + return 1 + fi + + # Additional check: Ensure that each line contains only one word (FASTA ID) + if ! awk 'NF != 1 { exit 1 }' "$file"; then + return 1 + fi + + return 0 +} + +# Check if the par_name_list is given and validate accordingly +if [[ -n "$par_name_list" ]]; then + if check_fasta_id_list "$par_name_list"; then + echo "The specified file is a valid list of FASTA IDs." + elif check_bed_file "$par_name_list"; then + echo "The specified file is a valid BED file." + else + echo "Error: The specified file is neither a valid BED file nor a valid list of FASTA IDs." + exit 1 + fi +fi + [[ "$par_tab" == "false" ]] && unset par_tab [[ "$par_strand_aware" == "false" ]] && unset par_strand_aware -# Todo: if a bed file is given, check if format is valid! + - should be in six column - seqtk subseq \ ${par_tab:+-t} \ ${par_strand_aware:+-s} \ diff --git a/src/seqtk/seqtk_subseq/test_data/script.sh b/src/seqtk/seqtk_subseq/test_data/script.sh deleted file mode 100644 index b2d825f5..00000000 --- a/src/seqtk/seqtk_subseq/test_data/script.sh +++ /dev/null @@ -1,13 +0,0 @@ -# clone repo -if [ ! 
-d /tmp/snakemake-wrappers ]; then - git clone --depth 1 --single-branch --branch master https://github.com/snakemake/snakemake-wrappers /tmp/snakemake-wrappers -fi - -# copy test data -cp -r /tmp/snakemake-wrappers/bio/seqtk/test/reads/* src/seqtk/seqtk_subseq/test_data - -# remove a.fastq file -rm src/seqtk/seqtk_subseq/test_data/a.fastq - -# unzip fastq files -gunzip src/seqtk/seqtk_subseq/test_data/*.gz From aab5679b3bbc21ec9bcb8af6cce77d72414d87ac Mon Sep 17 00:00:00 2001 From: tgaspe Date: Wed, 17 Jul 2024 20:15:53 -0300 Subject: [PATCH 17/28] Sugested Changes - removed test_data dir - removed input validation --- src/seqtk/seqtk_subseq/config.vsh.yaml | 2 - src/seqtk/seqtk_subseq/script.sh | 76 -------------------- src/seqtk/seqtk_subseq/test.sh | 62 ++++++++++++---- src/seqtk/seqtk_subseq/test_data/id.list | 2 - src/seqtk/seqtk_subseq/test_data/input.fasta | 15 ---- src/seqtk/seqtk_subseq/test_data/reg.bed | 3 - 6 files changed, 48 insertions(+), 112 deletions(-) delete mode 100644 src/seqtk/seqtk_subseq/test_data/id.list delete mode 100644 src/seqtk/seqtk_subseq/test_data/input.fasta delete mode 100644 src/seqtk/seqtk_subseq/test_data/reg.bed diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index a732af2b..d11915d4 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -60,8 +60,6 @@ resources: test_resources: - type: bash_script path: test.sh - - type: file - path: test_data engines: - type: docker diff --git a/src/seqtk/seqtk_subseq/script.sh b/src/seqtk/seqtk_subseq/script.sh index d07e1a02..7073937f 100644 --- a/src/seqtk/seqtk_subseq/script.sh +++ b/src/seqtk/seqtk_subseq/script.sh @@ -3,82 +3,6 @@ ## VIASH START ## VIASH END -# Function to check if a file is a valid BED file -check_bed_file() { - local file="$1" - num_columns=$(head -n 1 "$file" | cut -f 1- | tr '\t' '\n' | wc -l) - - # Check if the file exists - if [[ ! 
-f "$file" ]]; then - echo "Error: The specified file does not exist." - return 1 - fi - - # Check if the file is non-empty - if [[ ! -s "$file" ]]; then - echo "Error: The specified file is empty." - return 1 - fi - - # Check if the file is a valid BED file (minimum 3 tab-separated columns) - if [[ $num_columns -lt 3 ]]; then - echo "The specified file is not a valid BED file. It should have at least three tab-separated columns." - return 1 - fi - - # Additional check: Ensure that the 6th column (if present) is either + or - - if [[ $num_columns -eq 6 ]]; then - while IFS=$'\t' read -r -a columns; do - columns[5]=${columns[5]%$'\n'} - if [[ ${columns[5]} == "+" || ${columns[5]} == "-" ]]; then - return 0 - else - echo "Error: The 6th column of the specified BED file should be either + or -." - echo "Offending line: ${columns[5]}" - return 1 - fi - done < "$file" - fi - - return 0 -} - -# Function to check if a file is a valid list of FASTA file IDs -check_fasta_id_list() { - local file="$1" - - # Check if the file exists - if [[ ! -f "$file" ]]; then - echo "Error: The specified file does not exist." - return 1 - fi - - # Check if the file is non-empty - if [[ ! -s "$file" ]]; then - echo "Error: The specified file is empty." - return 1 - fi - - # Additional check: Ensure that each line contains only one word (FASTA ID) - if ! awk 'NF != 1 { exit 1 }' "$file"; then - return 1 - fi - - return 0 -} - -# Check if the par_name_list is given and validate accordingly -if [[ -n "$par_name_list" ]]; then - if check_fasta_id_list "$par_name_list"; then - echo "The specified file is a valid list of FASTA IDs." - elif check_bed_file "$par_name_list"; then - echo "The specified file is a valid BED file." - else - echo "Error: The specified file is neither a valid BED file nor a valid list of FASTA IDs." 
- exit 1 - fi -fi - [[ "$par_tab" == "false" ]] && unset par_tab [[ "$par_strand_aware" == "false" ]] && unset par_strand_aware diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index 95c8a19f..34a07d89 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -8,10 +8,44 @@ meta_executable="target/executable/seqtk/seqtk_subseq" meta_resources_dir="src/seqtk" ## VIASH END -# TODO: -# - Fix Tab option test -# - Add strand aware test (create new fasta file with right configuration) - +# Create directories for tests +echo "Creating Test Data..." +mkdir test_data + +# Create and populate input.fasta +cat < "test_data/input.fasta" +>KU562861.1 +GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCA +AGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA +>GU056837.1 +CTAATTTTATTTTTTTATAATAATTATTGGAGGAACTAAAACATTAATGAAATAATAATTATCATAATTA +TTAATTACATATTTATTAGGTATAATATTTAAGGAAAAATATATTTTATGTTAATTGTAATAATTAGAAC +>CP097510.1 +CGATTTAGATCGGTGTAGTCAACACACATCCTCCACTTCCATTAGGCTTCTTGACGAGGACTACATTGAC +AGCCACCGAGGGAACCGACCTCCTCAATGAAGTCAGACGCCAAGAGCCTATCAACTTCCTTCTGCACAGC +>JAMFTS010000002.1 +CCTAAACCCTAAACCCTAAACCCCCTACAAACCTTACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA +ACCCGAAACCCTATACCCTAAACCCTAAACCCTAAACCCTAAACCCTAACCCAAACCTAATCCCTAAACC +>MH150936.1 +TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTC +AAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG +EOL + +# Update id.list with new entries +cat < "test_data/id.list" +KU562861.1 +MH150936.1 +EOL + +# Create and populate reg.bed +cat < "test_data/reg.bed" +KU562861.1$(echo -e "\t")10$(echo -e "\t")20$(echo -e "\t")region$(echo -e "\t")0$(echo -e "\t")+$(echo -e "\n") +MH150936.1$(echo -e "\t")10$(echo -e "\t")20$(echo -e "\t")region$(echo -e "\t")0$(echo -e "\t")- +EOL + +cd test_data +cat reg.bed +cd .. 
######################################################################################### # Run basic test mkdir test1 @@ -19,8 +53,8 @@ cd test1 echo "> Run seqtk_subseq on FASTA/Q file" "$meta_executable" \ - --input "$meta_resources_dir/test_data/input.fasta" \ - --name_list "$meta_resources_dir/test_data/id.list" \ + --input "../test_data/input.fasta" \ + --name_list "../test_data/id.list" \ --output "sub_sample.fq" expected_output_basic=">KU562861.1 @@ -47,8 +81,8 @@ cd test2 echo "> Run seqtk_subseq on FASTA/Q file with BED file as name list" "$meta_executable" \ - --input "$meta_resources_dir/test_data/input.fasta" \ - --name_list "$meta_resources_dir/test_data/reg.bed" \ + --input "../test_data/input.fasta" \ + --name_list "../test_data/reg.bed" \ --output "sub_sample.fq" expected_output_basic=">KU562861.1:11-20 @@ -76,8 +110,8 @@ cd test3 echo "> Run seqtk_subseq with TAB option" "$meta_executable" \ --tab \ - --input "$meta_resources_dir/test_data/input.fasta" \ - --name_list "$meta_resources_dir/test_data/reg.bed" \ + --input "../test_data/input.fasta" \ + --name_list "../test_data/reg.bed" \ --output "sub_sample.fq" expected_output_tabular=$'KU562861.1\t11\tAGTGTTCGAG\nMH150936.1\t11\tTGAAAACTTT' @@ -102,8 +136,8 @@ cd test4 echo "> Run seqtk_subseq with line length option" "$meta_executable" \ --sequence_line_length 5 \ - --input "$meta_resources_dir/test_data/input.fasta" \ - --name_list "$meta_resources_dir/test_data/reg.bed" \ + --input "../test_data/input.fasta" \ + --name_list "../test_data/reg.bed" \ --output "sub_sample.fq" expected_output_wrapped=">KU562861.1:11-20 @@ -133,8 +167,8 @@ cd test5 echo "> Run seqtk_subseq with strand aware option" "$meta_executable" \ --strand_aware \ - --input "$meta_resources_dir/test_data/input.fasta" \ - --name_list "$meta_resources_dir/test_data/reg.bed" \ + --input "../test_data/input.fasta" \ + --name_list "../test_data/reg.bed" \ --output "sub_sample.fq" expected_output_wrapped=">KU562861.1:11-20 diff --git 
a/src/seqtk/seqtk_subseq/test_data/id.list b/src/seqtk/seqtk_subseq/test_data/id.list deleted file mode 100644 index 3071abfc..00000000 --- a/src/seqtk/seqtk_subseq/test_data/id.list +++ /dev/null @@ -1,2 +0,0 @@ -KU562861.1 -MH150936.1 \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test_data/input.fasta b/src/seqtk/seqtk_subseq/test_data/input.fasta deleted file mode 100644 index ea35c1f3..00000000 --- a/src/seqtk/seqtk_subseq/test_data/input.fasta +++ /dev/null @@ -1,15 +0,0 @@ ->KU562861.1 -GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCA -AGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA ->GU056837.1 -CTAATTTTATTTTTTTATAATAATTATTGGAGGAACTAAAACATTAATGAAATAATAATTATCATAATTA -TTAATTACATATTTATTAGGTATAATATTTAAGGAAAAATATATTTTATGTTAATTGTAATAATTAGAAC ->CP097510.1 -CGATTTAGATCGGTGTAGTCAACACACATCCTCCACTTCCATTAGGCTTCTTGACGAGGACTACATTGAC -AGCCACCGAGGGAACCGACCTCCTCAATGAAGTCAGACGCCAAGAGCCTATCAACTTCCTTCTGCACAGC ->JAMFTS010000002.1 -CCTAAACCCTAAACCCTAAACCCCCTACAAACCTTACCCTAAACCCTAAACCCTAAACCCTAAACCCTAA -ACCCGAAACCCTATACCCTAAACCCTAAACCCTAAACCCTAAACCCTAACCCAAACCTAATCCCTAAACC ->MH150936.1 -TAGAAGCTAATGAAAACTTTTCCTTTACTAAAAACCGTCAAACACGGTAAGAAACGCTTTTAATCATTTC -AAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/test_data/reg.bed b/src/seqtk/seqtk_subseq/test_data/reg.bed deleted file mode 100644 index af33d0a5..00000000 --- a/src/seqtk/seqtk_subseq/test_data/reg.bed +++ /dev/null @@ -1,3 +0,0 @@ -KU562861.1 10 20 region 0 + -MH150936.1 10 20 region 0 - - From 32c084df8ec16116b31cb0f39b2d4ea86c010bfb Mon Sep 17 00:00:00 2001 From: tgaspe Date: Wed, 17 Jul 2024 20:25:49 -0300 Subject: [PATCH 18/28] Added author info --- src/_authors/theodoro_gasperin.yaml | 10 ++++++++++ src/seqtk/seqtk_subseq/config.vsh.yaml | 3 +++ src/seqtk/seqtk_subseq/test.sh | 3 --- 3 files changed, 13 insertions(+), 3 deletions(-) create mode 100644 
src/_authors/theodoro_gasperin.yaml diff --git a/src/_authors/theodoro_gasperin.yaml b/src/_authors/theodoro_gasperin.yaml new file mode 100644 index 00000000..899ce180 --- /dev/null +++ b/src/_authors/theodoro_gasperin.yaml @@ -0,0 +1,10 @@ +name: Theodoro Gasperin Terra Camargo +info: + links: + email: theodorogtc@gmail.com + github: tgaspe + linkedin: theodoro-gasperin-terra-camargo + organizations: + - name: Data Intuitive + href: https://www.data-intuitive.com + role: Bioinformatician \ No newline at end of file diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index d11915d4..2f19f05b 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -6,6 +6,9 @@ keywords: [subseq, FASTA, FASTQ] links: repository: https://github.com/lh3/seqtk/tree/v1.4 license: MIT +authors: + - __merge__: /src/_authors/theodoro_gasperin.yaml + roles: [ author, maintainer ] argument_groups: - name: Inputs diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index 34a07d89..e13da48d 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -43,9 +43,6 @@ KU562861.1$(echo -e "\t")10$(echo -e "\t")20$(echo -e "\t")region$(echo -e "\t") MH150936.1$(echo -e "\t")10$(echo -e "\t")20$(echo -e "\t")region$(echo -e "\t")0$(echo -e "\t")- EOL -cd test_data -cat reg.bed -cd .. ######################################################################################### # Run basic test mkdir test1 From a84277769718535c91ad6ab3122f0361a979ae30 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 18 Jul 2024 07:54:35 +0200 Subject: [PATCH 19/28] Update CHANGELOG.md --- CHANGELOG.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8056c024..d6256e72 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,11 +95,8 @@ - `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTQ (PR #52). 
- `samtools/samtools_fastq`: Converts a SAM/BAM/CRAM file to FASTA (PR #53). - * `falco`: A C++ drop-in replacement of FastQC to assess the quality of sequence read data (PR #43). -* `seqtk/seqtk_sample`: Sample sequences from FASTA/Q(.gz) files to FASTA/Q (PR #68). - * `bedtools`: - `bedtools_getfasta`: extract sequences from a FASTA file for each of the intervals defined in a BED/GFF/VCF file (PR #59). @@ -124,4 +121,4 @@ * Add escaping character before leading hashtag in the description field of the config file (PR #50). -* Format URL in biobase/bcl_convert description (PR #55). \ No newline at end of file +* Format URL in biobase/bcl_convert description (PR #55). From 819bd9bb731fb4834a2815215b743ed543d84d3d Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 18 Jul 2024 07:55:03 +0200 Subject: [PATCH 20/28] Update theodoro_gasperin.yaml --- src/_authors/theodoro_gasperin.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/_authors/theodoro_gasperin.yaml b/src/_authors/theodoro_gasperin.yaml index 899ce180..47af96a9 100644 --- a/src/_authors/theodoro_gasperin.yaml +++ b/src/_authors/theodoro_gasperin.yaml @@ -7,4 +7,4 @@ info: organizations: - name: Data Intuitive href: https://www.data-intuitive.com - role: Bioinformatician \ No newline at end of file + role: Bioinformatician From d05cbf0c4a125e9fc4a2373c2a703c5569118f58 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 18 Jul 2024 07:59:16 +0200 Subject: [PATCH 21/28] add newline --- src/seqtk/seqtk_subseq/script.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seqtk/seqtk_subseq/script.sh b/src/seqtk/seqtk_subseq/script.sh index 7073937f..0aceaf29 100644 --- a/src/seqtk/seqtk_subseq/script.sh +++ b/src/seqtk/seqtk_subseq/script.sh @@ -12,4 +12,4 @@ seqtk subseq \ ${par_sequence_line_length:+-l "$par_sequence_line_length"} \ "$par_input" \ "$par_name_list" \ - > "$par_output" \ No newline at end of file + > "$par_output" From 
c22b383c2508cf7ace18eb977ef5188cc456ccbe Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 18 Jul 2024 07:59:47 +0200 Subject: [PATCH 22/28] add newline --- src/seqtk/seqtk_subseq/test.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index e13da48d..4f73e1e8 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -182,4 +182,4 @@ else echo "$expected_output_wrapped" echo "Got:" echo "$output_wrapped" -fi \ No newline at end of file +fi From dfc0a6559e1eb43cfa45ef51b55549ff090b9f00 Mon Sep 17 00:00:00 2001 From: Theodoro Gasperin Terra Camargo <98555209+tgaspe@users.noreply.github.com> Date: Thu, 18 Jul 2024 09:32:19 -0300 Subject: [PATCH 23/28] Update src/seqtk/seqtk_subseq/config.vsh.yaml Co-authored-by: Robrecht Cannoodt --- src/seqtk/seqtk_subseq/config.vsh.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index 2f19f05b..d2dd223c 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -53,8 +53,8 @@ argument_groups: - name: "--sequence_line_length" alternatives: -l type: integer - description: Sequence line length of input fasta file. - example: 16 + description: Sequence line length of input fasta file. Default: 0. 
+ example: 0 resources: From 0e9060af16b9887b9818428174c4ea5caffd52e7 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Thu, 18 Jul 2024 09:49:08 -0300 Subject: [PATCH 24/28] Version Fix --- src/seqtk/seqtk_subseq/config.vsh.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index 2f19f05b..f85ca250 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -70,7 +70,7 @@ engines: setup: - type: docker run: | - echo "seqtk version: 1.4 (r122)" > /var/software_versions.txt + echo $(echo $(seqtk 2>&1) | sed -n 's/.*\(Version: [^ ]*\).*/\1/p') > /var/software_versions.txt runners: - type: executable From 3a27502126b105ba9144f007f38a81140318eae5 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Thu, 18 Jul 2024 09:54:27 -0300 Subject: [PATCH 25/28] Update on config --- src/seqtk/seqtk_subseq/config.vsh.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/seqtk/seqtk_subseq/config.vsh.yaml b/src/seqtk/seqtk_subseq/config.vsh.yaml index 48059436..1c2e8c08 100644 --- a/src/seqtk/seqtk_subseq/config.vsh.yaml +++ b/src/seqtk/seqtk_subseq/config.vsh.yaml @@ -53,7 +53,8 @@ argument_groups: - name: "--sequence_line_length" alternatives: -l type: integer - description: Sequence line length of input fasta file. Default: 0. + description: | + Sequence line length of input fasta file. Default: 0. 
example: 0 From d00c94417950bd8c047fe13b966b9292df97f102 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Thu, 18 Jul 2024 10:50:38 -0300 Subject: [PATCH 26/28] Helper bed.sh --- src/seqtk/src/_helpers/bed.sh | 80 +++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 src/seqtk/src/_helpers/bed.sh diff --git a/src/seqtk/src/_helpers/bed.sh b/src/seqtk/src/_helpers/bed.sh new file mode 100644 index 00000000..56fa9f0f --- /dev/null +++ b/src/seqtk/src/_helpers/bed.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Function to check if a file is a valid BED file +check_bed_file() { + local file="$1" + num_columns=$(head -n 1 "$file" | cut -f 1- | tr '\t' '\n' | wc -l) + + # Check if the file exists + if [[ ! -f "$file" ]]; then + echo "Error: The specified file does not exist." + return 1 + fi + + # Check if the file is non-empty + if [[ ! -s "$file" ]]; then + echo "Error: The specified file is empty." + return 1 + fi + + # Check if the file is a valid BED file (minimum 3 tab-separated columns) + if [[ $num_columns -lt 3 ]]; then + echo "The specified file is not a valid BED file. It should have at least three tab-separated columns." + return 1 + fi + + # Additional check: Ensure that the 6th column (if present) is either + or - + if [[ $num_columns -eq 6 ]]; then + while IFS=$'\t' read -r -a columns; do + columns[5]=${columns[5]%$'\n'} + if [[ ${columns[5]} == "+" || ${columns[5]} == "-" ]]; then + return 0 + else + echo "Error: The 6th column of the specified BED file should be either + or -." + #echo "Offending line: ${columns[5]}" + return 1 + fi + done < "$file" + fi + + return 0 +} + +# Function to check if a file is a valid list of FASTA file IDs +check_fasta_id_list() { + local file="$1" + + # Check if the file exists + if [[ ! -f "$file" ]]; then + echo "Error: The specified file does not exist." + return 1 + fi + + # Check if the file is non-empty + if [[ ! -s "$file" ]]; then + echo "Error: The specified file is empty." 
+ return 1 + fi + + # Additional check: Ensure that each line contains only one word (FASTA ID) + if ! awk 'NF != 1 { exit 1 }' "$file"; then + return 1 + fi + + return 0 +} + +# Arguments: +list_file="$1" + +# Check if the par_name_list is given and validate accordingly +if [[ -n "$list_file" ]]; then + if check_fasta_id_list "$list_file"; then + echo "The specified file is a valid list of FASTA IDs." + elif check_bed_file "$list_file"; then + echo "The specified file is a valid BED file." + else + echo "Error: The specified file is neither a valid BED file nor a valid list of FASTA IDs." + exit 1 + fi +fi \ No newline at end of file From a276b9ec1431f11e1e752d7af4ca90b9a1503946 Mon Sep 17 00:00:00 2001 From: tgaspe Date: Thu, 18 Jul 2024 15:00:26 -0300 Subject: [PATCH 27/28] Deleted _helpers --- src/seqtk/src/_helpers/bed.sh | 80 ----------------------------------- 1 file changed, 80 deletions(-) delete mode 100644 src/seqtk/src/_helpers/bed.sh diff --git a/src/seqtk/src/_helpers/bed.sh b/src/seqtk/src/_helpers/bed.sh deleted file mode 100644 index 56fa9f0f..00000000 --- a/src/seqtk/src/_helpers/bed.sh +++ /dev/null @@ -1,80 +0,0 @@ -#!/bin/bash - -# Function to check if a file is a valid BED file -check_bed_file() { - local file="$1" - num_columns=$(head -n 1 "$file" | cut -f 1- | tr '\t' '\n' | wc -l) - - # Check if the file exists - if [[ ! -f "$file" ]]; then - echo "Error: The specified file does not exist." - return 1 - fi - - # Check if the file is non-empty - if [[ ! -s "$file" ]]; then - echo "Error: The specified file is empty." - return 1 - fi - - # Check if the file is a valid BED file (minimum 3 tab-separated columns) - if [[ $num_columns -lt 3 ]]; then - echo "The specified file is not a valid BED file. It should have at least three tab-separated columns." 
- return 1 - fi - - # Additional check: Ensure that the 6th column (if present) is either + or - - if [[ $num_columns -eq 6 ]]; then - while IFS=$'\t' read -r -a columns; do - columns[5]=${columns[5]%$'\n'} - if [[ ${columns[5]} == "+" || ${columns[5]} == "-" ]]; then - return 0 - else - echo "Error: The 6th column of the specified BED file should be either + or -." - #echo "Offending line: ${columns[5]}" - return 1 - fi - done < "$file" - fi - - return 0 -} - -# Function to check if a file is a valid list of FASTA file IDs -check_fasta_id_list() { - local file="$1" - - # Check if the file exists - if [[ ! -f "$file" ]]; then - echo "Error: The specified file does not exist." - return 1 - fi - - # Check if the file is non-empty - if [[ ! -s "$file" ]]; then - echo "Error: The specified file is empty." - return 1 - fi - - # Additional check: Ensure that each line contains only one word (FASTA ID) - if ! awk 'NF != 1 { exit 1 }' "$file"; then - return 1 - fi - - return 0 -} - -# Arguments: -list_file="$1" - -# Check if the par_name_list is given and validate accordingly -if [[ -n "$list_file" ]]; then - if check_fasta_id_list "$list_file"; then - echo "The specified file is a valid list of FASTA IDs." - elif check_bed_file "$list_file"; then - echo "The specified file is a valid BED file." - else - echo "Error: The specified file is neither a valid BED file nor a valid list of FASTA IDs." 
- exit 1 - fi -fi \ No newline at end of file From 590e1386deed5b9380091f61509e18adaa6ce453 Mon Sep 17 00:00:00 2001 From: Robrecht Cannoodt Date: Thu, 18 Jul 2024 22:15:58 +0200 Subject: [PATCH 28/28] don't forget exit when a test fails --- src/seqtk/seqtk_subseq/test.sh | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/src/seqtk/seqtk_subseq/test.sh b/src/seqtk/seqtk_subseq/test.sh index 4f73e1e8..f19cfa4a 100644 --- a/src/seqtk/seqtk_subseq/test.sh +++ b/src/seqtk/seqtk_subseq/test.sh @@ -13,7 +13,7 @@ echo "Creating Test Data..." mkdir test_data # Create and populate input.fasta -cat < "test_data/input.fasta" +cat > "test_data/input.fasta" <KU562861.1 GGAGCAGGAGAGTGTTCGAGTTCAGAGATGTCCATGGCGCCGTACGAGAAGGTGATGGATGACCTGGCCA AGGGGCAGCAGTTCGCGACGCAGCTGCAGGGCCTCCTCCGGGACTCCCCCAAGGCCGGCCACATCATGGA @@ -32,13 +32,13 @@ AAAAGCAATCCCAATAGTGGTTACATCCAAACAAAACCCATTTCTTATATTTTCTCAAAAACAGTGAGAG EOL # Update id.list with new entries -cat < "test_data/id.list" +cat > "test_data/id.list" < "test_data/reg.bed" +cat > "test_data/reg.bed" < Run seqtk_subseq with TAB option" expected_output_tabular=$'KU562861.1\t11\tAGTGTTCGAG\nMH150936.1\t11\tTGAAAACTTT' output_tabular=$(cat sub_sample.fq) -if [ "$output_tabular" == "$expected_output_tabular" ]; then - echo "Test passed" -else +if [ "$output_tabular" != "$expected_output_tabular" ]; then echo "Test failed" echo "Expected:" echo "$expected_output_tabular" echo "Got:" echo "$output_tabular" + exit 1 fi ######################################################################################### @@ -145,14 +142,13 @@ TGAAA ACTTT" output_wrapped=$(cat sub_sample.fq) -if [ "$output_wrapped" == "$expected_output_wrapped" ]; then - echo "Test passed" -else +if [ "$output_wrapped" != "$expected_output_wrapped" ]; then echo "Test failed" echo "Expected:" echo "$expected_output_wrapped" echo "Got:" echo "$output_wrapped" + exit 1 fi 
######################################################################################### @@ -174,12 +170,13 @@ AGTGTTCGAG AAAGTTTTCA" output_wrapped=$(cat sub_sample.fq) -if [ "$output_wrapped" == "$expected_output_wrapped" ]; then - echo "Test passed" -else +if [ "$output_wrapped" != "$expected_output_wrapped" ]; then echo "Test failed" echo "Expected:" echo "$expected_output_wrapped" echo "Got:" echo "$output_wrapped" + exit 1 fi + +echo "All tests succeeded!"