diff --git a/src/star/star_align/test.sh b/src/star/star_align/test.sh index 1bdadf38..ae31af3c 100644 --- a/src/star/star_align/test.sh +++ b/src/star/star_align/test.sh @@ -7,14 +7,78 @@ meta_executable="target/docker/star/star_align/star_align" meta_resources_dir="src/star/star_align" ## VIASH END +############################################# +# helper functions +assert_file_exists() { + [ -f "$1" ] || (echo "File '$1' does not exist" && exit 1) +} +assert_file_doesnt_exist() { + [ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1) +} +assert_file_empty() { + [ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1) +} +assert_file_not_empty() { + [ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1) +} +assert_file_contains() { + grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) +} +assert_file_not_contains() { + grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) +} +assert_file_contains_regex() { + grep -q -E "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1) +} +assert_file_not_contains_regex() { + grep -q -E "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1) +} + +############################################# +echo "> Prepare test data" + +cat > reads_R1.fastq <<'EOF' +@SEQ_ID1 +ACGCTGCCTCATAAGCCTCACACAT ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +ACCCGCAAGATTAGGCTCCGTACAC ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > reads_R2.fastq <<'EOF' +@SEQ_ID1 +ATGTGTGAGGCTTATGAGGCAGCGT ++ +IIIIIIIIIIIIIIIIIIIIIIIII +@SEQ_ID2 +GTGTACGGAGCCTAATCTTGCAGGG ++ +!!!!!!!!!!!!!!!!!!!!!!!!! +EOF + +cat > genome.fasta <<'EOF' +>chr1 +TGGCATGAGCCAACGAACGCTGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAGCGTTCGTGGG +GCTCGTCACCACTATGGTTGGCCGGTTAGTAGTGTGACTCCTGGTTTTCTGGAGCTTCTTTAAACCGTAGTCCAGTCAA +TGCGAATGGCACTTCACGACGGACTGTCCTTAGCTCAGGGGA +EOF + +cat > genes.gtf <<'EOF' +chr1 example_source gene 0 50 . + . gene_id "gene1"; transcript_id "transcript1"; +chr1 example_source exon 20 40 . + . gene_id "gene1"; transcript_id "transcript1"; +EOF echo "> Generate index" STAR \ ${meta_cpus:+--runThreadN $meta_cpus} \ --runMode genomeGenerate \ --genomeDir "index/" \ - --genomeFastaFiles "$meta_resources_dir/test_data/genome.fasta" #\ - # --sjdbGTFfile "$meta_resources_dir/test_data/genes.gtf" + --genomeFastaFiles "genome.fasta" \ + --sjdbGTFfile "genes.gtf" \ + --genomeSAindexNbases 2 ######################################################################################### mkdir star_align_se @@ -22,71 +86,84 @@ cd star_align_se echo "> Run star_align on SE" "$meta_executable" \ - --input "$meta_resources_dir/test_data/a_R1.1.fastq" \ - --input "$meta_resources_dir/test_data/a_R1.2.fastq" \ + --input "../reads_R1.fastq" \ --genomeDir "../index/" \ - --aligned_reads "output.bam" \ + --aligned_reads "output.sam" \ --log "log.txt" \ --outReadsUnmapped "Fastx" \ - --unmapped "unmapped.bam" \ - ${meta_cpus:+---cpus $meta_cpus} + --unmapped "unmapped.sam" \ + ${meta_cpus:+---cpus $meta_cpus} \ + --quantMode "TranscriptomeSAM;GeneCounts" \ + --reads_per_gene "reads_per_gene.tsv" \ + --outSJtype Standard \ + --splice_junctions "splice_junctions.tsv" - # optional outputs - # \ - # --quantMode "TranscriptomeSAM;GeneCounts" \ - # --reads_per_gene "reads_per_gene.tsv" \ - # --chimOutType "Junctions" \ - # --chimeric_junctions "chimeric_junctions.tsv" \ - # --outSJtype Standard \ - # --splice_junctions "splice_junctions.tsv" - -# ↑ automate depending on desired outputs? +# TODO: Test data doesn't contain any chimeric reads yet +# --chimOutType "Junctions" \ +# --chimeric_junctions "chimeric_junctions.tsv" \ echo ">> Check if output exists" -[ ! -f "output.bam" ] && echo ">> output.bam does not exist" && exit 1 -[ ! -f "log.txt" ] && echo ">> log.txt does not exist" && exit 1 - -# [ ! -f "reads_per_gene.tsv" ] && echo ">> reads_per_gene.tsv does not exist" && exit 1 -# [ ! -f "chimeric_junctions.tsv" ] && echo ">> chimeric_junctions.tsv does not exist" && exit 1 -# [ ! -f "splice_junctions.tsv" ] && echo ">> splice_junctions.tsv does not exist" && exit 1 -[ ! -f "unmapped.bam" ] && echo ">> unmapped.bam does not exist" && exit 1 +assert_file_exists "output.sam" +assert_file_exists "log.txt" +assert_file_exists "reads_per_gene.tsv" +# assert_file_exists "chimeric_junctions.tsv" +assert_file_exists "splice_junctions.tsv" +assert_file_exists "unmapped.sam" + +echo ">> Check if output contents are not empty" +assert_file_not_empty "output.sam" +assert_file_not_empty "log.txt" +assert_file_not_empty "reads_per_gene.tsv" +# assert_file_not_empty "chimeric_junctions.tsv" +# assert_file_not_empty "splice_junctions.tsv" # TODO: test data doesn't contain any splice junctions yet +assert_file_not_empty "unmapped.sam" + +echo ">> Check if output contents are correct" +assert_file_contains "log.txt" "Number of input reads \\| 2" +assert_file_contains "log.txt" "Number of reads unmapped: too short \\| 1" +assert_file_contains "log.txt" "Uniquely mapped reads number \\| 1" +assert_file_contains "reads_per_gene.tsv" "gene1 1 1 0" +assert_file_contains "reads_per_gene.tsv" "N_unmapped 1 1 1" +assert_file_contains "output.sam" "SEQ_ID1 0 chr1 17 255 25M \\* 0 0 ACGCTGCCTCATAAGCCTCACACAT IIIIIIIIIIIIIIIIIIIIIIIII NH:i:1 HI:i:1 AS:i:24 nM:i:0" +assert_file_contains "unmapped.sam" "@SEQ_ID2 0:N:" +assert_file_contains "unmapped.sam" "ACCCGCAAGATTAGGCTCCGTACAC" cd .. -######################################################################################### -mkdir star_align_pe_minimal -cd star_align_pe_minimal - -echo ">> Run star_align on PE" -"$meta_executable" \ - --input "$meta_resources_dir/test_data/a_R1.1.fastq" \ - --input "$meta_resources_dir/test_data/a_R1.2.fastq" \ - --input_r2 "$meta_resources_dir/test_data/a_R2.1.fastq" \ - --input_r2 "$meta_resources_dir/test_data/a_R2.2.fastq" \ - --genomeDir "../index/" \ - --aligned_reads "output.bam" \ - --log "log.txt" \ - --outReadsUnmapped "Fastx" \ - --unmapped "unmapped.bam" \ - --unmapped_r2 "unmapped_r2.bam" \ - ${meta_cpus:+---cpus $meta_cpus} - -echo ">> Check if output exists" -[ ! -f "output.bam" ] && echo ">> output.bam does not exist" && exit 1 -[ ! -f "log.txt" ] && echo ">> log.txt does not exist" && exit 1 -[ ! -f "unmapped.bam" ] && echo ">> unmapped.bam does not exist" && exit 1 -[ ! -f "unmapped_r2.bam" ] && echo ">> unmapped_r2.bam does not exist" && exit 1 - -cd .. -######################################################################################### - -# TODO: check whether optional outputs work: -# - quantMode "TranscriptomeSAM;GeneCounts" -# - reads_per_gene "reads_per_gene.tsv" -# - chimOutType "Junctions" -# - chimeric_junctions "chimeric_junctions.tsv" -# - outSJtype Standard -# - splice_junctions "splice_junctions.tsv" -# TODO: check whether output contents are correct - -echo "> Test successful" +# ######################################################################################### +# mkdir star_align_pe_minimal +# cd star_align_pe_minimal + +# echo ">> Run star_align on PE" +# "$meta_executable" \ +# --input "$meta_resources_dir/test_data/a_R1.1.fastq" \ +# --input "$meta_resources_dir/test_data/a_R1.2.fastq" \ +# --input_r2 "$meta_resources_dir/test_data/a_R2.1.fastq" \ +# --input_r2 "$meta_resources_dir/test_data/a_R2.2.fastq" \ +# --genomeDir "../index/" \ +# --aligned_reads "output.sam" \ +# --log "log.txt" \ +# --outReadsUnmapped "Fastx" \ +# --unmapped "unmapped.bam" \ +# --unmapped_r2 "unmapped_r2.bam" \ +# ${meta_cpus:+---cpus $meta_cpus} + +# echo ">> Check if output exists" +# [ ! -f "output.sam" ] && echo ">> output.sam does not exist" && exit 1 +# [ ! -f "log.txt" ] && echo ">> log.txt does not exist" && exit 1 +# [ ! -f "unmapped.bam" ] && echo ">> unmapped.bam does not exist" && exit 1 +# [ ! -f "unmapped_r2.bam" ] && echo ">> unmapped_r2.bam does not exist" && exit 1 + +# cd .. +# ######################################################################################### + +# # TODO: check whether optional outputs work: +# # - quantMode "TranscriptomeSAM;GeneCounts" +# # - reads_per_gene "reads_per_gene.tsv" +# # - chimOutType "Junctions" +# # - chimeric_junctions "chimeric_junctions.tsv" +# # - outSJtype Standard +# # - splice_junctions "splice_junctions.tsv" +# # TODO: check whether output contents are correct + +# echo "> Test successful"