Skip to content

Commit

Permalink
wip check contents of star output
Browse files Browse the repository at this point in the history
  • Loading branch information
rcannood committed Feb 26, 2024
1 parent e06e407 commit 0da7b0b
Showing 1 changed file with 138 additions and 61 deletions.
199 changes: 138 additions & 61 deletions src/star/star_align/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,86 +7,163 @@ meta_executable="target/docker/star/star_align/star_align"
meta_resources_dir="src/star/star_align"
## VIASH END

#############################################
# helper functions
assert_file_exists() {
[ -f "$1" ] || (echo "File '$1' does not exist" && exit 1)
}
assert_file_doesnt_exist() {
[ ! -f "$1" ] || (echo "File '$1' exists but shouldn't" && exit 1)
}
assert_file_empty() {
[ ! -s "$1" ] || (echo "File '$1' is not empty but should be" && exit 1)
}
assert_file_not_empty() {
[ -s "$1" ] || (echo "File '$1' is empty but shouldn't be" && exit 1)
}
assert_file_contains() {
grep -q "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1)
}
assert_file_not_contains() {
grep -q "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1)
}
assert_file_contains_regex() {
grep -q -E "$2" "$1" || (echo "File '$1' does not contain '$2'" && exit 1)
}
assert_file_not_contains_regex() {
grep -q -E "$2" "$1" && (echo "File '$1' contains '$2' but shouldn't" && exit 1)
}

#############################################
echo "> Prepare test data"

cat > reads_R1.fastq <<'EOF'
@SEQ_ID1
ACGCTGCCTCATAAGCCTCACACAT
+
IIIIIIIIIIIIIIIIIIIIIIIII
@SEQ_ID2
ACCCGCAAGATTAGGCTCCGTACAC
+
!!!!!!!!!!!!!!!!!!!!!!!!!
EOF

cat > reads_R2.fastq <<'EOF'
@SEQ_ID1
ATGTGTGAGGCTTATGAGGCAGCGT
+
IIIIIIIIIIIIIIIIIIIIIIIII
@SEQ_ID2
GTGTACGGAGCCTAATCTTGCAGGG
+
!!!!!!!!!!!!!!!!!!!!!!!!!
EOF

cat > genome.fasta <<'EOF'
>chr1
TGGCATGAGCCAACGAACGCTGCCTCATAAGCCTCACACATCCGCGCCTATGTTGTGACTCTCTGTGAGCGTTCGTGGG
GCTCGTCACCACTATGGTTGGCCGGTTAGTAGTGTGACTCCTGGTTTTCTGGAGCTTCTTTAAACCGTAGTCCAGTCAA
TGCGAATGGCACTTCACGACGGACTGTCCTTAGCTCAGGGGA
EOF

cat > genes.gtf <<'EOF'
chr1 example_source gene 0 50 . + . gene_id "gene1"; transcript_id "transcript1";
chr1 example_source exon 20 40 . + . gene_id "gene1"; transcript_id "transcript1";
EOF

echo "> Generate index"
STAR \
${meta_cpus:+--runThreadN $meta_cpus} \
--runMode genomeGenerate \
--genomeDir "index/" \
--genomeFastaFiles "$meta_resources_dir/test_data/genome.fasta" #\
# --sjdbGTFfile "$meta_resources_dir/test_data/genes.gtf"
--genomeFastaFiles "genome.fasta" \
--sjdbGTFfile "genes.gtf" \
--genomeSAindexNbases 2

#########################################################################################
mkdir star_align_se
cd star_align_se

echo "> Run star_align on SE"
"$meta_executable" \
--input "$meta_resources_dir/test_data/a_R1.1.fastq" \
--input "$meta_resources_dir/test_data/a_R1.2.fastq" \
--input "../reads_R1.fastq" \
--genomeDir "../index/" \
--aligned_reads "output.bam" \
--aligned_reads "output.sam" \
--log "log.txt" \
--outReadsUnmapped "Fastx" \
--unmapped "unmapped.bam" \
${meta_cpus:+---cpus $meta_cpus}
--unmapped "unmapped.sam" \
${meta_cpus:+---cpus $meta_cpus} \
--quantMode "TranscriptomeSAM;GeneCounts" \
--reads_per_gene "reads_per_gene.tsv" \
--outSJtype Standard \
--splice_junctions "splice_junctions.tsv"

# optional outputs
# \
# --quantMode "TranscriptomeSAM;GeneCounts" \
# --reads_per_gene "reads_per_gene.tsv" \
# --chimOutType "Junctions" \
# --chimeric_junctions "chimeric_junctions.tsv" \
# --outSJtype Standard \
# --splice_junctions "splice_junctions.tsv"

# ↑ automate depending on desired outputs?
# TODO: Test data doesn't contain any chimeric reads yet
# --chimOutType "Junctions" \
# --chimeric_junctions "chimeric_junctions.tsv" \

echo ">> Check if output exists"
[ ! -f "output.bam" ] && echo ">> output.bam does not exist" && exit 1
[ ! -f "log.txt" ] && echo ">> log.txt does not exist" && exit 1

# [ ! -f "reads_per_gene.tsv" ] && echo ">> reads_per_gene.tsv does not exist" && exit 1
# [ ! -f "chimeric_junctions.tsv" ] && echo ">> chimeric_junctions.tsv does not exist" && exit 1
# [ ! -f "splice_junctions.tsv" ] && echo ">> splice_junctions.tsv does not exist" && exit 1
[ ! -f "unmapped.bam" ] && echo ">> unmapped.bam does not exist" && exit 1
assert_file_exists "output.sam"
assert_file_exists "log.txt"
assert_file_exists "reads_per_gene.tsv"
# assert_file_exists "chimeric_junctions.tsv"
assert_file_exists "splice_junctions.tsv"
assert_file_exists "unmapped.sam"

echo ">> Check if output contents are not empty"
assert_file_not_empty "output.sam"
assert_file_not_empty "log.txt"
assert_file_not_empty "reads_per_gene.tsv"
# assert_file_not_empty "chimeric_junctions.tsv"
# assert_file_not_empty "splice_junctions.tsv" # TODO: test data doesn't contain any splice junctions yet
assert_file_not_empty "unmapped.sam"

echo ">> Check if output contents are correct"
assert_file_contains "log.txt" "Number of input reads \\| 2"
assert_file_contains "log.txt" "Number of reads unmapped: too short \\| 1"
assert_file_contains "log.txt" "Uniquely mapped reads number \\| 1"
assert_file_contains "reads_per_gene.tsv" "gene1 1 1 0"
assert_file_contains "reads_per_gene.tsv" "N_unmapped 1 1 1"
assert_file_contains "output.sam" "SEQ_ID1 0 chr1 17 255 25M \\* 0 0 ACGCTGCCTCATAAGCCTCACACAT IIIIIIIIIIIIIIIIIIIIIIIII NH:i:1 HI:i:1 AS:i:24 nM:i:0"
assert_file_contains "unmapped.sam" "@SEQ_ID2 0:N:"
assert_file_contains "unmapped.sam" "ACCCGCAAGATTAGGCTCCGTACAC"

cd ..

#########################################################################################
mkdir star_align_pe_minimal
cd star_align_pe_minimal

echo ">> Run star_align on PE"
"$meta_executable" \
--input "$meta_resources_dir/test_data/a_R1.1.fastq" \
--input "$meta_resources_dir/test_data/a_R1.2.fastq" \
--input_r2 "$meta_resources_dir/test_data/a_R2.1.fastq" \
--input_r2 "$meta_resources_dir/test_data/a_R2.2.fastq" \
--genomeDir "../index/" \
--aligned_reads "output.bam" \
--log "log.txt" \
--outReadsUnmapped "Fastx" \
--unmapped "unmapped.bam" \
--unmapped_r2 "unmapped_r2.bam" \
${meta_cpus:+---cpus $meta_cpus}

echo ">> Check if output exists"
[ ! -f "output.bam" ] && echo ">> output.bam does not exist" && exit 1
[ ! -f "log.txt" ] && echo ">> log.txt does not exist" && exit 1
[ ! -f "unmapped.bam" ] && echo ">> unmapped.bam does not exist" && exit 1
[ ! -f "unmapped_r2.bam" ] && echo ">> unmapped_r2.bam does not exist" && exit 1

cd ..
#########################################################################################

# TODO: check whether optional outputs work:
# - quantMode "TranscriptomeSAM;GeneCounts"
# - reads_per_gene "reads_per_gene.tsv"
# - chimOutType "Junctions"
# - chimeric_junctions "chimeric_junctions.tsv"
# - outSJtype Standard
# - splice_junctions "splice_junctions.tsv"
# TODO: check whether output contents are correct

echo "> Test successful"
# #########################################################################################
# mkdir star_align_pe_minimal
# cd star_align_pe_minimal

# echo ">> Run star_align on PE"
# "$meta_executable" \
# --input "$meta_resources_dir/test_data/a_R1.1.fastq" \
# --input "$meta_resources_dir/test_data/a_R1.2.fastq" \
# --input_r2 "$meta_resources_dir/test_data/a_R2.1.fastq" \
# --input_r2 "$meta_resources_dir/test_data/a_R2.2.fastq" \
# --genomeDir "../index/" \
# --aligned_reads "output.sam" \
# --log "log.txt" \
# --outReadsUnmapped "Fastx" \
# --unmapped "unmapped.bam" \
# --unmapped_r2 "unmapped_r2.bam" \
# ${meta_cpus:+---cpus $meta_cpus}

# echo ">> Check if output exists"
# [ ! -f "output.sam" ] && echo ">> output.sam does not exist" && exit 1
# [ ! -f "log.txt" ] && echo ">> log.txt does not exist" && exit 1
# [ ! -f "unmapped.bam" ] && echo ">> unmapped.bam does not exist" && exit 1
# [ ! -f "unmapped_r2.bam" ] && echo ">> unmapped_r2.bam does not exist" && exit 1

# cd ..
# #########################################################################################

# # TODO: check whether optional outputs work:
# # - quantMode "TranscriptomeSAM;GeneCounts"
# # - reads_per_gene "reads_per_gene.tsv"
# # - chimOutType "Junctions"
# # - chimeric_junctions "chimeric_junctions.tsv"
# # - outSJtype Standard
# # - splice_junctions "splice_junctions.tsv"
# # TODO: check whether output contents are correct

# echo "> Test successful"

0 comments on commit 0da7b0b

Please sign in to comment.