From 33065181aec56e2c9c5205d64265ef7a69cef5f4 Mon Sep 17 00:00:00 2001 From: Fangchao Date: Fri, 28 Jun 2024 12:47:15 +0800 Subject: [PATCH 1/2] fit present working envs (@2024) --- bin/ccs2-mutations.pl | 2 +- scripts/ccs2-chimeric.sh | 25 +++++++------ scripts/ccs2-consensus.sh | 39 ++++++++++---------- scripts/ccs2-mutation.sh | 27 +++++++------- scripts/ccs2-summary.sh | 27 ++++++++------ workflow.sh | 75 +++++++++++++++++++++++++-------------- 6 files changed, 115 insertions(+), 80 deletions(-) diff --git a/bin/ccs2-mutations.pl b/bin/ccs2-mutations.pl index 736d343..82685ea 100755 --- a/bin/ccs2-mutations.pl +++ b/bin/ccs2-mutations.pl @@ -93,7 +93,7 @@ next if( $flag & 0x800 ); ### skip reads with low mapping quality - next if( $mapq < 254 ); + next if( $mapq < 60 ); # ----- process CIGAR ------------------------------------------------------ diff --git a/scripts/ccs2-chimeric.sh b/scripts/ccs2-chimeric.sh index 5f36f97..1c600f1 100644 --- a/scripts/ccs2-chimeric.sh +++ b/scripts/ccs2-chimeric.sh @@ -1,18 +1,21 @@ #!/bin/bash -#$ -S /bin/bash -#$ -P longrun -#$ -pe smp 8 +if [ "$#" -ne 8 ]; then + echo "Usage: $0 -a amplicon -s sampleId -c collectionPathUri -p threads" + exit 1 +fi -cat $JOB_SCRIPT >> $SGE_STDOUT_PATH - -echo "" -echo "Started on $(date) on $(uname -n)" - -module load bwa -module load p7zip -module load samtools +while getopts "a:s:c:p:" opt; do + case $opt in + a) amplicon="$OPTARG" ;; + s) sampleId="$OPTARG" ;; + c) collectionPathUri="$OPTARG" ;; + p) threads="$OPTARG" ;; + \?) echo "Invalid option -$OPTARG" >&2; exit 1 ;; + esac +done +root=`pwd` reference="$root/references/$amplicon/sequence/$amplicon.fasta" ### top run directory diff --git a/scripts/ccs2-consensus.sh b/scripts/ccs2-consensus.sh index 91f8fc4..e23615c 100644 --- a/scripts/ccs2-consensus.sh +++ b/scripts/ccs2-consensus.sh @@ -1,17 +1,21 @@ #!/bin/bash -#$ -S /bin/bash -#$ -P longrun -#$ -pe smp 8 +if [ "$#" -ne 8 ]; then + echo "Usage: $0 -a amplicon -s sampleId -c collectionPathUri -p threads" + exit 1 +fi -cat $JOB_SCRIPT >> $SGE_STDOUT_PATH - -echo "" -echo "Started on $(date) on $(uname -n)" - -module load smrtlink -module load samtools +while getopts "a:s:c:p:" opt; do + case $opt in + a) amplicon="$OPTARG" ;; + s) sampleId="$OPTARG" ;; + c) collectionPathUri="$OPTARG" ;; + p) threads="$OPTARG" ;; + \?) echo "Invalid option -$OPTARG" >&2; exit 1 ;; + esac +done +root=`pwd` reference="$root/references/$amplicon/sequence/$amplicon.fasta" ### create run directory @@ -21,20 +25,19 @@ mkdir -p $rundir ### switch to working directory cd $rundir +echo "Task 1 completed at $(date)" ### look up sequencing data echo "" -find "$collectionPathUri" -name "*.bax.h5" | sort -u > input.fofn -echo "Task 1 completed at $(date)" +raw_bam=`find "$collectionPathUri" -name "*.bam"` -### convert -echo "" -bax2bam --fofn=input.fofn -o movie +cmd="ln -s $raw_bam movie.subreads.bam" +echo $cmd && $cmd && \ echo "Task 2 completed at $(date)" -### map reads echo "" -blasr movie.subreads.bam "$reference" -bam -out aligned_reads.bam -useQuality -nproc 8 -bestn 1 +cmd="pbmm2 align "$reference" "$raw_bam" aligned_reads.bam -j $threads -N 1" +echo $cmd && $cmd && \ echo "Task 3 completed at $(date)" ### extract mapping direction @@ -57,7 +60,7 @@ do ### build ccs echo "" - ccs --reportFile=subreads_ccs.${strand}.csv --logFile=subreads_ccs.${strand}.log --numThreads=7 --minPasses=1 subreads.${strand}.bam subreads_ccs.${strand}.bam + ccs --reportFile=subreads_ccs.${strand}.csv --logFile=subreads_ccs.${strand}.log --num-threads=$threads --minPasses=1 subreads.${strand}.bam subreads_ccs.${strand}.bam echo "Task 7 ($strand) completed at $(date)" done diff --git a/scripts/ccs2-mutation.sh b/scripts/ccs2-mutation.sh index 40ae10c..c40be14 100644 --- a/scripts/ccs2-mutation.sh +++ b/scripts/ccs2-mutation.sh @@ -1,18 +1,21 @@ #!/bin/bash -#$ -S /bin/bash -#$ -P longrun -#$ -pe smp 2 +if [ "$#" -ne 8 ]; then + echo "Usage: $0 -a amplicon -s sampleId -c collectionPathUri -p threads" + exit 1 +fi -cat $JOB_SCRIPT >> $SGE_STDOUT_PATH - -echo "" -echo "Started on $(date) on $(uname -n)" - -module load p7zip -module load smrtlink -module load samtools +while getopts "a:s:c:p:" opt; do + case $opt in + a) amplicon="$OPTARG" ;; + s) sampleId="$OPTARG" ;; + c) collectionPathUri="$OPTARG" ;; + p) threads="$OPTARG" ;; + \?) echo "Invalid option -$OPTARG" >&2; exit 1 ;; + esac +done +root=`pwd` reference="$root/references/$amplicon/sequence/$amplicon.fasta" ### run directory @@ -36,7 +39,7 @@ do ### align reads echo "" - blasr $ccsdir/subreads_ccs.$strand.bam reference.fasta -bam -out aligned_reads.bam -bestn 1 -nproc 2 + pbmm2 align reference.fasta $ccsdir/subreads_ccs.$strand.bam aligned_reads.bam -j 8 -N 1 echo "Task 3 ($strand) completed at $(date)" ### call mutations diff --git a/scripts/ccs2-summary.sh b/scripts/ccs2-summary.sh index fb84add..2adb90e 100644 --- a/scripts/ccs2-summary.sh +++ b/scripts/ccs2-summary.sh @@ -1,16 +1,21 @@ #!/bin/bash -#$ -S /bin/bash -#$ -P longrun -#$ -pe smp 2 - -cat $JOB_SCRIPT >> $SGE_STDOUT_PATH - -echo "" -echo "Started on $(date) on $(uname -n)" - -module load p7zip - +if [ "$#" -ne 8 ]; then + echo "Usage: $0 -a amplicon -s sampleId -c collectionPathUri -p threads" + exit 1 +fi + +while getopts "a:s:c:p:" opt; do + case $opt in + a) amplicon="$OPTARG" ;; + s) sampleId="$OPTARG" ;; + c) collectionPathUri="$OPTARG" ;; + p) threads="$OPTARG" ;; + \?) echo "Invalid option -$OPTARG" >&2; exit 1 ;; + esac +done + +root=`pwd` reference="$root/references/$amplicon/sequence/$amplicon.fasta" ### run directory diff --git a/workflow.sh b/workflow.sh index fa1983a..7fe370b 100755 --- a/workflow.sh +++ b/workflow.sh @@ -1,6 +1,6 @@ #!/bin/bash -if [ "$#" -ne 2 ] || ( [ "$2" != "view" ] && [ "$2" != "consensus" ] && [ "$2" != "chimeric" ] && [ "$2" != "mutation" ] && [ "$2" != "summary" ] && [ "$2" != "tabulate" ] ) +if [ "$#" -lt 2 ] || ( [ "$2" != "view" ] && [ "$2" != "consensus" ] && [ "$2" != "chimeric" ] && [ "$2" != "mutation" ] && [ "$2" != "summary" ] && [ "$2" != "tabulate" ] ) then echo "usage: $0 samples.csv view" >&2 echo " $0 samples.csv consensus" >&2 @@ -11,11 +11,16 @@ then exit 1 fi -samples=$1 +PROJDIR=`dirname $0` + +samples=$(realpath $1) command=$2 +howtorun=$3 #either 'local', 'dryrun', 'qsub' +threads=${4:-1} +memory=${5:-4g} -### define top project directory -PROJDIR= +# set environment value for $SGEQ and $SGEP +resource="-l vf=$memory,num_proc=$threads -binding linear:$threads -q $SGEQ -P $SGEP" cd $PROJDIR @@ -27,28 +32,44 @@ while read line do if [[ ! $line =~ "SampleID" ]] then - ### extract SampleID, Amplicon, and data path for each sample - sampleId=`echo $line | cut -d, -f1` - amplicon=`echo $line | cut -d, -f3` - collectionPathUri=`echo $line | cut -d, -f6` - - if [ "$command" == "view" ] - then - ### preview run info - echo "" - echo "sampleId=$sampleId" - echo "amplicon=$amplicon" - echo "collectionPathUri=$collectionPathUri" - else - if [ "$command" != "tabulate" ] ; then - logdir=`printf "%s/samples/%05i" $PROJDIR $sampleId` - mkdir -p "$logdir" - qsub -v root="$PROJDIR",amplicon="$amplicon",sampleId="$sampleId",collectionPathUri="$PROJDIR/$collectionPathUri" -N "ccs$sampleId" -o $logdir/$command.log -j yes $PROJDIR/scripts/ccs2-$command.sh - else - rundir=`printf "%s/samples/%05i/summary" $PROJDIR $sampleId` - echo $line | tr '\n' "," - tail --lines 1 $rundir/summary.csv - fi - fi + ### extract SampleID, Amplicon, and data path for each sample + sampleId=`echo $line | cut -d, -f1` + amplicon=`echo $line | cut -d, -f3` + collectionPathUri=`echo $line | cut -d, -f6 | xargs -n1 realpath` + if [ "$command" == "view" ] + then + ### preview run info + echo "" + echo "sampleId=$sampleId" + echo "amplicon=$amplicon" + echo "collectionPathUri=$collectionPathUri" + else + if [ "$command" != "tabulate" ] ; then + logdir=`printf "%s/samples/%05i" $PROJDIR $sampleId` + mkdir -p "$logdir" + + cmd="$PROJDIR/scripts/ccs2-$command.sh -a $amplicon -s $sampleId -c $collectionPathUri -p $threads" + + if [ "$howtorun" == "qsub" ];then + echo "Run on SGE: $cmd" + qsub -V -cwd $resource -N "ccs-$sampleId" -o $logdir/$command.log -j yes $cmd + else + cmd="bash $cmd" + if [ "$howtorun" == "local" ];then + echo "Run on local: $cmd" && $cmd + else + if [ "$howtorun" == "localbg" ];then + echo "Run on local: $cmd" && $cmd & + else + echo "Dryrun(no run): $cmd" + fi + fi + fi + else + rundir=`printf "%s/samples/%05i/summary" $PROJDIR $sampleId` + echo $line | tr '\n' "," + tail --lines 1 $rundir/summary.csv + fi + fi fi done < $samples From 276f3680c2d0924c24ae09fef61e6ef922c51fa1 Mon Sep 17 00:00:00 2001 From: Fangchao Date: Fri, 28 Jun 2024 13:16:22 +0800 Subject: [PATCH 2/2] add installation and usage --- README.md | 40 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 39 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b700878..beef403 100644 --- a/README.md +++ b/README.md @@ -21,11 +21,49 @@ The analysis workflow consists of several consecutive steps as outlined below. T The included [workflow](workflow.md) provides detailed instructions for data analysis conducted in the original publication [(1)](#ref1). ## Requirements -* [SMRT Link](https://github.com/PacificBiosciences/SMRT-Link). Scripts rely on `bax2bam`, `ccs` and `blasr` command-line utilities, which are developed by Pacific Biosciences, Inc. and distributed as part of the SMRT Link software. +* ~~[SMRT Link](https://github.com/PacificBiosciences/SMRT-Link).~~ Scripts rely on `bax2bam`, `ccs` and `blasr` command-line utilities, which are developed by Pacific Biosciences, Inc. and distributed as part of the SMRT Link software. + * As SMRT Link is out-of-date, using [PacificBiosciences/ccs](https://github.com/PacificBiosciences/ccs?tab=readme-ov-file) for `css`, + * and [PacificBiosciences/pbmm2](https://github.com/PacificBiosciences/pbmm2) (`pbmm2 align`) to replace `blasr`. + * `bax2bam` is no longer needed as current pacbio directly provides bam files. * [SAMtools](http://samtools.sourceforge.net/). Manipulating BAM files. * [BWA](http://bio-bwa.sourceforge.net/). Detecting chimeric reads. * [P7ZIP](http://p7zip.sourceforge.net/). Compressing output files to minimize disk usage. * [The R Project for Statistical Computing](https://www.r-project.org/). Extracting and tabulating data. +For easy installation, I recommand using [conda](https://github.com/conda/conda)/[mamba](https://github.com/mamba-org/mamba): +```bash +mamba create -n pcrfdelity pbmm2 pbccs samtools bwa p7zip r-base +mamba activate pcrfdelity +``` + +## Usage +```bash +mamba activate pcrfdelity +git clone https://github.com/BioChaoGroup/pcr-fidelity +cd pcrfdelity +# Preare data under input and references + +# View samples info +./workflow.sh input/samples.csv view + +# Run consensus locally +./workflow.sh input/samples.csv consensus local + +# Run chimeric locally with all sample at the same time (parallelld) +./workflow.sh input/samples.csv consensus localbp 8 +## This will run each sample a task using 8 CPUs + +# Run mutation on SGE +export SGEQ= +export SGEP= +./workflow.sh input/samples.csv mutation qsub 4 1g +## This will submit each task with 4 CPUs and 1gb memory + +# Summary and tabulate is quick so locally run is enough +./workflow.sh input/samples.csv summary local +./workflow.sh input/samples.csv tabulate local +``` + + ## Citations 1. Potapov V. & Ong JL. Examining Sources of Error in PCR by Single-Molecule Sequencing. PLOS ONE. 2016. doi:10.1371/journal.pone.0169774. ([View article](http://dx.doi.org/10.1371/journal.pone.0169774))