diff --git a/README.md b/README.md
index b700878..beef403 100644
--- a/README.md
+++ b/README.md
@@ -21,11 +21,49 @@ The analysis workflow consists of several consecutive steps as outlined below. T
The included [workflow](workflow.md) provides detailed instructions for data analysis conducted in the original publication [(1)](#ref1).
## Requirements
-* [SMRT Link](https://github.com/PacificBiosciences/SMRT-Link). Scripts rely on `bax2bam`, `ccs` and `blasr` command-line utilities, which are developed by Pacific Biosciences, Inc. and distributed as part of the SMRT Link software.
+* ~~[SMRT Link](https://github.com/PacificBiosciences/SMRT-Link).~~ Scripts rely on `bax2bam`, `ccs` and `blasr` command-line utilities, which are developed by Pacific Biosciences, Inc. and distributed as part of the SMRT Link software.
+ * As SMRT Link is out-of-date, using [PacificBiosciences/ccs](https://github.com/PacificBiosciences/ccs?tab=readme-ov-file) for `css`,
+ * and [PacificBiosciences/pbmm2](https://github.com/PacificBiosciences/pbmm2) (`pbmm2 align`) to replace `blasr`.
+ * `bax2bam` is no longer needed as current pacbio directly provides bam files.
* [SAMtools](http://samtools.sourceforge.net/). Manipulating BAM files.
* [BWA](http://bio-bwa.sourceforge.net/). Detecting chimeric reads.
* [P7ZIP](http://p7zip.sourceforge.net/). Compressing output files to minimize disk usage.
* [The R Project for Statistical Computing](https://www.r-project.org/). Extracting and tabulating data.
+For easy installation, I recommand using [conda](https://github.com/conda/conda)/[mamba](https://github.com/mamba-org/mamba):
+```bash
+mamba create -n pcrfdelity pbmm2 pbccs samtools bwa p7zip r-base
+mamba activate pcrfdelity
+```
+
+## Usage
+```bash
+mamba activate pcrfdelity
+git clone https://github.com/BioChaoGroup/pcr-fidelity
+cd pcrfdelity
+# Preare data under input and references
+
+# View samples info
+./workflow.sh input/samples.csv view
+
+# Run consensus locally
+./workflow.sh input/samples.csv consensus local
+
+# Run chimeric locally with all sample at the same time (parallelld)
+./workflow.sh input/samples.csv consensus localbp 8
+## This will run each sample a task using 8 CPUs
+
+# Run mutation on SGE
+export SGEQ=
+export SGEP=
+./workflow.sh input/samples.csv mutation qsub 4 1g
+## This will submit each task with 4 CPUs and 1gb memory
+
+# Summary and tabulate is quick so locally run is enough
+./workflow.sh input/samples.csv summary local
+./workflow.sh input/samples.csv tabulate local
+```
+
+
## Citations
1. Potapov V. & Ong JL. Examining Sources of Error in PCR by Single-Molecule Sequencing. PLOS ONE. 2016. doi:10.1371/journal.pone.0169774. ([View article](http://dx.doi.org/10.1371/journal.pone.0169774))
diff --git a/bin/ccs2-mutations.pl b/bin/ccs2-mutations.pl
index 736d343..82685ea 100755
--- a/bin/ccs2-mutations.pl
+++ b/bin/ccs2-mutations.pl
@@ -93,7 +93,7 @@
next if( $flag & 0x800 );
### skip reads with low mapping quality
- next if( $mapq < 254 );
+ next if( $mapq < 60 );
# ----- process CIGAR ------------------------------------------------------
diff --git a/scripts/ccs2-chimeric.sh b/scripts/ccs2-chimeric.sh
index 5f36f97..1c600f1 100644
--- a/scripts/ccs2-chimeric.sh
+++ b/scripts/ccs2-chimeric.sh
@@ -1,18 +1,21 @@
#!/bin/bash
-#$ -S /bin/bash
-#$ -P longrun
-#$ -pe smp 8
+if [ "$#" -ne 8 ]; then
+ echo "Usage: $0 -a amplicon -s sampleId -c collectionPathUri -p threads"
+ exit 1
+fi
-cat $JOB_SCRIPT >> $SGE_STDOUT_PATH
-
-echo ""
-echo "Started on $(date) on $(uname -n)"
-
-module load bwa
-module load p7zip
-module load samtools
+while getopts "a:s:c:p:" opt; do
+ case $opt in
+ a) amplicon="$OPTARG" ;;
+ s) sampleId="$OPTARG" ;;
+ c) collectionPathUri="$OPTARG" ;;
+ p) threads="$OPTARG" ;;
+ \?) echo "Invalid option -$OPTARG" >&2; exit 1 ;;
+ esac
+done
+root=`pwd`
reference="$root/references/$amplicon/sequence/$amplicon.fasta"
### top run directory
diff --git a/scripts/ccs2-consensus.sh b/scripts/ccs2-consensus.sh
index 91f8fc4..e23615c 100644
--- a/scripts/ccs2-consensus.sh
+++ b/scripts/ccs2-consensus.sh
@@ -1,17 +1,21 @@
#!/bin/bash
-#$ -S /bin/bash
-#$ -P longrun
-#$ -pe smp 8
+if [ "$#" -ne 8 ]; then
+ echo "Usage: $0 -a amplicon -s sampleId -c collectionPathUri -p threads"
+ exit 1
+fi
-cat $JOB_SCRIPT >> $SGE_STDOUT_PATH
-
-echo ""
-echo "Started on $(date) on $(uname -n)"
-
-module load smrtlink
-module load samtools
+while getopts "a:s:c:p:" opt; do
+ case $opt in
+ a) amplicon="$OPTARG" ;;
+ s) sampleId="$OPTARG" ;;
+ c) collectionPathUri="$OPTARG" ;;
+ p) threads="$OPTARG" ;;
+ \?) echo "Invalid option -$OPTARG" >&2; exit 1 ;;
+ esac
+done
+root=`pwd`
reference="$root/references/$amplicon/sequence/$amplicon.fasta"
### create run directory
@@ -21,20 +25,19 @@ mkdir -p $rundir
### switch to working directory
cd $rundir
+echo "Task 1 completed at $(date)"
### look up sequencing data
echo ""
-find "$collectionPathUri" -name "*.bax.h5" | sort -u > input.fofn
-echo "Task 1 completed at $(date)"
+raw_bam=`find "$collectionPathUri" -name "*.bam"`
-### convert
-echo ""
-bax2bam --fofn=input.fofn -o movie
+cmd="ln -s $raw_bam movie.subreads.bam"
+echo $cmd && $cmd && \
echo "Task 2 completed at $(date)"
-### map reads
echo ""
-blasr movie.subreads.bam "$reference" -bam -out aligned_reads.bam -useQuality -nproc 8 -bestn 1
+cmd="pbmm2 align "$reference" "$raw_bam" aligned_reads.bam -j $threads -N 1"
+echo $cmd && $cmd && \
echo "Task 3 completed at $(date)"
### extract mapping direction
@@ -57,7 +60,7 @@ do
### build ccs
echo ""
- ccs --reportFile=subreads_ccs.${strand}.csv --logFile=subreads_ccs.${strand}.log --numThreads=7 --minPasses=1 subreads.${strand}.bam subreads_ccs.${strand}.bam
+ ccs --reportFile=subreads_ccs.${strand}.csv --logFile=subreads_ccs.${strand}.log --num-threads=$threads --minPasses=1 subreads.${strand}.bam subreads_ccs.${strand}.bam
echo "Task 7 ($strand) completed at $(date)"
done
diff --git a/scripts/ccs2-mutation.sh b/scripts/ccs2-mutation.sh
index 40ae10c..c40be14 100644
--- a/scripts/ccs2-mutation.sh
+++ b/scripts/ccs2-mutation.sh
@@ -1,18 +1,21 @@
#!/bin/bash
-#$ -S /bin/bash
-#$ -P longrun
-#$ -pe smp 2
+if [ "$#" -ne 8 ]; then
+ echo "Usage: $0 -a amplicon -s sampleId -c collectionPathUri -p threads"
+ exit 1
+fi
-cat $JOB_SCRIPT >> $SGE_STDOUT_PATH
-
-echo ""
-echo "Started on $(date) on $(uname -n)"
-
-module load p7zip
-module load smrtlink
-module load samtools
+while getopts "a:s:c:p:" opt; do
+ case $opt in
+ a) amplicon="$OPTARG" ;;
+ s) sampleId="$OPTARG" ;;
+ c) collectionPathUri="$OPTARG" ;;
+ p) threads="$OPTARG" ;;
+ \?) echo "Invalid option -$OPTARG" >&2; exit 1 ;;
+ esac
+done
+root=`pwd`
reference="$root/references/$amplicon/sequence/$amplicon.fasta"
### run directory
@@ -36,7 +39,7 @@ do
### align reads
echo ""
- blasr $ccsdir/subreads_ccs.$strand.bam reference.fasta -bam -out aligned_reads.bam -bestn 1 -nproc 2
+ pbmm2 align reference.fasta $ccsdir/subreads_ccs.$strand.bam aligned_reads.bam -j 8 -N 1
echo "Task 3 ($strand) completed at $(date)"
### call mutations
diff --git a/scripts/ccs2-summary.sh b/scripts/ccs2-summary.sh
index fb84add..2adb90e 100644
--- a/scripts/ccs2-summary.sh
+++ b/scripts/ccs2-summary.sh
@@ -1,16 +1,21 @@
#!/bin/bash
-#$ -S /bin/bash
-#$ -P longrun
-#$ -pe smp 2
-
-cat $JOB_SCRIPT >> $SGE_STDOUT_PATH
-
-echo ""
-echo "Started on $(date) on $(uname -n)"
-
-module load p7zip
-
+if [ "$#" -ne 8 ]; then
+ echo "Usage: $0 -a amplicon -s sampleId -c collectionPathUri -p threads"
+ exit 1
+fi
+
+while getopts "a:s:c:p:" opt; do
+ case $opt in
+ a) amplicon="$OPTARG" ;;
+ s) sampleId="$OPTARG" ;;
+ c) collectionPathUri="$OPTARG" ;;
+ p) threads="$OPTARG" ;;
+ \?) echo "Invalid option -$OPTARG" >&2; exit 1 ;;
+ esac
+done
+
+root=`pwd`
reference="$root/references/$amplicon/sequence/$amplicon.fasta"
### run directory
diff --git a/workflow.sh b/workflow.sh
index fa1983a..7fe370b 100755
--- a/workflow.sh
+++ b/workflow.sh
@@ -1,6 +1,6 @@
#!/bin/bash
-if [ "$#" -ne 2 ] || ( [ "$2" != "view" ] && [ "$2" != "consensus" ] && [ "$2" != "chimeric" ] && [ "$2" != "mutation" ] && [ "$2" != "summary" ] && [ "$2" != "tabulate" ] )
+if [ "$#" -lt 2 ] || ( [ "$2" != "view" ] && [ "$2" != "consensus" ] && [ "$2" != "chimeric" ] && [ "$2" != "mutation" ] && [ "$2" != "summary" ] && [ "$2" != "tabulate" ] )
then
echo "usage: $0 samples.csv view" >&2
echo " $0 samples.csv consensus" >&2
@@ -11,11 +11,16 @@ then
exit 1
fi
-samples=$1
+PROJDIR=`dirname $0`
+
+samples=$(realpath $1)
command=$2
+howtorun=$3 #either 'local', 'dryrun', 'qsub'
+threads=${4:-1}
+memory=${5:-4g}
-### define top project directory
-PROJDIR=
+# set environment value for $SGEQ and $SGEP
+resource="-l vf=$memory,num_proc=$threads -binding linear:$threads -q $SGEQ -P $SGEP"
cd $PROJDIR
@@ -27,28 +32,44 @@ while read line
do
if [[ ! $line =~ "SampleID" ]]
then
- ### extract SampleID, Amplicon, and data path for each sample
- sampleId=`echo $line | cut -d, -f1`
- amplicon=`echo $line | cut -d, -f3`
- collectionPathUri=`echo $line | cut -d, -f6`
-
- if [ "$command" == "view" ]
- then
- ### preview run info
- echo ""
- echo "sampleId=$sampleId"
- echo "amplicon=$amplicon"
- echo "collectionPathUri=$collectionPathUri"
- else
- if [ "$command" != "tabulate" ] ; then
- logdir=`printf "%s/samples/%05i" $PROJDIR $sampleId`
- mkdir -p "$logdir"
- qsub -v root="$PROJDIR",amplicon="$amplicon",sampleId="$sampleId",collectionPathUri="$PROJDIR/$collectionPathUri" -N "ccs$sampleId" -o $logdir/$command.log -j yes $PROJDIR/scripts/ccs2-$command.sh
- else
- rundir=`printf "%s/samples/%05i/summary" $PROJDIR $sampleId`
- echo $line | tr '\n' ","
- tail --lines 1 $rundir/summary.csv
- fi
- fi
+ ### extract SampleID, Amplicon, and data path for each sample
+ sampleId=`echo $line | cut -d, -f1`
+ amplicon=`echo $line | cut -d, -f3`
+ collectionPathUri=`echo $line | cut -d, -f6 | xargs -n1 realpath`
+ if [ "$command" == "view" ]
+ then
+ ### preview run info
+ echo ""
+ echo "sampleId=$sampleId"
+ echo "amplicon=$amplicon"
+ echo "collectionPathUri=$collectionPathUri"
+ else
+ if [ "$command" != "tabulate" ] ; then
+ logdir=`printf "%s/samples/%05i" $PROJDIR $sampleId`
+ mkdir -p "$logdir"
+
+ cmd="$PROJDIR/scripts/ccs2-$command.sh -a $amplicon -s $sampleId -c $collectionPathUri -p $threads"
+
+ if [ "$howtorun" == "qsub" ];then
+ echo "Run on SGE: $cmd"
+ qsub -V -cwd $resource -N "ccs-$sampleId" -o $logdir/$command.log -j yes $cmd
+ else
+ cmd="bash $cmd"
+ if [ "$howtorun" == "local" ];then
+ echo "Run on local: $cmd" && $cmd
+ else
+ if [ "$howtorun" == "localbg" ];then
+ echo "Run on local: $cmd" && $cmd &
+ else
+ echo "Dryrun(no run): $cmd"
+ fi
+ fi
+ fi
+ else
+ rundir=`printf "%s/samples/%05i/summary" $PROJDIR $sampleId`
+ echo $line | tr '\n' ","
+ tail --lines 1 $rundir/summary.csv
+ fi
+ fi
fi
done < $samples