Skip to content

Commit

Permalink
Add a whole workflow to debug
Browse files Browse the repository at this point in the history
  • Loading branch information
Shixiang Wang (王诗翔) committed Apr 22, 2024
1 parent 1f1dbd2 commit cdfb1c7
Show file tree
Hide file tree
Showing 5 changed files with 107 additions and 1 deletion.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ rv

### Pipeline (WES bam data only)

- for one tumor-normal pair, you can refer to [one-pair.R](https://github.com/ShixiangWang/gcap/blob/master/test-workflow/one-pair.R).
- for one tumor-normal pair, you can refer to [one-pair.R](https://github.com/ShixiangWang/gcap/blob/master/test-workflow/one-pair.R). [test-workflow/debug](test-workflow/debug) contains a full workflow for data obtained from SRA.
- for multiple tumor-normal pairs, you can refer to [two-pair.R](https://github.com/ShixiangWang/gcap/blob/master/test-workflow/two-pairs.R).

To run **gcap** from bam files, a machine with **at least 80GB RAM** is required for
Expand Down
11 changes: 11 additions & 0 deletions test-workflow/debug/0-dump-sra.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
mkdir -p /data3/wsx/share/gcap_debug
cd /data3/wsx/share

export PATH=$HOME/soft/sratoolkit/bin:$PATH

for i in ERR5242993 ERR5243012
do
echo handling $i
parallel-fastq-dump -t 20 -O gcap_debug/ --split-3 --gzip -s $i
done
51 changes: 51 additions & 0 deletions test-workflow/debug/1-align.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash
source activate circlemap
cd /data3/wsx/share/gcap_debug

cores=24

mkdir bam
sn=$(ls *.fastq.gz | tr ' ' '\n' | sed 's/_[12].fastq.gz//' | sort | uniq)
#sn=ERR5243012
INDEX=/data1/database/human/hg38/bwa_index/hg38_p7

for id in ${sn}; do

fastp -i ${id}_1.fastq.gz -I ${id}_2.fastq.gz -o ${id}_1.fq.gz -O ${id}_2.fq.gz -h ${id}.html -j ${id}.json --thread 16 --dont_overwrite

if [ ! -f bam/${id}.bam ]
then
if [ ! -f bam/${id}.sam ]
then
fq1=${id}_1.fq.gz
fq2=${id}_2.fq.gz
echo "Start aligning for ${id}"
bwa mem -M -t $cores -R "@RG\tID:${id}\tSM:${id}\tLB:WXS\tPL:Illumina" ${INDEX} ${fq1} ${fq2} \
> bam/${id}.sam 2>bam/${id}_bwa.log
else
echo "BWA align for ${id} is done before, directly go to sam > bam step"
fi

if [ $? -eq 0 ]
then
if [ ! -f bam/${id}.bam ]
then
samtools sort -@ $cores bam/${id}.sam -o bam/${id}.bam 2>bam/${id}_bam.log
samtools index bam/${id}.bam
if [ $? -eq 0 ]
then
echo "Removing sam files"
rm bam/${id}.sam
else
echo "Failed when using samtools sort, please check"
exit 1
fi
fi
echo "Done for ${id}." `date`
else
echo "Failed for ${id} in bwa." `date`
fi

fi

done
11 changes: 11 additions & 0 deletions test-workflow/debug/2-prepare.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Set up conda env
# mamba create -n cancerit -c bioconda cancerit-allelecount

cd /data3/wsx/share/gcap_reference
wget -c https://zenodo.org/records/6524005/files/1000G_loci_hg38.tar.gz
wget -c https://zenodo.org/records/6524005/files/GC_correction_hg38.txt.gz
wget -c https://zenodo.org/records/6524005/files/RT_correction_hg38.txt.gz

tar zxvf 1000G_loci_hg38.tar.gz
gunzip GC_correction_hg38.txt.gz
gunzip RT_correction_hg38.txt.gz
33 changes: 33 additions & 0 deletions test-workflow/debug/3-gcap.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
# remotes::install_github("ShixiangWang/ascat@v3-for-gcap-v1", subdir = "ASCAT")
# remotes::install_github("ShixiangWang/gcap")
# install.packages("https://cran.r-project.org/src/contrib/Archive/xgboost/xgboost_1.5.2.1.tar.gz", repos = NULL)

library(gcap)

# id为PRJEB42904,wes_395LC是tumor,id:ERR5242993,wes_395N是normal,id:ERR5243012

# hg38 ----------------
gcap.workflow(
tumourseqfile = "~/share/gcap_debug/bam/ERR5242993.bam",
normalseqfile = "~/share/gcap_debug/bam/ERR5243012.bam",
tumourname = "wes_395LC",
normalname = "wes_395N",
jobname = "wes_395",
outdir = "~/share/gcap_debug/gcap_result",
allelecounter_exe = "~/miniconda3/envs/cancerit/bin/alleleCounter",
g1000allelesprefix = file.path(
"~/share/gcap_reference/1000G_loci_hg38/",
"1kg.phase3.v5a_GRCh38nounref_allele_index_chr"
),
g1000lociprefix = file.path("~/share/gcap_reference/1000G_loci_hg38/",
"1kg.phase3.v5a_GRCh38nounref_loci_chrstring_chr"
),
GCcontentfile = "~/share/gcap_reference/GC_correction_hg38.txt",
replictimingfile = "~/share/gcap_reference/RT_correction_hg38.txt",
skip_finished_ASCAT = TRUE,
skip_ascat_call = FALSE,
result_file_prefix = "wes_395",
genome_build = "hg38",
model = "XGB11"
)

0 comments on commit cdfb1c7

Please sign in to comment.