Skip to content

Commit

Permalink
Update crisprme_auto_test_docker.sh
Browse files Browse the repository at this point in the history
  • Loading branch information
ManuelTgn committed Oct 16, 2024
1 parent 653e86e commit 9903d09
Showing 1 changed file with 117 additions and 52 deletions.
169 changes: 117 additions & 52 deletions crisprme_auto_test_docker.sh
Original file line number Diff line number Diff line change
@@ -1,53 +1,118 @@
#crisprme download and test
echo "starting download and unzip of data"

echo "unzip gencode+encode annotations"
#unzip annotations
cd Annotations/
tar -xzf encode+gencode.hg38.tar.gz
rm encode+gencode.hg38.tar.gz
tar -xzf gencode.protein_coding.tar.gz
rm gencode.protein_coding.tar.gz
cd ../

#unzip gencode
# echo "unzip gencode protein-coding proximity file"
# cd Gencode/
# tar -xzf gencode.protein_coding.tar.gz
# rm gencode.protein_coding.tar.gz
# cd ../

echo "start download VCF data and genome (this may take a long time due to connection speed)"
#download VCFs data
cd VCFs/
#download 1000G
cd hg38_1000G/
echo "download 1000G VCFs"
for i in {1..22}; do
wget -c -q ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr$i.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
# ------------------------------------------------------------------------------
# TEST CRISPRme conda package

echo "Download and extract fundamental data..."

# download hg38 genome assembly
GENOMEDIR="Genomes"
HG38="hg38"
echo "Downloading hg38 genome assembly..."
mkdir -p $GENOMEDIR # create Genomes folder
cd $GENOMEDIR
# download chromosomes FASTA files
original_md5sum="a5aa5da14ccf3d259c4308f7b2c18cb0" # see https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
while true; do # retry download if caught timeout
wget -T 15 -c https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz && break
done
chromsfasta="hg38.chromFa.tar.gz"
local_md5sum="$(md5sum $chromsfasta | cut -d ' ' -f 1)"
if [ "$original_md5sum" != "$local_md5sum" ]; then
echo "ERROR: unexpected failure while downloading https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz"
exit 1
fi
echo "Extracting ${chromsfasta}..."
tar -xzf $chromsfasta
mv chroms $HG38 # create hg38 directory
cd ..

# download 1000G VCFs
echo "Downloading 1000 Genomes VCFs..."
VCF="VCFs"
mkdir -p $VCF # create VCF folder
cd $VCF
VCF1000G="hg38_1000G"
mkdir -p $VCF1000G # create 1000 genomes folder
cd $VCF1000G
# download 1000 genomes VCF files
for i in $(seq 1 22; echo "X");
do
original_md5sum="$(curl -sL ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | md5sum | cut -d ' ' -f 1)" # compute original md5sum
while true; do # retry download if caught timeout
wget -T 15 -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz && break
done
local_md5sum="$(md5sum ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | cut -d ' ' -f 1)"
if [ "$original_md5sum" != "$local_md5sum" ]; then # check download consistency
echo "ERROR: unexpected failure while downloading ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz"
exit 1
fi
done
wget -c -q ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
cd ../../

#download HGDP
#uncomment these lines if you want to download also HGDP VCFs
# cd ../hg38_HGDP
# echo "download HGDP VCFs"
# for i in {1..22}
# do
# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chr$i.vcf.gz
# done
# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chrX.vcf.gz
# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chrY.vcf.gz
# cd ../../

#download hg38
cd Genomes/
echo "download hg38"
wget -c -q https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz
tar -xzf hg38.chromFa.tar.gz
mv chroms hg38
cd ../

echo "start testing"
docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme crisprme.py complete-search --genome Genomes/hg38/ --vcf list_vcf.txt --guide sg1617.txt --pam PAMs/20bp-NGG-spCas9.txt --annotation Annotations/encode+gencode.hg38.bed --samplesID list_samplesID.txt --gene_annotation Annotations/gencode.protein_coding.bed --bMax 2 --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 4
cd ../..

# initialize VCF config file
VCFCONFIG="vcf_config.1000G.txt"
printf "${VCF1000G}\n" > $VCFCONFIG

# download 1000G samplesIDs
SAMPLESIDS="samplesIDs"
mkdir -p $SAMPLESIDS # create sample ids dir
cd $SAMPLESIDS
# download 1000G samples IDs
echo "Downloading samples ids for 1000G dataset"
SAMPLES1000G="hg38_1000G.samplesID.txt"
wget https://raw.githubusercontent.com/pinellolab/CRISPRme/refs/heads/gnomad-4.1-converter/download_data/${SAMPLES1000G}
cd ..

# initialize samples config file
SAMPLESCONFIG="samplesIDs.1000G.txt"
printf "${SAMPLES1000G}\n" > $SAMPLESCONFIG

# download annotation data
ANNOTATIONDIR="Annotations"
mkdir -p $ANNOTATIONDIR # create annotation folder
cd $ANNOTATIONDIR
echo "Downloading ENCODE+GENCODE annotation data..."
original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz | md5sum | cut -d ' ' -f 1)"
encodegencode="dhs+encode+gencode.hg38.bed.zip"
while true; do # retry download if caught timeout
wget -T 15 -c -O $encodegencode https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz && break
done
local_md5sum="$(md5sum $encodegencode | cut -d ' ' -f 1)"
if [ "$original_md5sum" != "$local_md5sum" ]; then
echo "ERROR: unexpected failure while downloading ${encodegencode}"
exit 1
fi
echo "Extracting ${encodegencode}..."
tar -xvf $encodegencode

echo "Downloading GENCODE encoding sequences..."
original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz | md5sum | cut -d ' ' -f 1)"
gencode="gencode.protein_coding.bed.zip"
while true; do # retry download if caught timeout
wget -T 15 -c -O $gencode https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/gencode.protein_coding.bed.tar.gz && break
done
local_md5sum="$(md5sum $gencode | cut -d ' ' -f 1)"
if [ "$original_md5sum" != "$local_md5sum" ]; then
echo "ERROR: unexpected failure while downloading ${gencode}"
exit 1
fi
echo "Extracting ${gencode}..."
tar -xvf $gencode
cd ..

# create Dictionaries folder
mkdir -p "Dictionaries"

# create sg1617 guide file
GUIDEFILE="sg1617.txt"
printf "CTAACAGTTGCTTTTATCACNNN\n" > $GUIDEFILE

# create NGG PAM file
PAM="PAMs"
mkdir -p $PAM
cd $PAM
NGGPAM="20bp-NGG-spCas9.txt"
printf "NNNNNNNNNNNNNNNNNNNNNGG 3\n" > $NGGPAM
cd ..

echo "Start CRISPRme test..."
docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme crisprme.py complete-search --genome Genomes/hg38/ --vcf $VCFCONFIG --guide $GUIDEFILE --pam PAMs/$NGGPAM --annotation Annotations/$encodegencode --samplesID $SAMPLESCONFIG --gene_annotation Annotations/$gencode --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 4

0 comments on commit 9903d09

Please sign in to comment.