From 9903d09ec4dcdf8b6eb380e5ce54259253709f1b Mon Sep 17 00:00:00 2001 From: ManuelTgn <51021763+ManuelTgn@users.noreply.github.com> Date: Wed, 16 Oct 2024 17:00:58 +0200 Subject: [PATCH] Update crisprme_auto_test_docker.sh --- crisprme_auto_test_docker.sh | 169 ++++++++++++++++++++++++----------- 1 file changed, 117 insertions(+), 52 deletions(-) diff --git a/crisprme_auto_test_docker.sh b/crisprme_auto_test_docker.sh index 4b6abd0..bff2490 100755 --- a/crisprme_auto_test_docker.sh +++ b/crisprme_auto_test_docker.sh @@ -1,53 +1,118 @@ -#crisprme download and test -echo "starting download and unzip of data" - -echo "unzip gencode+encode annotations" -#unzip annotations -cd Annotations/ -tar -xzf encode+gencode.hg38.tar.gz -rm encode+gencode.hg38.tar.gz -tar -xzf gencode.protein_coding.tar.gz -rm gencode.protein_coding.tar.gz -cd ../ - -#unzip gencode -# echo "unzip gencode protein-coding proximity file" -# cd Gencode/ -# tar -xzf gencode.protein_coding.tar.gz -# rm gencode.protein_coding.tar.gz -# cd ../ - -echo "start download VCF data and genome (this may take a long time due to connection speed)" -#download VCFs data -cd VCFs/ -#download 1000G -cd hg38_1000G/ -echo "download 1000G VCFs" -for i in {1..22}; do - wget -c -q ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr$i.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz +# ------------------------------------------------------------------------------ +# TEST CRISPRme conda package + +echo "Download and extract fundamental data..." + +# download hg38 genome assembly +GENOMEDIR="Genomes" +HG38="hg38" +echo "Downloading hg38 genome assembly..." +mkdir -p $GENOMEDIR # create Genomes folder +cd $GENOMEDIR +# download chromosomes FASTA files +original_md5sum="a5aa5da14ccf3d259c4308f7b2c18cb0" # see https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt +while true; do # retry download if caught timeout + wget -T 15 -c https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz && break +done +chromsfasta="hg38.chromFa.tar.gz" +local_md5sum="$(md5sum $chromsfasta | cut -d ' ' -f 1)" +if [ "$original_md5sum" != "$local_md5sum" ]; then + echo "ERROR: unexpected failure while downloading https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz" + exit 1 +fi +echo "Extracting ${chromsfasta}..." +tar -xzf $chromsfasta +mv chroms $HG38 # create hg38 directory +cd .. + +# download 1000G VCFs +echo "Downloading 1000 Genomes VCFs..." +VCF="VCFs" +mkdir -p $VCF # create VCF folder +cd $VCF +VCF1000G="hg38_1000G" +mkdir -p $VCF1000G # create 1000 genomes folder +cd $VCF1000G +# download 1000 genomes VCF files +for i in $(seq 1 22; echo "X"); +do + original_md5sum="$(curl -sL ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | md5sum | cut -d ' ' -f 1)" # compute original md5sum + while true; do # retry download if caught timeout + wget -T 15 -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz && break + done + local_md5sum="$(md5sum ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | cut -d ' ' -f 1)" + if [ "$original_md5sum" != "$local_md5sum" ]; then # check download consistency + echo "ERROR: unexpected failure while downloading ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz" + exit 1 + fi done -wget -c -q ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz -cd ../../ - -#download HGDP -#uncomment these lines if you want to download also HGDP VCFs -# cd ../hg38_HGDP -# echo "download HGDP VCFs" -# for i in {1..22} -# do -# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chr$i.vcf.gz -# done -# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chrX.vcf.gz -# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chrY.vcf.gz -# cd ../../ - -#download hg38 -cd Genomes/ -echo "download hg38" -wget -c -q https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz -tar -xzf hg38.chromFa.tar.gz -mv chroms hg38 -cd ../ - -echo "start testing" -docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme crisprme.py complete-search --genome Genomes/hg38/ --vcf list_vcf.txt --guide sg1617.txt --pam PAMs/20bp-NGG-spCas9.txt --annotation Annotations/encode+gencode.hg38.bed --samplesID list_samplesID.txt --gene_annotation Annotations/gencode.protein_coding.bed --bMax 2 --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 4 +cd ../.. + +# initialize VCF config file +VCFCONFIG="vcf_config.1000G.txt" +printf "${VCF1000G}\n" > $VCFCONFIG + +# download 1000G samplesIDs +SAMPLESIDS="samplesIDs" +mkdir -p $SAMPLESIDS # create sample ids dir +cd $SAMPLESIDS +# download 1000G samples IDs +echo "Downloading samples ids for 1000G dataset" +SAMPLES1000G="hg38_1000G.samplesID.txt" +wget https://raw.githubusercontent.com/pinellolab/CRISPRme/refs/heads/gnomad-4.1-converter/download_data/${SAMPLES1000G} +cd .. + +# initialize samples config file +SAMPLESCONFIG="samplesIDs.1000G.txt" +printf "${SAMPLES1000G}\n" > $SAMPLESCONFIG + +# download annotation data +ANNOTATIONDIR="Annotations" +mkdir -p $ANNOTATIONDIR # create annotation folder +cd $ANNOTATIONDIR +echo "Downloading ENCODE+GENCODE annotation data..." +original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz | md5sum | cut -d ' ' -f 1)" +encodegencode="dhs+encode+gencode.hg38.bed.zip" +while true; do # retry download if caught timeout + wget -T 15 -c -O $encodegencode https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz && break +done +local_md5sum="$(md5sum $encodegencode | cut -d ' ' -f 1)" +if [ "$original_md5sum" != "$local_md5sum" ]; then + echo "ERROR: unexpected failure while downloading ${encodegencode}" + exit 1 +fi +echo "Extracting ${encodegencode}..." +tar -xvf $encodegencode + +echo "Downloading GENCODE encoding sequences..." +original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz | md5sum | cut -d ' ' -f 1)" +gencode="gencode.protein_coding.bed.zip" +while true; do # retry download if caught timeout + wget -T 15 -c -O $gencode https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/gencode.protein_coding.bed.tar.gz && break +done +local_md5sum="$(md5sum $gencode | cut -d ' ' -f 1)" +if [ "$original_md5sum" != "$local_md5sum" ]; then + echo "ERROR: unexpected failure while downloading ${gencode}" + exit 1 +fi +echo "Extracting ${gencode}..." +tar -xvf $gencode +cd .. + +# create Dictionaries folder +mkdir -p "Dictionaries" + +# create sg1617 guide file +GUIDEFILE="sg1617.txt" +printf "CTAACAGTTGCTTTTATCACNNN\n" > $GUIDEFILE + +# create NGG PAM file +PAM="PAMs" +mkdir -p $PAM +cd $PAM +NGGPAM="20bp-NGG-spCas9.txt" +printf "NNNNNNNNNNNNNNNNNNNNNGG 3\n" > $NGGPAM +cd .. + +echo "Start CRISPRme test..." +docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme crisprme.py complete-search --genome Genomes/hg38/ --vcf $VCFCONFIG --guide $GUIDEFILE --pam PAMs/$NGGPAM --annotation Annotations/$encodegencode --samplesID $SAMPLESCONFIG --gene_annotation Annotations/$gencode --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 4