From 9903d09ec4dcdf8b6eb380e5ce54259253709f1b Mon Sep 17 00:00:00 2001
From: ManuelTgn <51021763+ManuelTgn@users.noreply.github.com>
Date: Wed, 16 Oct 2024 17:00:58 +0200
Subject: [PATCH] Update crisprme_auto_test_docker.sh

---
 crisprme_auto_test_docker.sh | 169 ++++++++++++++++++++++++-----------
 1 file changed, 117 insertions(+), 52 deletions(-)

diff --git a/crisprme_auto_test_docker.sh b/crisprme_auto_test_docker.sh
index 4b6abd0..bff2490 100755
--- a/crisprme_auto_test_docker.sh
+++ b/crisprme_auto_test_docker.sh
@@ -1,53 +1,118 @@
-#crisprme download and test
-echo "starting download and unzip of data"
-
-echo "unzip gencode+encode annotations"
-#unzip annotations
-cd Annotations/
-tar -xzf encode+gencode.hg38.tar.gz
-rm encode+gencode.hg38.tar.gz
-tar -xzf gencode.protein_coding.tar.gz
-rm gencode.protein_coding.tar.gz
-cd ../
-
-#unzip gencode
-# echo "unzip gencode protein-coding proximity file"
-# cd Gencode/
-# tar -xzf gencode.protein_coding.tar.gz
-# rm gencode.protein_coding.tar.gz
-# cd ../
-
-echo "start download VCF data and genome (this may take a long time due to connection speed)"
-#download VCFs data
-cd VCFs/
-#download 1000G
-cd hg38_1000G/
-echo "download 1000G VCFs"
-for i in {1..22}; do
-   wget -c -q ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr$i.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+# ------------------------------------------------------------------------------
+# TEST CRISPRme conda package
+
+echo "Download and extract fundamental data..."
+
+# download hg38 genome assembly
+GENOMEDIR="Genomes"
+HG38="hg38"
+echo "Downloading hg38 genome assembly..."
+mkdir -p $GENOMEDIR  # create Genomes folder
+cd $GENOMEDIR
+# download chromosomes FASTA files
+original_md5sum="a5aa5da14ccf3d259c4308f7b2c18cb0"  # see https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt
+while true; do  # retry download if caught timeout
+   wget -T 15 -c https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz && break
+done
+chromsfasta="hg38.chromFa.tar.gz"
+local_md5sum="$(md5sum $chromsfasta | cut -d ' ' -f 1)"
+if [ "$original_md5sum" != "$local_md5sum" ]; then
+   echo "ERROR: unexpected failure while downloading https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz"
+   exit 1
+fi
+echo "Extracting ${chromsfasta}..."
+tar -xzf $chromsfasta
+mv chroms $HG38  # create hg38 directory
+cd ..
+
+# download 1000G VCFs
+echo "Downloading 1000 Genomes VCFs..."
+VCF="VCFs"
+mkdir -p $VCF  # create VCF folder
+cd $VCF
+VCF1000G="hg38_1000G"
+mkdir -p $VCF1000G  # create 1000 genomes folder
+cd $VCF1000G
+# download 1000 genomes VCF files
+for i in $(seq 1 22; echo "X");
+do 
+   original_md5sum="$(curl -sL ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | md5sum | cut -d ' ' -f 1)"  # compute original md5sum
+   while true; do  # retry download if caught timeout
+      wget -T 15 -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz && break
+   done
+   local_md5sum="$(md5sum ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | cut -d ' ' -f 1)"
+   if [ "$original_md5sum" != "$local_md5sum" ]; then  # check download consistency
+      echo "ERROR: unexpected failure while downloading ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz"
+      exit 1
+   fi
 done
-wget -c -q ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
-cd ../../
-
-#download HGDP
-#uncomment these lines if you want to download also HGDP VCFs
-# cd ../hg38_HGDP
-# echo "download HGDP VCFs"
-# for i in {1..22}
-# do
-#    wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chr$i.vcf.gz
-# done
-# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chrX.vcf.gz
-# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chrY.vcf.gz
-# cd ../../
-
-#download hg38
-cd Genomes/
-echo "download hg38"
-wget -c -q https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz
-tar -xzf hg38.chromFa.tar.gz
-mv chroms hg38
-cd ../
-
-echo "start testing"
-docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme crisprme.py complete-search --genome Genomes/hg38/ --vcf list_vcf.txt --guide sg1617.txt --pam PAMs/20bp-NGG-spCas9.txt --annotation Annotations/encode+gencode.hg38.bed --samplesID list_samplesID.txt --gene_annotation Annotations/gencode.protein_coding.bed --bMax 2 --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 4
+cd ../..
+
+# initialize VCF config file
+VCFCONFIG="vcf_config.1000G.txt"
+printf "${VCF1000G}\n" > $VCFCONFIG
+
+# download 1000G samplesIDs  
+SAMPLESIDS="samplesIDs"
+mkdir -p $SAMPLESIDS  # create sample ids dir
+cd $SAMPLESIDS
+# download 1000G samples IDs
+echo "Downloading samples ids for 1000G dataset"
+SAMPLES1000G="hg38_1000G.samplesID.txt"
+wget https://raw.githubusercontent.com/pinellolab/CRISPRme/refs/heads/gnomad-4.1-converter/download_data/${SAMPLES1000G}
+cd ..
+
+# initialize samples config file
+SAMPLESCONFIG="samplesIDs.1000G.txt"
+printf "${SAMPLES1000G}\n" > $SAMPLESCONFIG
+
+# download annotation data
+ANNOTATIONDIR="Annotations"
+mkdir -p $ANNOTATIONDIR  # create annotation folder
+cd $ANNOTATIONDIR
+echo "Downloading ENCODE+GENCODE annotation data..."
+original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz | md5sum | cut -d ' ' -f 1)"
+encodegencode="dhs+encode+gencode.hg38.bed.zip"
+while true; do  # retry download if caught timeout
+   wget -T 15 -c -O $encodegencode https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz && break
+done
+local_md5sum="$(md5sum $encodegencode | cut -d ' ' -f 1)"
+if [ "$original_md5sum" != "$local_md5sum" ]; then
+   echo "ERROR: unexpected failure while downloading ${encodegencode}"
+   exit 1
+fi
+echo "Extracting ${encodegencode}..."
+tar -xvf $encodegencode
+
+echo "Downloading GENCODE encoding sequences..."
+original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz | md5sum | cut -d ' ' -f 1)"
+gencode="gencode.protein_coding.bed.zip"
+while true; do  # retry download if caught timeout
+   wget -T 15 -c -O $gencode https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/gencode.protein_coding.bed.tar.gz && break
+done
+local_md5sum="$(md5sum $gencode | cut -d ' ' -f 1)"
+if [ "$original_md5sum" != "$local_md5sum" ]; then
+   echo "ERROR: unexpected failure while downloading ${gencode}"
+   exit 1
+fi
+echo "Extracting ${gencode}..."
+tar -xvf $gencode
+cd ..
+
+# create Dictionaries folder
+mkdir -p "Dictionaries"
+
+# create sg1617 guide file
+GUIDEFILE="sg1617.txt"
+printf "CTAACAGTTGCTTTTATCACNNN\n" > $GUIDEFILE
+
+# create NGG PAM file
+PAM="PAMs"
+mkdir -p $PAM
+cd $PAM
+NGGPAM="20bp-NGG-spCas9.txt"
+printf "NNNNNNNNNNNNNNNNNNNNNGG 3\n" > $NGGPAM
+cd ..
+
+echo "Start CRISPRme test..."
+docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme crisprme.py complete-search --genome Genomes/hg38/ --vcf $VCFCONFIG --guide $GUIDEFILE --pam PAMs/$NGGPAM --annotation Annotations/$encodegencode --samplesID $SAMPLESCONFIG --gene_annotation Annotations/$gencode --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 4