-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
117 additions
and
52 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,53 +1,118 @@ | ||
#crisprme download and test | ||
echo "starting download and unzip of data" | ||
|
||
echo "unzip gencode+encode annotations" | ||
#unzip annotations | ||
cd Annotations/ | ||
tar -xzf encode+gencode.hg38.tar.gz | ||
rm encode+gencode.hg38.tar.gz | ||
tar -xzf gencode.protein_coding.tar.gz | ||
rm gencode.protein_coding.tar.gz | ||
cd ../ | ||
|
||
#unzip gencode | ||
# echo "unzip gencode protein-coding proximity file" | ||
# cd Gencode/ | ||
# tar -xzf gencode.protein_coding.tar.gz | ||
# rm gencode.protein_coding.tar.gz | ||
# cd ../ | ||
|
||
echo "start download VCF data and genome (this may take a long time due to connection speed)" | ||
#download VCFs data | ||
cd VCFs/ | ||
#download 1000G | ||
cd hg38_1000G/ | ||
echo "download 1000G VCFs" | ||
for i in {1..22}; do | ||
wget -c -q ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr$i.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | ||
# ------------------------------------------------------------------------------ | ||
# TEST CRISPRme conda package | ||
|
||
echo "Download and extract fundamental data..." | ||
|
||
# download hg38 genome assembly | ||
GENOMEDIR="Genomes" | ||
HG38="hg38" | ||
echo "Downloading hg38 genome assembly..." | ||
mkdir -p $GENOMEDIR # create Genomes folder | ||
cd $GENOMEDIR | ||
# download chromosomes FASTA files | ||
original_md5sum="a5aa5da14ccf3d259c4308f7b2c18cb0" # see https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/md5sum.txt | ||
while true; do # retry download if caught timeout | ||
wget -T 15 -c https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz && break | ||
done | ||
chromsfasta="hg38.chromFa.tar.gz" | ||
local_md5sum="$(md5sum $chromsfasta | cut -d ' ' -f 1)" | ||
if [ "$original_md5sum" != "$local_md5sum" ]; then | ||
echo "ERROR: unexpected failure while downloading https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz" | ||
exit 1 | ||
fi | ||
echo "Extracting ${chromsfasta}..." | ||
tar -xzf $chromsfasta | ||
mv chroms $HG38 # create hg38 directory | ||
cd .. | ||
|
||
# download 1000G VCFs | ||
echo "Downloading 1000 Genomes VCFs..." | ||
VCF="VCFs" | ||
mkdir -p $VCF # create VCF folder | ||
cd $VCF | ||
VCF1000G="hg38_1000G" | ||
mkdir -p $VCF1000G # create 1000 genomes folder | ||
cd $VCF1000G | ||
# download 1000 genomes VCF files | ||
for i in $(seq 1 22; echo "X"); | ||
do | ||
original_md5sum="$(curl -sL ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | md5sum | cut -d ' ' -f 1)" # compute original md5sum | ||
while true; do # retry download if caught timeout | ||
wget -T 15 -c ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz && break | ||
done | ||
local_md5sum="$(md5sum ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | cut -d ' ' -f 1)" | ||
if [ "$original_md5sum" != "$local_md5sum" ]; then # check download consistency | ||
echo "ERROR: unexpected failure while downloading ALL.chr${i}.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz" | ||
exit 1 | ||
fi | ||
done | ||
wget -c -q ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chrX.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz | ||
cd ../../ | ||
|
||
#download HGDP | ||
#uncomment these lines if you want to download also HGDP VCFs | ||
# cd ../hg38_HGDP | ||
# echo "download HGDP VCFs" | ||
# for i in {1..22} | ||
# do | ||
# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chr$i.vcf.gz | ||
# done | ||
# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chrX.vcf.gz | ||
# wget -c -q ftp://ngs.sanger.ac.uk:21/production/hgdp/hgdp_wgs.20190516/hgdp_wgs.20190516.full.chrY.vcf.gz | ||
# cd ../../ | ||
|
||
#download hg38 | ||
cd Genomes/ | ||
echo "download hg38" | ||
wget -c -q https://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chromFa.tar.gz | ||
tar -xzf hg38.chromFa.tar.gz | ||
mv chroms hg38 | ||
cd ../ | ||
|
||
echo "start testing" | ||
docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme crisprme.py complete-search --genome Genomes/hg38/ --vcf list_vcf.txt --guide sg1617.txt --pam PAMs/20bp-NGG-spCas9.txt --annotation Annotations/encode+gencode.hg38.bed --samplesID list_samplesID.txt --gene_annotation Annotations/gencode.protein_coding.bed --bMax 2 --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 4 | ||
cd ../.. | ||
|
||
# initialize VCF config file | ||
VCFCONFIG="vcf_config.1000G.txt" | ||
printf "${VCF1000G}\n" > $VCFCONFIG | ||
|
||
# download 1000G samplesIDs | ||
SAMPLESIDS="samplesIDs" | ||
mkdir -p $SAMPLESIDS # create sample ids dir | ||
cd $SAMPLESIDS | ||
# download 1000G samples IDs | ||
echo "Downloading samples ids for 1000G dataset" | ||
SAMPLES1000G="hg38_1000G.samplesID.txt" | ||
wget https://raw.githubusercontent.com/pinellolab/CRISPRme/refs/heads/gnomad-4.1-converter/download_data/${SAMPLES1000G} | ||
cd .. | ||
|
||
# initialize samples config file | ||
SAMPLESCONFIG="samplesIDs.1000G.txt" | ||
printf "${SAMPLES1000G}\n" > $SAMPLESCONFIG | ||
|
||
# download annotation data | ||
ANNOTATIONDIR="Annotations" | ||
mkdir -p $ANNOTATIONDIR # create annotation folder | ||
cd $ANNOTATIONDIR | ||
echo "Downloading ENCODE+GENCODE annotation data..." | ||
original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz | md5sum | cut -d ' ' -f 1)" | ||
encodegencode="dhs+encode+gencode.hg38.bed.zip" | ||
while true; do # retry download if caught timeout | ||
wget -T 15 -c -O $encodegencode https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz && break | ||
done | ||
local_md5sum="$(md5sum $encodegencode | cut -d ' ' -f 1)" | ||
if [ "$original_md5sum" != "$local_md5sum" ]; then | ||
echo "ERROR: unexpected failure while downloading ${encodegencode}" | ||
exit 1 | ||
fi | ||
echo "Extracting ${encodegencode}..." | ||
tar -xvf $encodegencode | ||
|
||
echo "Downloading GENCODE encoding sequences..." | ||
original_md5sum="$(curl -sL https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/dhs+encode+gencode.hg38.bed.tar.gz | md5sum | cut -d ' ' -f 1)" | ||
gencode="gencode.protein_coding.bed.zip" | ||
while true; do # retry download if caught timeout | ||
wget -T 15 -c -O $gencode https://raw.githubusercontent.com/pinellolab/CRISPRme/gnomad-4.1-converter/download_data/gencode.protein_coding.bed.tar.gz && break | ||
done | ||
local_md5sum="$(md5sum $gencode | cut -d ' ' -f 1)" | ||
if [ "$original_md5sum" != "$local_md5sum" ]; then | ||
echo "ERROR: unexpected failure while downloading ${gencode}" | ||
exit 1 | ||
fi | ||
echo "Extracting ${gencode}..." | ||
tar -xvf $gencode | ||
cd .. | ||
|
||
# create Dictionaries folder | ||
mkdir -p "Dictionaries" | ||
|
||
# create sg1617 guide file | ||
GUIDEFILE="sg1617.txt" | ||
printf "CTAACAGTTGCTTTTATCACNNN\n" > $GUIDEFILE | ||
|
||
# create NGG PAM file | ||
PAM="PAMs" | ||
mkdir -p $PAM | ||
cd $PAM | ||
NGGPAM="20bp-NGG-spCas9.txt" | ||
printf "NNNNNNNNNNNNNNNNNNNNNGG 3\n" > $NGGPAM | ||
cd .. | ||
|
||
echo "Start CRISPRme test..." | ||
docker run -v ${PWD}:/DATA -w /DATA -i pinellolab/crisprme crisprme.py complete-search --genome Genomes/hg38/ --vcf $VCFCONFIG --guide $GUIDEFILE --pam PAMs/$NGGPAM --annotation Annotations/$encodegencode --samplesID $SAMPLESCONFIG --gene_annotation Annotations/$gencode --mm 6 --bDNA 2 --bRNA 2 --merge 3 --output sg1617.6.2.2 --thread 4 |