#!/bin/bash --login
########## SBATCH Lines for Resource Request ##########
#SBATCH --time=02:00:00 # limit of wall clock time - how long the job will run (same as -t)
#SBATCH --nodes=1-5 # number of different nodes - could be an exact number or a range of nodes (same as -N)
#SBATCH --ntasks=1                  # number of tasks that you require (same as -n)
#SBATCH --cpus-per-task=2 # number of CPUs (or cores) per task (same as -c)
#SBATCH --mem-per-cpu=2G            # memory required per allocated CPU (or core) - here 2 GB per CPU
#SBATCH --job-name GetRawSNPs # you can give your job a name for easier identification (same as -J)
########## Command Lines to Run ##########
module purge
#module load GCC/6.4.0-2.28 OpenMPI ### load necessary modules, e.g.
cd "$SLURM_SUBMIT_DIR" || exit
mkdir -p RawData
cd RawData || exit ### change to the directory where the downloaded data will be stored
#### NOTES ####
# Everything in the final analysis uses GRCh37/hg19
###############
################################
# In GRCh37/hg19
## ATACseq peaks: for atacseq.sb
#wget https://bendlj01.u.hpc.mssm.edu/multireg/resources/boca_peaks.zip
#unzip boca_peaks.zip
## Human gene list with exons, as gff3: for introns.sb and RNAs.sb
wget ftp://ftp.ensembl.org/pub/grch37/current/gff3/homo_sapiens/Homo_sapiens.GRCh37.87.gff3.gz
## Roadmap Chromatin states Core 15-state model (5 marks, 127 epigenomes) E067: for roadmap.sb
wget https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/E067_15_coreMarks_mnemonics.bed.gz
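## Optional quick look at the chromatin-state file format (a sketch only; the
## actual parsing is assumed to happen in roadmap.sb):
#zcat E067_15_coreMarks_mnemonics.bed.gz | head -3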
## PsychENCODE enhancer list and H3K27ac peaks: for psychencode.sb
#wget http://resource.psychencode.org/Datasets/Derived/DER-03a_hg19_PEC_enhancers.bed
## Some annotations from the original paper. https://www.nature.com/articles/s41588-018-0081-4
wget https://data.broadinstitute.org/alkesgroup/LDSCORE/1000G_Phase3_baseline_v1.1_ldscores.tgz
tar -xvzf 1000G_Phase3_baseline_v1.1_ldscores.tgz
################################
# In Hg38, for the conservation scores
## Conservation scores for alignments of 30 mammalian (27 primate) genomes with human
wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/phastCons30way/hg38.phastCons30way.bw
## Basewise conservation scores (phyloP) of 30 mammalian (27 primate) genomes with human
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP30way/hg38.phyloP30way.bw
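## Sketch of how these bigWig files could later be summarized over regions,
## assuming UCSC's bigWigAverageOverBed utility is downloaded separately (it is
## not fetched by this script) and 'regions_hg38.bed' is a placeholder name:
#./bigWigAverageOverBed hg38.phastCons30way.bw regions_hg38.bed phastCons30way_per_region.tab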
## Get the MySQL table files for the conserved elements (and the phastCons30way summary table), human + 29 other genomes
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastConsElements30way.MYD
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastConsElements30way.MYI
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastConsElements30way.frm
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastCons30way.MYD
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastCons30way.MYI
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastCons30way.frm
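## Alternative sketch: instead of the raw MyISAM files above, the conserved-elements
## table can be pulled as BED-like text from UCSC's public MySQL server (assumes
## outbound MySQL access and a mysql client on the node; not used by default):
#mysql --user=genome --host=genome-mysql.soe.ucsc.edu -A -N -D hg38 \
#  -e 'SELECT chrom, chromStart, chromEnd, name, score FROM phastConsElements30way' \
#  > phastConsElements30way.bed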
################################
# Pre-processing
# Recommended SNP list from https://www.nature.com/articles/s41588-018-0081-4
wget https://data.broadinstitute.org/alkesgroup/LDSCORE/w_hm3.snplist.bz2
bzip2 -d w_hm3.snplist.bz2
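## Quick format check (the hm3 list is expected to carry SNP/A1/A2 columns and no
## positions, which is why a BED version is built further down):
#head -3 w_hm3.snplist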
## Build a SNP list file to pass to the LD score calculation for chromosomes 6 and 14.
## The paper suggests using the list above; however, it is about one tenth the size of
## the SNP list used in the subset files, and there are SNPs in the hm3 list that aren't
## in the subsets, which seems odd. It may be that the hm3 list is filtered by
## MAF or similar and is a faster list to process, but for now, I'm going to
## run both and see if it makes a difference.
## This takes all of the SNPs in the subsets *except* those in the MHC and GPHN
## region, and puts them in a single list. Even if we end up using the hm3 SNP
## list, we'll need this as a whitelist for the --print-snps flag when
## calculating LD scores.
awk '{if($4 > 66705000 && $4 < 67900000) print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' ../s3/subset.14.bim > GPHNsnps.txt
awk '{if($4 > 28477000 && $4 < 33448000) print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' ../s3/subset.6.bim > MHCsnps.txt
## diff with empty new/unchanged line formats keeps only the subset lines that are NOT in the region file
diff ../s3/subset.6.bim MHCsnps.txt --suppress-common-lines --new-line-format= --unchanged-line-format= > chr6noMHC.txt
diff ../s3/subset.14.bim GPHNsnps.txt --suppress-common-lines --new-line-format= --unchanged-line-format= > chr14noGPHN.txt
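## Sanity-check sketch: the excluded-region SNPs plus the kept SNPs should add up
## to the full subset for each chromosome (counts only; not enforced here):
#echo "chr6:  $(wc -l < MHCsnps.txt) + $(wc -l < chr6noMHC.txt) vs $(wc -l < ../s3/subset.6.bim)"
#echo "chr14: $(wc -l < GPHNsnps.txt) + $(wc -l < chr14noGPHN.txt) vs $(wc -l < ../s3/subset.14.bim)"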
rm -f NoMHC_GPHN_SNP.txt
touch NoMHC_GPHN_SNP.txt
## Chromosomes 1-5 go in unchanged
for i in $(seq 1 5)
do cat ../s3/subset.${i}.bim
done >> NoMHC_GPHN_SNP.txt
## Chromosome 6 with the MHC region removed
cat chr6noMHC.txt >> NoMHC_GPHN_SNP.txt
## Chromosomes 7-13 unchanged
for i in $(seq 7 13)
do cat ../s3/subset.${i}.bim
done >> NoMHC_GPHN_SNP.txt
## Chromosome 14 with the GPHN region removed
cat chr14noGPHN.txt >> NoMHC_GPHN_SNP.txt
## Chromosomes 15-22 unchanged
for i in $(seq 15 22)
do cat ../s3/subset.${i}.bim
done >> NoMHC_GPHN_SNP.txt
## This makes a bedfile version, which is helpful for checking other code
awk '{ print "chr" $1 "\t" $4 "\t" $4+1 "\t" $2}' NoMHC_GPHN_SNP.txt > NoMHC_GPHN_SNP.bed
## This builds a list of just the SNP names, which is what the --print-snps flag expects
cut -f 2 NoMHC_GPHN_SNP.txt > NoMHC_GPHN_SNPlist.txt
## The hm3 SNP list doesn't include locations, so to compare the output from using the
## hm3 SNPs vs all the SNPs in the subsets, it needs to be turned into a proper bed file.
## This builds a SNP bed file from the recommended list (w_hm3.snplist) and the subset location files.
for i in $(seq 1 22)
do cat ../s3/subset.${i}.bim
done > allchr.bim
cut -f 1 w_hm3.snplist > names_w_hm3.snplist
grep -Fwf names_w_hm3.snplist allchr.bim | awk '{ print "chr" $1 "\t" $4 "\t" $4+1 "\t" $2}' > w_hm3.bed
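## Rough size comparison between the two candidate SNP lists discussed above
## (a sketch; purely informational):
#wc -l w_hm3.bed NoMHC_GPHN_SNP.bed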
## The conservation data we need is only available for hg38. Download the liftOver
## program and the chain file needed to lift hg19/GRCh37 coordinates over to hg38.
wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
chmod +x liftOver
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
./liftOver NoMHC_GPHN_SNP.bed hg19ToHg38.over.chain.gz NoMHC_GPHN_SNP_Hg38.bed unMapped
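## Post-liftover check (sketch): liftOver writes failed records to 'unMapped' with a
## '#' comment line before each one, so non-comment lines count the SNPs that did not lift:
#wc -l NoMHC_GPHN_SNP_Hg38.bed
#grep -vc '^#' unMapped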
## Make some files that might get re-used
zgrep 'exon' Homo_sapiens.GRCh37.87.gff3.gz > allexons.gff
cd .. || exit
scontrol show job $SLURM_JOB_ID ### write job information to the output file