#!/bin/bash --login
########## SBATCH Lines for Resource Request ##########
#SBATCH --time=02:00:00 # limit of wall clock time - how long the job will run (same as -t)
#SBATCH --nodes=1-5 # number of different nodes - could be an exact number or a range of nodes (same as -N)
#SBATCH --ntasks=1                  # number of tasks that you require (same as -n)
#SBATCH --cpus-per-task=2 # number of CPUs (or cores) per task (same as -c)
#SBATCH --mem-per-cpu=2G            # memory required per allocated CPU (or core) - here 2 GB per CPU
#SBATCH --job-name GetRawSNPs # you can give your job a name for easier identification (same as -J)
########## Command Lines to Run ##########
module purge
#module load GCC/6.4.0-2.28 OpenMPI ### load necessary modules, e.g.
cd "$SLURM_SUBMIT_DIR" || exit
mkdir -p RawData
cd RawData || exit ### change to the directory where the downloaded data will be stored
#### NOTES ####
# Everything in the final analysis uses GRCh37/hg19
###############
################################
# In GRCh37/hg19
## ATACseq peaks: for atacseq.sb
#wget https://bendlj01.u.hpc.mssm.edu/multireg/resources/boca_peaks.zip
#unzip boca_peaks.zip
## Human gene list with exons, as gff3: for introns.sb and RNAs.sb
wget ftp://ftp.ensembl.org/pub/grch37/current/gff3/homo_sapiens/Homo_sapiens.GRCh37.87.gff3.gz
## Roadmap Chromatin states Core 15-state model (5 marks, 127 epigenomes) E067: for roadmap.sb
wget https://egg2.wustl.edu/roadmap/data/byFileType/chromhmmSegmentations/ChmmModels/coreMarks/jointModel/final/E067_15_coreMarks_mnemonics.bed.gz
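## Optional quick look at the chromatin-state file format (a sketch only; the
## actual parsing is assumed to happen in roadmap.sb):
#zcat E067_15_coreMarks_mnemonics.bed.gz | head -3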
## PsychENCODE enhancer list and H3K27ac peaks: for psychencode.sb
#wget http://resource.psychencode.org/Datasets/Derived/DER-03a_hg19_PEC_enhancers.bed
## Some annotations from the original paper. https://www.nature.com/articles/s41588-018-0081-4
wget https://data.broadinstitute.org/alkesgroup/LDSCORE/1000G_Phase3_baseline_v1.1_ldscores.tgz
tar -xvzf 1000G_Phase3_baseline_v1.1_ldscores.tgz
################################
# In Hg38, for the conservation scores
## Conservation scores for alignments of 30 mammalian (27 primate) genomes with human
wget http://hgdownload.cse.ucsc.edu/goldenPath/hg38/phastCons30way/hg38.phastCons30way.bw
## Basewise conservation scores (phyloP) of 30 mammalian (27 primate) genomes with human
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/phyloP30way/hg38.phyloP30way.bw
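## Sketch of how these bigWig files could later be summarized over regions,
## assuming UCSC's bigWigAverageOverBed utility is downloaded separately (it is
## not fetched by this script) and 'regions_hg38.bed' is a placeholder name:
#./bigWigAverageOverBed hg38.phastCons30way.bw regions_hg38.bed phastCons30way_per_region.tab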
## Get the MySQL table files for the conserved elements (and the phastCons30way summary table), human + 29 other genomes
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastConsElements30way.MYD
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastConsElements30way.MYI
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastConsElements30way.frm
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastCons30way.MYD
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastCons30way.MYI
wget ftp://hgdownload.soe.ucsc.edu/mysql/hg38/phastCons30way.frm
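## Alternative sketch: instead of the raw MyISAM files above, the conserved-elements
## table can be pulled as BED-like text from UCSC's public MySQL server (assumes
## outbound MySQL access and a mysql client on the node; not used by default):
#mysql --user=genome --host=genome-mysql.soe.ucsc.edu -A -N -D hg38 \
#  -e 'SELECT chrom, chromStart, chromEnd, name, score FROM phastConsElements30way' \
#  > phastConsElements30way.bed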
################################
# Pre-processing
# Recommended SNP list from https://www.nature.com/articles/s41588-018-0081-4
wget https://data.broadinstitute.org/alkesgroup/LDSCORE/w_hm3.snplist.bz2
bzip2 -d w_hm3.snplist.bz2
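## Quick format check (the hm3 list is expected to carry SNP/A1/A2 columns and no
## positions, which is why a BED version is built further down):
#head -3 w_hm3.snplist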
## Build a SNP list file to pass to the LD score calculation for chromosomes 6 and 14.
## The paper suggests using the list above; however, it is about one tenth the size of
## the SNP list used in the subset files, and there are SNPs in the hm3 list that aren't
## in the subsets, which seems odd. It may be that the hm3 list is filtered by
## MAF or similar and is a faster list to process, but for now, I'm going to
## run both and see if it makes a difference.
## This takes all of the SNPs in the subsets *except* those in the MHC and GPHN
## region, and puts them in a single list. Even if we end up using the hm3 SNP
## list, we'll need this as a whitelist for the --print-snps flag when
## calculating LD scores.
awk '{if($4 > 66705000 && $4 < 67900000) print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' ../s3/subset.14.bim > GPHNsnps.txt
awk '{if($4 > 28477000 && $4 < 33448000) print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' ../s3/subset.6.bim > MHCsnps.txt
## diff with empty new/unchanged line formats keeps only the subset lines that are NOT in the region file
diff ../s3/subset.6.bim MHCsnps.txt --suppress-common-lines --new-line-format= --unchanged-line-format= > chr6noMHC.txt
diff ../s3/subset.14.bim GPHNsnps.txt --suppress-common-lines --new-line-format= --unchanged-line-format= > chr14noGPHN.txt
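## Sanity-check sketch: the excluded-region SNPs plus the kept SNPs should add up
## to the full subset for each chromosome (counts only; not enforced here):
#echo "chr6:  $(wc -l < MHCsnps.txt) + $(wc -l < chr6noMHC.txt) vs $(wc -l < ../s3/subset.6.bim)"
#echo "chr14: $(wc -l < GPHNsnps.txt) + $(wc -l < chr14noGPHN.txt) vs $(wc -l < ../s3/subset.14.bim)"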
rm -f NoMHC_GPHN_SNP.txt
touch NoMHC_GPHN_SNP.txt
## Chromosomes 1-5 go in unchanged
for i in $(seq 1 5)
do cat ../s3/subset.${i}.bim
done >> NoMHC_GPHN_SNP.txt
## Chromosome 6 with the MHC region removed
cat chr6noMHC.txt >> NoMHC_GPHN_SNP.txt
## Chromosomes 7-13 unchanged
for i in $(seq 7 13)
do cat ../s3/subset.${i}.bim
done >> NoMHC_GPHN_SNP.txt
## Chromosome 14 with the GPHN region removed
cat chr14noGPHN.txt >> NoMHC_GPHN_SNP.txt
## Chromosomes 15-22 unchanged
for i in $(seq 15 22)
do cat ../s3/subset.${i}.bim
done >> NoMHC_GPHN_SNP.txt
## This makes a bedfile version, which is helpful for checking other code
awk '{ print "chr" $1 "\t" $4 "\t" $4+1 "\t" $2}' NoMHC_GPHN_SNP.txt > NoMHC_GPHN_SNP.bed
## This builds a list of just the SNP names, which is what the --print-snps flag expects
cut -f 2 NoMHC_GPHN_SNP.txt > NoMHC_GPHN_SNPlist.txt
## The hm3 SNP list doesn't include locations, so to compare the output from using the
## hm3 SNPs vs all the SNPs in the subsets, it needs to be turned into a proper bed file.
## This builds a SNP bed file from the recommended list (w_hm3.snplist) and the subset location files.
for i in $(seq 1 22)
do cat ../s3/subset.${i}.bim
done > allchr.bim
cut -f 1 w_hm3.snplist > names_w_hm3.snplist
grep -Fwf names_w_hm3.snplist allchr.bim | awk '{ print "chr" $1 "\t" $4 "\t" $4+1 "\t" $2}' > w_hm3.bed
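## Rough size comparison between the two candidate SNP lists discussed above
## (a sketch; purely informational):
#wc -l w_hm3.bed NoMHC_GPHN_SNP.bed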
## The conservation data we need is only available for hg38. Download the liftOver
## program and the chain file needed to lift hg19/GRCh37 coordinates over to hg38.
wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
chmod +x liftOver
wget http://hgdownload.soe.ucsc.edu/goldenPath/hg19/liftOver/hg19ToHg38.over.chain.gz
./liftOver NoMHC_GPHN_SNP.bed hg19ToHg38.over.chain.gz NoMHC_GPHN_SNP_Hg38.bed unMapped
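## Post-liftover check (sketch): liftOver writes failed records to 'unMapped' with a
## '#' comment line before each one, so non-comment lines count the SNPs that did not lift:
#wc -l NoMHC_GPHN_SNP_Hg38.bed
#grep -vc '^#' unMapped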
## Make some files that might get re-used
zgrep 'exon' Homo_sapiens.GRCh37.87.gff3.gz > allexons.gff
cd .. || exit
scontrol show job $SLURM_JOB_ID ### write job information to the output file