forked from tvkent/Degeneracy
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_4fold_sites.sh
76 lines (64 loc) · 2.15 KB
/
get_4fold_sites.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/bin/bash
#pipeline for getting 4fold degenerate sites
#from gff and fasta, using python, awk, and bedtools
#Tyler Kent
#14 March 2017
###################################
# SET UP PATHS
#
# This is the only portion of the
# pipeline that needs to be
# adjusted. Assume gff is gzipped.
###################################
# Root of the project tree; every path below is derived from it.
working_dir="/Volumes/Alter/LHISI"
# Input: gzipped GFF annotation (read by step 1).
gff="${working_dir}/References/Annotation/LHISI_Scaffold_Assembly.annotation.gff.gz"
# Input: reference assembly FASTA (read by step 2).
# Uses a single '/' after the root, consistent with the other paths.
fasta="${working_dir}/References/LHISI_Scaffold_Assembly.fasta"
# Output of step 1: phase-shifted CDS intervals in BED format.
CDSbedout="${working_dir}/Analyses/DeleteriousMutations/LHISI_cds.bed"
# Output of step 2: tab-delimited name/sequence table from bedtools getfasta.
fastaCDSout="${working_dir}/Analyses/DeleteriousMutations/LHISI_cds.tab"
# Value handed to degeneracy.py via -o in step 4. It carries no file
# extension; presumably the python script appends one -- confirm there.
fourfoldbedout="${working_dir}/Analyses/DeleteriousMutations/LHISI_degenerate"
###################################
# STEP 1: GET BED FILE OF CDS AND
# SHIFT TO MATCH PHASE
#
# CDS features in a GFF file are
# sections of translated sequence.
# Each carries a phase value, which
# indicates where its first
# complete codon starts.
###################################
# Run gff2bed.sh on the GFF for feature type "CDS", pipe its output through
# the gffphaseshift.awk program ('-' makes awk read stdin), and write the
# result to ${CDSbedout}.
# gff2bed.sh and gffphaseshift.awk are referenced by bare name, so this
# script is expected to be run from the directory that contains them.
# Expansions are quoted so paths with spaces or glob characters stay intact.
bash gff2bed.sh "${gff}" CDS | awk -f gffphaseshift.awk - > "${CDSbedout}"
# A pipeline's exit status is that of its last stage only, so check every
# stage: otherwise a failed gff2bed.sh would leave a truncated BED file
# behind and the later steps would silently run on it.
step1_status=("${PIPESTATUS[@]}")
if [[ "${step1_status[0]}" -ne 0 || "${step1_status[1]}" -ne 0 ]]; then
  printf 'get_4fold_sites: step 1 failed (gff2bed.sh exit %s, awk exit %s)\n' \
    "${step1_status[0]}" "${step1_status[1]}" >&2
  exit 1
fi
###################################
# STEP 2: USE BED FILE AND FASTA
# TO GET FILE OF POS AND SEQUENCE
#
# Use bedtools to get relevant
# fasta sequence into useable
# format.
###################################
# Extract the sequence of every interval in ${CDSbedout} from ${fasta}.
#   -s     honour the strand column (minus-strand intervals are
#          reverse-complemented)
#   -tab   write tab-delimited "name<TAB>sequence" lines instead of FASTA
#   -name  label each record with the BED name field
# Expansions are quoted so paths with spaces or glob characters stay intact.
# On failure, stop here with bedtools' exit status rather than letting
# step 4 run on an empty or partial table.
bedtools getfasta -s -tab -name -fi "${fasta}" -bed "${CDSbedout}" > "${fastaCDSout}" || {
  step2_status=$?
  printf 'get_4fold_sites: step 2 failed (bedtools getfasta exit %s)\n' \
    "${step2_status}" >&2
  exit "${step2_status}"
}
###################################
# STEP 3: KEEP ONLY LONGEST
# ISOFORM
#
# Drop all alternate isoforms but
# the longest.
###################################
# DEPRECATED -- DON'T DO THIS STEP
###python keep_longest_isoform.py -i ${fastaCDSout} -o ${longestonly}
###################################
# STEP 4: CONVERT FASTA DNA
# SEQUENCE INTO CODONS, FLIP FOR
# PHASE, AND REPORT 4FOLD SITES
#
# Using python and a codon table
###################################
# Run degeneracy.py on the name/sequence table from step 2.
#   -i  input table (${fastaCDSout})
#   -o  output location (${fourfoldbedout}); the value has no extension, so
#       presumably the python script treats it as a prefix -- confirm there.
# degeneracy.py is referenced by bare name, so this script is expected to be
# run from the directory that contains it.
# Expansions are quoted so paths with spaces or glob characters stay intact.
# On failure, report it on stderr and exit with python's own status.
python degeneracy.py -i "${fastaCDSout}" -o "${fourfoldbedout}" || {
  step4_status=$?
  printf 'get_4fold_sites: step 4 failed (degeneracy.py exit %s)\n' \
    "${step4_status}" >&2
  exit "${step4_status}"
}
###################################
# STEP 5: SORT OUTPUT
#
# Sort the python output as you
# would any bed file, and drop
# accidental duplicate lines.
###################################
# DEPRECATED -- DON'T DO THIS STEP
#cat ${fourfoldbedout}.bed | sort -k1,1 -k2,2n | uniq > ${fourfoldbedout}.sorted.bed