-
Notifications
You must be signed in to change notification settings - Fork 0
/
CentromereMapped_LocationSplit.py
65 lines (52 loc) · 3.04 KB
/
CentromereMapped_LocationSplit.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#!/usr/bin/env python
#This script splits mapped reads by position on the consensus sequence they map to
#All reads that map to a position are written out to a csv file
##import modules
import pandas as pd
# Column labels col1..col17, applied positionally to the tab-separated fields
# of every record in the SAM file.
SAM_COLUMN_NAMES = ['col' + str(n) for n in range(1, 18)]

# Only reads whose sequence (col10) is exactly this long are kept.
REQUIRED_READ_LENGTH = 100

# Reference names (col3) used to separate the two satellite classes.
MAJOR_SATELLITE = 'MajorSatelliteConsensusWongandRattner1988x3'
MINOR_SATELLITE = 'MinorSatelliteConsensusWongandRattner1988x3'

# Number of positions written out per reference.
# NOTE(review): presumably the length of each x3 consensus sequence --
# confirm against the reference FASTA.
MINOR_POSITION_COUNT = 360
MAJOR_POSITION_COUNT = 702


def load_full_length_reads(sam_path, read_length=REQUIRED_READ_LENGTH):
    """Read a tab-separated SAM file and keep only reads of ``read_length``.

    Parameters:
        sam_path: path of the mapped SAM file to read.
        read_length: required length of the sequence string in col10.

    Returns:
        A DataFrame with columns col1..col17 holding only the rows whose
        col10 string has exactly ``read_length`` characters.
    """
    reads = pd.read_table(sam_path, sep='\t', header=None,
                          names=SAM_COLUMN_NAMES)
    # Rows without a col10 string yield NaN here and are therefore dropped.
    is_full_length = reads['col10'].str.len() == read_length
    return reads[is_full_length]


def write_position_tables(reads, reference_name, position_count,
                          output_prefix):
    """Write one csv per mapping position for reads on one reference.

    Parameters:
        reads: DataFrame with at least the columns col3 and col4.
        reference_name: value of col3 selecting the reference of interest.
        position_count: number of positions (and therefore files) to write.
        output_prefix: path prefix; each file is named
            ``output_prefix + str(i) + '.csv'``.

    A file is written for every position, including positions with no reads
    (those files contain only the header row).
    """
    reference_reads = reads[reads['col3'] == reference_name]
    for i in range(position_count):
        # The file suffix is zero-based (i) while the matched col4 value is
        # i + 1; this naming is kept so existing consumers of the csv files
        # keep working.
        position_subset = reference_reads[reference_reads['col4'] == i + 1]
        position_subset.to_csv(output_prefix + str(i) + '.csv')


def split_sam_by_position(sam_path, output_prefix,
                          minor_positions=MINOR_POSITION_COUNT,
                          major_positions=MAJOR_POSITION_COUNT):
    """Split the reads of one SAM file into per-position csv files.

    Reads are filtered to the required length, separated into minor and
    major satellite by col3, and written out position by position as
    ``output_prefix + 'minor_<i>.csv'`` and ``output_prefix + 'major_<i>.csv'``.

    Parameters:
        sam_path: path of the mapped SAM file to split.
        output_prefix: path prefix shared by all files written for this input.
        minor_positions: number of minor satellite positions to write.
        major_positions: number of major satellite positions to write.
    """
    reads = load_full_length_reads(sam_path)
    ####Write out each position as a separate table
    write_position_tables(reads, MINOR_SATELLITE, minor_positions,
                          output_prefix + 'minor_')
    write_position_tables(reads, MAJOR_SATELLITE, major_positions,
                          output_prefix + 'major_')


def main():
    """Split the forward and the reverse mapped reads by position."""
    print('libraries imported')
    ####Set paths of mapped sam file
    pathinput1 = '/projects/dumont-lab/uma/centromere_mapping/files/sample_Forward_bothEndsMapped.sam'
    pathinput2 = '/projects/dumont-lab/uma/centromere_mapping/files/sample_Reverse_bothEndsMapped.sam'
    print('paths read')
    ##Forward reads
    split_sam_by_position(
        pathinput1,
        '/projects/dumont-lab/uma/centromere_mapping/files/CAST_Forward_')
    ##Reverse reads
    split_sam_by_position(
        pathinput2,
        '/projects/dumont-lab/uma/centromere_mapping/files/CAST_Reverse_')


if __name__ == '__main__':
    main()