PLINK_to_NicholsonFST.py
#!/usr/bin/env python
"""A script to generate input files for the Nicholson 2002 FST estimator, as
implemented in the R `popgen' package. This assumes PLINK input file formats.
It acceps a .PED and a .CLST file for genotyping information and cluster
membership. """
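#
# A minimal usage sketch (the input file names here are hypothetical):
#
#   python PLINK_to_NicholsonFST.py -p study.ped -c study.clst -m 0 -o FST
#
# This writes three text files: FST_NUMA.txt (number of alleles per locus),
# FST_N.txt (per-population genotype counts per locus), and FST_X.txt
# (per-population allele counts per locus). The script uses Python 2 idioms
# (xrange, dict.iteritems), so it should be run with a Python 2 interpreter.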
import argparse


def parse_args():
"""A function to parse arguments."""
parser = argparse.ArgumentParser(
description=('Generate input files for Nicholson 2002 FST estimation '
'from PLINK input files. Accepts .PED and .CLST files '
'and outputs .NUMA, .N, and .X files.'),
add_help=True)
parser.add_argument(
'--ped-file',
'-p',
required=True,
help='PED file')
parser.add_argument(
'--clst-file',
'-c',
required=True,
help='CLST file')
parser.add_argument(
'--missing-value',
'-m',
required=False,
help='Missing data value. Defaults to 0',
default='0')
parser.add_argument(
'--output-prefix',
'-o',
required=False,
help='Prefix for output files. Defaults to FST',
default='FST')
args = parser.parse_args()
    return args


def summarize_data(clst, ped, miss):
    """A function to summarize the data in the given .CLST and .PED files.
    Returns the individual-to-cluster assignments, the sorted cluster IDs,
    the number of loci, and the alleles observed at each locus."""
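    # Note on the expected inputs, based on how the fields are used below:
    # each .CLST line is whitespace-delimited, with the individual ID in the
    # first field and the cluster ID in the last field, and that individual
    # ID must match the second column (individual ID) of the corresponding
    # .PED line.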
# Empty dictionary to hold the data to summarize
assignment = {}
# First start to read through the cluster file
with open(clst, 'r') as fhandle:
for line in fhandle:
# Split it on whitespace
clst_data = line.strip().split()
# The two we are interested in are the individual ID, which is
# the first element, and the cluster ID, which is the last
# element.
ind_id = clst_data[0]
clst_id = clst_data[-1]
assignment[ind_id] = clst_id
# What are the cluster IDs?
clst_ids = sorted(list(set(assignment.values())))
    # Now, we count how many loci there are and collect the alleles at each
    # locus. We will also build up the genotype matrix as we go.
gen_mat = []
with open(ped, 'r') as fhandle:
for index, line in enumerate(fhandle):
            # Split up the line on whitespace
linedata = line.strip().split()
if index == 0:
# how many loci are there? We put this after an if, since
# we only really need to calculate this once.
# PLINK .PED files have six fields of non-genotype data, and
# also have diploid genotypes.
nloci = len(linedata[6:])/2
gen_mat.append(linedata[6:])
# We zip the matrix to transpose it
gen_mat = zip(*gen_mat)
    # Then, iterate through the transposed matrix to get the alleles at each
    # locus. Remember that it is diploid. Instead of iterating through the
    # matrix itself, we iterate through the range of integers from 0 to nloci.
alleles = []
for index in xrange(0, nloci):
# Get the diploid calls
strand1 = ''.join(gen_mat[2*index])
strand2 = ''.join(gen_mat[(2*index) + 1])
# Create a set out of them to count the number of alleles
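        # Note: joining the calls into strings and taking a set of characters
        # assumes single-character allele codes (e.g. A/C/G/T or 1/2), as
        # PLINK .PED files normally use.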
all_alleles = set(strand1 + strand2)
# Then remove the missing data value, if it's there
all_alleles.discard(miss)
# Then sort the alleles (so we can make sure they are always in the
# same order) and return them
alleles.append(sorted(list(all_alleles)))
# Return all the data
    return(assignment, clst_ids, nloci, alleles)


def separate_clusters(pops, ped):
    """A function to separate the genotyping data into separate matrices by
    cluster ID as listed in the cluster file. Takes the dictionary of
    individual-to-cluster assignments and the path to the .PED file, and
    returns a dictionary of genotype matrices keyed by cluster ID."""
# Build the data structure to return
# This will be of the form:
# {
# 'pop1_ID': [Genotype matrix for Pop 1],
# 'pop2_ID': [Genotype matrix for Pop 2],
# ...
    # 'popN_ID': [Genotype matrix for Pop N]
# }
pop_data = {}
for popid in pops.values():
pop_data[popid] = []
# Next, we iterate through the ped file again, and separate the calls
# based on population
with open(ped, 'r') as fhandle:
for line in fhandle:
            # We split on whitespace
ind_data = line.strip().split()
# What is the individual ID? It is the second column
ind_id = ind_data[1]
# Based on that, what is the population of origin?
pop_orig = pops[ind_id]
# Then, tack that data onto the appropriate list in pop_data
# We just care about the genotypes.
pop_data[pop_orig].append(ind_data[6:])
    return pop_data


def do_counts(nloci, alleles, miss, population_genotypes):
"""A function to iterate through the population-separated dictionary and
return the counts for the Nicholson estimator. This will count the number
of genotypes at each locus for each population, and the number of each
allele at each locus in each population."""
# Start a new dictionary for our counts. We can count alleles right away.
# Basically, it will look like this:
# {
# 'NumAlleles': [2, 2, 2, 2, ... N_Loci],
# 'NumGenotypes': {
# 'Pop1_ID': [200, 200, 200, ..., N_Loci],
# 'Pop2_ID': [200, 200, 200, ..., N_Loci],
# ...
# 'PopN_ID': [200, 200, 200, ..., N_Loci]
# }
    # 'PopAlleles': {
    # 'Pop1_ID': [
    # [100, 100],
    # [100, 100],
    # ...
    # [100, 100]
    # ],
    # 'Pop2_ID': [
    # ...
    # ],
    # ...
    # 'PopN_ID': [
    # ...
    # ]
    # }
    # }
    # where each population's 'PopAlleles' entry has one inner list per locus,
    # holding one count per allele at that locus.
counts = {
'NumAlleles': [str(len(a)) for a in alleles],
'NumGenotypes': {},
'PopAlleles': {}
}
# Start iterating through the dictionary and performing the counts
for popid, genotypes in population_genotypes.iteritems():
# Start sub-lists for our data, since they will ultimately need to
# be matrices.
counts['NumGenotypes'][popid] = []
counts['PopAlleles'][popid] = []
# Then, iterate through the transposed genotypes and count up the
# relevant information
g_trans = zip(*genotypes)
for index in xrange(0, nloci):
# Start counting the number of genotypes
n_geno = 0
            # Start a dict of the counts of individual alleles
p_allele = {}
for a in alleles[index]:
p_allele[a] = 0
strand1 = g_trans[2*index]
strand2 = g_trans[(2*index) + 1]
# Zip them and start counting
for call in zip(strand1, strand2):
if miss in call:
continue
else:
# No missing data -> add one to n_geno
n_geno += 1
# Count up the number of alleles in the pop.
for a in alleles[index]:
if a in call:
p_allele[a] += 1
# Then, tack the appropriate information onto the population
# specific list
counts['NumGenotypes'][popid].append(str(n_geno))
            # Keep the allele counts in the same sorted order as the alleles
            # list, so columns line up consistently across populations
            popallele = [str(p_allele[a]) for a in alleles[index]]
counts['PopAlleles'][popid].append(popallele)
# Return the big dictionary
    return counts


def write_counts(clusters, numloci, nicholson_data, prefix):
"""This function just writes the matrices into the correct files."""
# Start the handles for writing data
numa_handle = open(prefix + '_NUMA.txt', 'w')
N_handle = open(prefix + '_N.txt', 'w')
X_handle = open(prefix + '_X.txt', 'w')
# Get the NUMA out of the way
numa_handle.write('\n'.join(nicholson_data['NumAlleles']))
numa_handle.flush()
numa_handle.close()
# Then iterate through the data and start writing things!
for index in xrange(0, numloci):
N_towrite = []
X_towrite = []
for c in clusters:
N_towrite.append(nicholson_data['NumGenotypes'][c][index])
X_singlepop = []
for a in xrange(0, int(nicholson_data['NumAlleles'][index])):
X_singlepop.append(nicholson_data['PopAlleles'][c][index][a])
X_towrite += X_singlepop
N_handle.write('\t'.join(N_towrite) + '\n')
X_handle.write('\t'.join(X_towrite) + '\n')
# Flush those handles and close them
N_handle.flush()
N_handle.close()
X_handle.flush()
X_handle.close()
    return


def main():
"""Main function."""
arguments = vars(parse_args())
ped_file = arguments['ped_file']
clst_file = arguments['clst_file']
missing_value = arguments['missing_value']
output_prefix = arguments['output_prefix']
# What are the populations and the number of loci?
assignment, clusters, n_loci, alleles = summarize_data(
clst_file,
ped_file,
missing_value)
# Next, we separate the data matrix by population
population_genotypes = separate_clusters(assignment, ped_file)
count_data = do_counts(
n_loci,
alleles,
missing_value,
population_genotypes)
write_counts(clusters, n_loci, count_data, output_prefix)
    return


if __name__ == '__main__':
    main()