#!/usr/bin/env python
# -*- coding: utf-8 -*-
# crunch_clusters
# Written by PD Blischak
from __future__ import print_function, division
import argparse
from Bio import SeqIO
import numpy as np
import pandas as pd
import operator
import math
import sys
import re
import os
import platform
try:
    import subprocess32 as sps
except ImportError:
    import subprocess as sps
#### header and version info. ##################################################
try:
    with open("VERSION.txt", 'r') as version_file:
        vnum = version_file.readline().strip()
except IOError:
    vnum = "0.2.1"

__version__ = "This is Fluidigm2PURC v"+vnum+"."
__header__ = """\
**********************************************
crunch_clusters
Processing of PURC clusters into haplotype
configurations.
**********************************************
"""
############################################
# Functions for calling external executables
############################################
def rename(input_fasta):
    """
    Remove semicolons from sequence names and replace them with
    underscores using sed.
    """
    #print("  Checking for a working version of sed...", end='')
    #try:
    #    sps.check_call(['sed', '-h'])
    #except sps.CalledProcessError:
    #    sys.exit("\n\n** ERROR: could not execute sed **\n\n")
    #print("Good\n")
    if platform.system() == 'Darwin':
        print("Renaming sequences with sed. [Mac]\n")
        # BSD sed on macOS requires an explicit backup suffix after -i
        sed_cmd = [
            "sed",
            "-i", ".tmp",
            "s/;/_/g",
            input_fasta
        ]
    else:
        print("Renaming sequences with sed. [Linux]\n")
        sed_cmd = [
            "sed",
            "-i",
            "s/;/_/g",
            input_fasta
        ]
    proc = sps.call(sed_cmd)
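# A minimal illustration of the renaming, assuming PURC-style cluster
# headers (hypothetical name shown): ">TaxonA_0;size=120;" becomes
# ">TaxonA_0_size=120_", which is the "_size=N_" form parsed downstream.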
def mafft(input_fasta, locus):
    """
    Align the input FASTA file using MAFFT.
    """
    print("Realigning sequences using MAFFT.\n")
    mafft_cmd = "mafft --auto --quiet "+input_fasta+" > "+locus+"-realigned.fasta"
    proc = sps.call(mafft_cmd, shell=True)
def phyutility(input_fasta, threshold, locus):
    """
    Clean gappy sequences using Phyutility.
    """
    print("Cleaning sequences with Phyutility.\n")
    phyutility_cmd = "phyutility -clean "+str(threshold)+" -in "+input_fasta+" -out "+locus+"-cleaned.fasta"
    proc = sps.call(phyutility_cmd, shell=True)
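# Note: both helpers above shell out with sps.call(), so the mafft and
# phyutility executables are assumed to be on the user's PATH.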
# http://stackoverflow.com/questions/10035752/elegant-python-code-for-integer-partitioning
# Stack Overflow saves the day again
def partition(number):
    """
    Gives the full set of integer partitions of `number`.
    """
    answer = set()
    answer.add((number, ))
    for x in range(1, number):
        for y in partition(number - x):
            answer.add(tuple(sorted((x, ) + y, reverse=True)))
    return answer
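# Example: partition(4) returns
#   {(4,), (3, 1), (2, 2), (2, 1, 1), (1, 1, 1, 1)}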
def resolve_haps(ploidy, num_alleles):
    """
    Enumerate the ways that the integer `ploidy` can be written as a sum
    of exactly `num_alleles` smaller integers. Returns a list of tuples,
    each of length num_alleles.
    """
    assert num_alleles <= ploidy
    return [i for i in partition(ploidy) if len(i) == num_alleles]
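# Example: resolve_haps(4, 2) returns [(3, 1), (2, 2)] (set-iteration
# order), i.e., the possible allele copy-number configurations for a
# tetraploid carrying two distinct haplotypes.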
def haps_llik_with_ploidy(sizes, ploidy, error):
    """
    Calculate the multinomial log likelihood of observing a particular
    haplotype configuration when ploidy information is given in the
    taxon table.
    """
    lliks = {}
    for a in range(1, len(sizes)+1):
        haplotypes = resolve_haps(ploidy, a)
        for h in haplotypes:
            lliks[h] = 0.0
            # Clusters assigned to the configuration contribute in
            # proportion to their copy number...
            for g in range(0, len(h)):
                lliks[h] += sizes[g][1] * math.log(h[g] / ploidy)
            # ...and any remaining clusters are treated as sequencing error.
            for e in range(len(h), len(sizes)):
                lliks[h] += sizes[e][1] * math.log(error)
    return lliks
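# Example: for a tetraploid (ploidy=4) with cluster sizes
# [('hap1', 90), ('hap2', 30)] and error=0.01, the configuration (3, 1)
# scores 90*log(3/4) + 30*log(1/4); a third cluster, if present, would
# add size*log(0.01) instead.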
def haps_llik_no_ploidy(sizes, error):
    """
    Calculate the likelihood of a haplotype being "real" vs. sequencing
    error. Used when we don't have ploidy-level information.
    """
    llik = {}
    for a in range(0, len(sizes)+1):
        # Treat the a largest clusters as real alleles and the rest as error
        llik[a] = 0.0
        llik[a] += sum([math.log(1 - error) * sizes[h][1] for h in range(0, a)])
        llik[a] += sum([math.log(error) * sizes[e][1] for e in range(a, len(sizes))])
    return llik
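# Example: with sizes [('hap1', 50), ('hap2', 40), ('hap3', 2)] and
# error=0.01, llik[2] = (50+40)*log(0.99) + 2*log(0.01): the two largest
# clusters are treated as real alleles and the third as error.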
def id_alleles(lliks, cutoff):
    """
    Use the log likelihoods returned by the haps_llik_no_ploidy function
    to identify alleles. We use a cutoff here that treats alleles as
    errors if they do not increase the likelihood by the specified
    fraction of the total improvement (default = 0.1).
    """
    # Sort by key so that y[a] is the log likelihood of keeping a alleles
    # (plain dict ordering is not guaranteed on older Pythons)
    y = np.array([lliks[k] for k in sorted(lliks.keys())])
    delta_ll = y[len(y)-1] - y[0]
    ll_ratio = {i: (y[i+1] - y[i]) / delta_ll for i in range(0, len(y)-1)}
    num_alleles = 0
    for k in sorted(ll_ratio.keys()):
        if ll_ratio[k] > cutoff:
            num_alleles += 1
        else:
            break
    return (num_alleles, ll_ratio)
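# Example: if the stepwise gains are {0: 0.55, 1: 0.40, 2: 0.05}, then
# with cutoff=0.1 the first two steps pass and num_alleles = 2; the third
# cluster is called sequencing error.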
def combine_matching_seqs(clusts, fasta_idx, taxon):
    """
    Compares all clusters for an individual to determine which sequences
    are actually identical once gaps are ignored. Gaps can occur because
    of read trimming by Sickle and unmerged reads from FLASH2. Matching
    cluster sizes are combined and a list of matching clusters is kept.

    Note: There are two trade-offs here. (1) We could skip combining and
    treat all clusters as unique, but we might then miss a true cluster by
    not combining it with its gappy counterpart. (2) We may be combining
    sequences that aren't truly identical, and there is no way to know
    when that happens.
    """
    matches_dict = {}     # stores matched sequences (gets returned by func)
    matches = []          # keeps track of current matches
    already_matched = []  # seqs that already matched, to prevent re-checking
    for i in range(len(clusts)):
        if clusts[i] in already_matched:
            #print(clusts[i], "already matched.", sep=' ')
            continue
        seq_one = fasta_idx[clusts[i]].seq
        seq_one_size = int(clusts[i].split('_')[-2].split("=")[1])
        total_size = seq_one_size
        matches = [clusts[i]]
        header_one = clusts[i].split('_')[-3]
        for j in range(i+1, len(clusts)):
            if clusts[j] in already_matched:
                continue  # don't count a cluster's size toward two groups
            match = True  # starts as True; set to False on a mismatch
            seq_two = fasta_idx[clusts[j]].seq
            seq_two_size = int(clusts[j].split('_')[-2].split("=")[1])
            assert len(seq_one) == len(seq_two)
            for s in range(len(seq_one)):
                if seq_one[s] == '-' or seq_two[s] == '-':
                    continue
                elif seq_one[s] != seq_two[s]:
                    #print(clusts[i], clusts[j], s+1, seq_one[s], seq_two[s])
                    match = False
                    break
            if match:
                total_size += seq_two_size
                matches.append(clusts[j])
                already_matched.append(clusts[j])
        match_sizes = [int(m.split('_')[-2].split("=")[1]) for m in matches]
        sorted_matches = [m for _, m in sorted(zip(match_sizes, matches), reverse=True)]
        #print(matches, sorted_matches, sep='\n')
        matches_dict[taxon + "_" + header_one + "_size=" + str(total_size) + "_"] = sorted_matches
    return matches_dict
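# Example: "AC-GT" and "ACTGT" are treated as matching because every
# position where neither sequence has a gap ('-') is identical, so their
# cluster sizes are summed under a single combined header.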
def crunch_clusters(fasta, orig_fasta, species, error, locus, cutoff, haploid=False, unique=False):
    """
    The workhorse function that processes the input FASTA file and
    determines haplotype configurations based on whether or not ploidy
    levels are specified. A flag for treating the locus as haploid can be
    given so that only the primary cluster is kept.
    """
    idx = SeqIO.index(fasta, "fasta")
    headers = [i for i in idx.keys()]
    keep_orig = False
    if fasta != orig_fasta:
        keep_orig = True
        idx_orig = SeqIO.index(orig_fasta, "fasta")
    # Check if outfiles already exist
    if not os.path.exists("./"+locus+"_crunched_clusters.fasta"):
        outfile = open(locus+"_crunched_clusters.fasta", 'w')
        print("Writing haplotypes to "+locus+"_crunched_clusters.fasta.\n")
    else:
        os.rename("./"+locus+"_crunched_clusters.fasta", "./"+locus+"_crunched_clusters_old.fasta")
        outfile = open(locus+"_crunched_clusters.fasta", 'w')
        print("Writing haplotypes to "+locus+"_crunched_clusters.fasta.")
        print("Notice: File "+locus+"_crunched_clusters.fasta already exists.")
        print("        Renaming to "+locus+"_crunched_clusters_old.fasta.\n")
    if not os.path.exists("./"+locus+"_log.txt"):
        logfile = open(locus+"_log.txt", 'a')
    else:
        os.rename("./"+locus+"_log.txt", "./"+locus+"_log_old.txt")
        logfile = open(locus+"_log.txt", 'a')
    if keep_orig:
        if not os.path.exists("./"+locus+"_crunched_clusters_orig.fasta"):
            orig_outfile = open(locus+"_crunched_clusters_orig.fasta", 'w')
            print("Writing original sequences to "+locus+"_crunched_clusters_orig.fasta.\n")
        else:
            os.rename("./"+locus+"_crunched_clusters_orig.fasta", "./"+locus+"_crunched_clusters_orig_old.fasta")
            orig_outfile = open(locus+"_crunched_clusters_orig.fasta", 'w')
            print("Writing original sequences to "+locus+"_crunched_clusters_orig.fasta.")
            print("Notice: File "+locus+"_crunched_clusters_orig.fasta already exists.")
            print("        Renaming to "+locus+"_crunched_clusters_orig_old.fasta.\n")
    taxon_df = pd.read_csv(species, delim_whitespace=True)
    method = ""
    empty = "_"
    for t in list(taxon_df.Taxon):
        # Gather all cluster headers belonging to this taxon
        clusters = [h for h in headers if re.match(re.escape(t)+"_.*", h)]
        if len(clusters) == 0:
            print("Taxon ", t, " not in current file.", sep='')
            continue
        unique_clusters = combine_matching_seqs(clusters, idx, t)
        write_log(t, unique_clusters, logfile)  # log clustering info
        sizes = {s: int(s.split("_")[-2].split("=")[1]) for s in unique_clusters.keys()}
        sorted_sizes = sorted(sizes.items(), key=operator.itemgetter(1), reverse=True)
        if haploid:
            method = "haploid"
            header = unique_clusters[sorted_sizes[0][0]][0].split("_")[0:-3]
            print(">"+empty.join(header), file=outfile)
            print(str(idx[unique_clusters[sorted_sizes[0][0]][0]].seq), file=outfile)
            if keep_orig:
                print(">"+empty.join(header), file=orig_outfile)
                print(str(idx_orig[unique_clusters[sorted_sizes[0][0]][0]].seq), file=orig_outfile)
            #ll = haps_llik_with_ploidy(sorted_sizes[0], 1, error)
            #mle = sorted(ll.items(), key=operator.itemgetter(1), reverse=True)
        elif str(taxon_df.loc[taxon_df.Taxon == t].Ploidy.values[0]) == "None":
            method = "ranked"
            ll = haps_llik_no_ploidy(sorted_sizes, error)
            num_alleles, ll_diffs = id_alleles(ll, cutoff)
        elif str(taxon_df.loc[taxon_df.Taxon == t].Ploidy.values[0]) != "None":
            # Catch non-integer ploidy entries in the taxon table
            try:
                ploidy = int(taxon_df.loc[taxon_df.Taxon == t].Ploidy.values[0])
            except ValueError:
                print("\n  ** WARNING: Haplotyping for individual", t, "was unsuccessful (bad ploidy in OTU table). **")
                continue
            method = "mle"
            # Catch the case when the number of clusters is less than the ploidy
            if len(sorted_sizes) >= ploidy:
                ll = haps_llik_with_ploidy(sorted_sizes[0:ploidy], ploidy, error)
            else:
                ll = haps_llik_with_ploidy(sorted_sizes, ploidy, error)
            mle = sorted(ll.items(), key=operator.itemgetter(1), reverse=True)
        else:
            print("\n  ** WARNING: Haplotyping for individual", t, "was unsuccessful. **\n")
            continue
        if method == "mle":
            allele_num = 1
            for m in range(0, len(mle[0][0])):
                if unique:
                    # Write each distinct haplotype only once
                    header = unique_clusters[sorted_sizes[m][0]][0].split("_")[0:-3]
                    print(">"+empty.join(header)+"_"+str(allele_num), file=outfile)
                    print(str(idx[unique_clusters[sorted_sizes[m][0]][0]].seq), file=outfile)
                    if keep_orig:
                        print(">"+empty.join(header)+"_"+str(allele_num), file=orig_outfile)
                        print(str(idx_orig[unique_clusters[sorted_sizes[m][0]][0]].seq), file=orig_outfile)
                    allele_num += 1
                    continue
                # Otherwise write each haplotype once per inferred copy
                for rep in range(0, mle[0][0][m]):
                    header = unique_clusters[sorted_sizes[m][0]][0].split("_")[0:-3]
                    print(">"+empty.join(header)+"_"+str(allele_num), file=outfile)
                    print(str(idx[unique_clusters[sorted_sizes[m][0]][0]].seq), file=outfile)
                    if keep_orig:
                        print(">"+empty.join(header)+"_"+str(allele_num), file=orig_outfile)
                        print(str(idx_orig[unique_clusters[sorted_sizes[m][0]][0]].seq), file=orig_outfile)
                    allele_num += 1
        elif method == "ranked":
            allele_num = 1
            for a in range(0, num_alleles):
                header = unique_clusters[sorted_sizes[a][0]][0].split("_")[0:-3]
                print(">"+empty.join(header)+"_"+str(allele_num), file=outfile)
                print(str(idx[unique_clusters[sorted_sizes[a][0]][0]].seq), file=outfile)
                if keep_orig:
                    print(">"+empty.join(header)+"_"+str(allele_num), file=orig_outfile)
                    print(str(idx_orig[unique_clusters[sorted_sizes[a][0]][0]].seq), file=orig_outfile)
                allele_num += 1
        elif method == "haploid":
            continue
        else:
            pass
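# Example: for a tetraploid whose MLE configuration is (3, 1), the largest
# cluster's sequence is written three times and the second cluster's once
# (headers suffixed _1 through _4); with --unique_haps each distinct
# haplotype is written only once.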
def write_log(taxon, clusters, log):
    """
    Write cluster merging and size info to the log file.
    """
    print(taxon+":\n", file=log)
    for k, v in clusters.items():
        print(k, ":", v, file=log)
    print("\n\n", file=log)
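# Example log entry (hypothetical header names):
#   TaxonA:
#
#   TaxonA_0_size=120_ : ['TaxonA_0_size=90_', 'TaxonA_1_size=30_']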
def _main():
    """
    Set up command line arguments to pass to the crunch_clusters function
    to analyze clusters from PURC and determine haplotype configurations.
    """
    parser = argparse.ArgumentParser(description="Options for crunch_clusters",
                                     add_help=True)
    parser.add_argument('-v', '--version', action="version",
                        version=__version__)
    required = parser.add_argument_group("required arguments")
    required.add_argument('-i', '--input_fasta', action="store", type=str, required=True,
                          metavar='\b', help="Input FASTA file with sequence clusters")
    required.add_argument('-s', '--species_table', action="store", type=str, required=True,
                          metavar='\b', help="Data frame with taxon metadata")
    required.add_argument('-e', '--error_rates', action="store", type=str, required=True,
                          metavar='\b', help="Per locus error rates table")
    required.add_argument('-l', '--locus_name', action="store", type=str, required=True,
                          metavar='\b', help="Name of locus being analyzed (must match name in error rates table)")
    additional = parser.add_argument_group("additional arguments")
    additional.add_argument('-c', '--cutoff', action="store", type=float, default=0.1,
                            metavar='\b', help="cutoff for haplotyping w/o ploidy [default=0.1]")
    additional.add_argument('-hap', '--haploid', action="store_true", default=False,
                            help="flag to treat locus as haploid (e.g., for cpDNA loci)")
    additional.add_argument('--realign', action="store_true", default=False,
                            help="realign sequences using MAFFT prior to crunching clusters")
    additional.add_argument('--clean', action="store", type=float, default=-999.9,
                            metavar='\b', help="clean gappy sequences with Phyutility using specified threshold")
    additional.add_argument('--unique_haps', action="store_true", default=False,
                            help="only return unique haplotypes")
    args = parser.parse_args()
    input_fasta = args.input_fasta
    locus = args.locus_name
    otu = args.species_table
    error = args.error_rates
    cutoff = args.cutoff
    hap = args.haploid
    realign = args.realign
    clean = args.clean
    unique_haps = args.unique_haps
    print("\n", __header__, sep='')
    # Read the error rates table and pull out the rate for this locus
    error_df = pd.read_csv(error, delim_whitespace=True)
    err = float(error_df.loc[error_df.Locus == locus].Error.values[0])
    rename(input_fasta)
    clean_seqs = False  # logical for sequence cleaning
    # Check that any supplied cleaning threshold is valid (0.0 < x < 1.0)
    if clean != -999.9 and clean > 0.0 and clean < 1.0:
        clean_seqs = True
    elif clean == -999.9:
        pass
    else:
        print("  Invalid cleaning threshold (", clean, ").", sep='')
        print("  Must be between 0.0 and 1.0.")
        sys.exit()
    if clean_seqs and realign:
        mafft(input_fasta, locus)
        phyutility(locus+"-realigned.fasta", clean, locus)
        crunch_clusters(locus+"-cleaned.fasta", input_fasta, otu, err, locus, cutoff, hap, unique_haps)
    elif clean_seqs and not realign:
        phyutility(input_fasta, clean, locus)
        crunch_clusters(locus+"-cleaned.fasta", input_fasta, otu, err, locus, cutoff, hap, unique_haps)
    elif realign and not clean_seqs:
        mafft(input_fasta, locus)
        crunch_clusters(locus+"-realigned.fasta", input_fasta, otu, err, locus, cutoff, hap, unique_haps)
    else:
        crunch_clusters(input_fasta, input_fasta, otu, err, locus, cutoff, hap, unique_haps)
if __name__ == "__main__":
    # Run the main script when called from the command line.
    _main()
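# Example invocation (file names are illustrative):
#   crunch_clusters -i locus1_clusters.fasta -s taxon_table.txt \
#                   -e error_rates.txt -l locus1 --realign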