forked from 23andMe/yhaplo
-
Notifications
You must be signed in to change notification settings - Fork 1
/
sample.py
902 lines (707 loc) · 34.4 KB
/
sample.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
# David Poznik
# 2016.1.26
# sample.py
#
# Defines two classes:
# - Sample
# - Customer (a subclass of Sample)
#----------------------------------------------------------------------
from __future__ import absolute_import
import six
import sys
from collections import defaultdict
from operator import attrgetter
from six.moves import range
from . import utils
from .snp import PlatformSNP
class Sample(object):
'''
A sample:
- holds a genotype dictionary
- knows its haplogroup.
'''
tree = None
config = None
args = None
errAndLog = None
numAssigned = 0
numRootCalls = 0
sampleList = list()
prevCalledHaplogroupDict = dict()
def __init__(self, ID, sampleIndex=None):
self.ID = ID
self.sampleIndex = sampleIndex # for snp-major data
self.pos2genoDict = dict()
self.ancDerCountTupleList = list()
self.haplogroupNode = None
self.mostDerivedSNP = None
self.derSNPlist = None
self.ancSNPlist = None
self.prevCalledHaplogroup = self.config.missingHaplogroup
self.prevCalledHaplogroupDFSrank = 0
if Sample.config.compareToPrevCalls:
self.setPrevCalledHaplogroup()
@property
def haplogroup(self):
'haplogroup: YCC nomenclature (e.g., Q1a2a1a1)'
return self.haplogroupNodeAttribute('haplogroup')
@property
def hgSNP(self):
'haplogroup: representative-SNP form (e.g., Q-M3)'
return self.haplogroupNodeAttribute('hgSNP')
@property
def hgTrunc(self):
'haplogroup: truncated form (e.g., Q1a2a1a1 -> Q)'
return self.haplogroupNodeAttribute('hgTrunc')
@property
def hgSNPobs(self):
'''
like hgSNP, but rather than one representative SNP per haplogroup,
uses the most highly ranked SNP this individual was observed to possess
'''
if self.mostDerivedSNP:
return self.mostDerivedSNP.hgSNP
elif self.haplogroupNode:
return Sample.tree.root.haplogroup
else:
self.prematureHaplogroupAccessError()
@property
def haplogroupDFSrank(self):
'depth-first-search ranking of haplogroup node'
return self.haplogroupNodeAttribute('DFSrank')
def haplogroupNodeAttribute(self, attribute):
'look up attribute of self.haplogroupNode'
if self.haplogroupNode:
return getattr(self.haplogroupNode, attribute)
else:
self.prematureHaplogroupAccessError()
def prematureHaplogroupAccessError(self):
sys.exit('ERROR. Attempted to access haplogroup before assigned: %s' % self.ID)
# string representations and output
#----------------------------------------------------------------------
def __str__(self):
'string representation gets extra information if previous calls exist'
sampleString = ('%-8s %-15s %-15s %-25s' %
(self.ID, self.hgSNPobs, self.hgSNP, self.haplogroup))
if Sample.config.compareToPrevCalls:
prevHgStart = self.prevCalledHaplogroup[:Sample.config.numCharsToCompare]
hgStart = self.haplogroup[:Sample.config.numCharsToCompare]
matchFlag = '.' if prevHgStart == hgStart else '*'
sampleString = '%s %-25s %s' % (sampleString,
self.prevCalledHaplogroup, matchFlag)
return sampleString
def strCompressed(self):
return utils.compressWhitespace(str(self))
def strSimple(self):
'string representation with just ID, haplogroup, and hgSNP'
return '%-8s %-25s %-15s' % (self.ID, self.haplogroup, self.hgSNP)
def strForCounts(self):
'string representation for anc/der counts output'
leftPart = '%-8s %s' % (self.ID, self.haplogroup)
rightPart = '%s %s' % (self.hgSNPobs, self.hgSNP)
if Sample.config.compareToPrevCalls:
return '%s %s | %s' % (leftPart, self.prevCalledHaplogroup, rightPart)
else:
return '%s | %s' % (leftPart, rightPart)
def strSNPs(self, ancestral):
'constructs a string representation with derived or ancestral SNPs'
if ancestral:
snpList = self.ancSNPlist
else:
snpList = self.derSNPlist
snpListString = ' '.join(snp.strShort() for snp in snpList)
return '%s | %s' % (self.strSimple(), snpListString)
def strHaplogroupPath(self, include_SNPs=False):
'constructs a string representation with haplogroup path'
if self.mostDerivedSNP:
snp_label_list_dict = defaultdict(list)
for snp in self.derSNPlist:
snp_label_list_dict[snp.node.haplogroup].append(snp.labelCleaned)
path_info_list = list()
for node in self.mostDerivedSNP.backTracePath():
if node.haplogroup in snp_label_list_dict:
snp_label_list = snp_label_list_dict[node.haplogroup]
num_snps = len(snp_label_list)
path_info = '%s:%d' % (node.haplogroup, num_snps)
if include_SNPs:
path_info = '%s:%s' % (path_info, ','.join(snp_label_list))
path_info_list.append(path_info)
haplogroup_path = ' '.join(path_info_list)
else:
haplogroup_path = ''
return '%s | %s' % (self.strSimple(), haplogroup_path)
def realTimeOutput(self):
'generate real-time output if requested'
if Sample.args.writeHaplogroupsRealTime:
output = '%s %5d\n' % (str(self), self.haplogroupDFSrank)
Sample.config.haplogroupRealTimeFile.write(output)
if Sample.args.haplogroupToListGenotypesFor:
Sample.config.hgGenosFile.write('%s\n\n' % self.strCompressed())
# previously called haplogroups
#----------------------------------------------------------------------
def setPrevCalledHaplogroup(self):
'''
sets previously called haplogroup for testing/comparison.
also sets corresponding DFS rank for sorting
'''
if not Sample.prevCalledHaplogroupDict:
Sample.importPrevCalledHaplogroups()
if self.ID in Sample.prevCalledHaplogroupDict:
self.prevCalledHaplogroup = Sample.prevCalledHaplogroupDict[self.ID]
else:
Sample.errAndLog('WARNING. No previously called haplogroup for: %s\n' % self.ID)
self.setPrevCalledHaplogroupDFSrank()
@staticmethod
def importPrevCalledHaplogroups():
'''
reads file with previously called haplogroups,
assuming first col = ID & last col = haplogroup
'''
utils.checkFileExistence(Sample.config.prevCalledHgFN, 'Previously called haplogroups')
with open(Sample.config.prevCalledHgFN, 'r') as prevCalledHgFile:
for line in prevCalledHgFile:
lineList = line.strip().split()
ID, prevCalledHaplogroup = lineList[0], lineList[-1]
Sample.prevCalledHaplogroupDict[ID] = prevCalledHaplogroup
Sample.errAndLog('%sRead previously called haplogroups:\n %s\n\n' %
(utils.DASHES, Sample.config.prevCalledHgFN))
def setPrevCalledHaplogroupDFSrank(self, ignore=False):
'sets depth-first search rank of previously called haplogroup'
hg2nodeDict = Sample.tree.hg2nodeDict
if not ignore and self.prevCalledHaplogroup != self.config.missingHaplogroup:
haplogroupKey = self.prevCalledHaplogroup
while haplogroupKey not in hg2nodeDict and len(haplogroupKey) > 0:
haplogroupKey = haplogroupKey[:-1]
if haplogroupKey in hg2nodeDict:
self.prevCalledHaplogroupDFSrank = hg2nodeDict[haplogroupKey].DFSrank
# mutaters
#----------------------------------------------------------------------
def addGeno(self, position, genotype):
'''
adds one value to the genotype dictionary
if a contradiction is encountered, sets value to missing
Note: there is no reason to call this method with a missing genotype,
this should be the only way missing values enter the dictionary.
'''
if position in self.pos2genoDict:
if genotype != self.pos2genoDict[position]:
self.pos2genoDict[position] = self.config.missingGenotype
else:
self.pos2genoDict[position] = genotype
def appendAncDerCountTuple(self, node, numAncestral, numDerived):
'stores results of search path'
ancDerCountTuple = (node, numAncestral, numDerived)
self.ancDerCountTupleList.append(ancDerCountTuple)
def callHaplogroup(self):
'finds path through tree and returns haplogroup'
Sample.numAssigned += 1
path, self.ancSNPlist = Sample.tree.identifyPhylogeneticPath(self)
self.derSNPlist = path.derSNPlist
self.mostDerivedSNP = path.mostDerivedSNP
if self.mostDerivedSNP:
self.haplogroupNode = self.mostDerivedSNP.node
else:
Sample.numRootCalls += 1
self.haplogroupNode = Sample.tree.root
self.fixHaplogroupIfArtifact()
self.realTimeOutput()
self.freeUpMemory()
def fixHaplogroupIfArtifact(self):
'fixes artifactual haplogroup assignments; override as appropriate'
pass
def freeUpMemory(self):
'free up some memory if possible'
self.pos2genoDict = None
if not (Sample.args.writeDerSNPs or
Sample.args.writeDerSNPsDetail or
Sample.args.writeHaplogroupPaths or
Sample.args.writeHaplogroupPathsDetail):
self.derSNPlist = None
if not (Sample.args.writeAncSNPs or
Sample.args.writeAncSNPsDetail):
self.ancSNPlist = None
#----------------------------------------------------------------------
# Run: main entry point
#----------------------------------------------------------------------
@staticmethod
def callHaplogroups(config, tree):
'this method is the entry point and is to be called from outside.'
Sample.setTreeConfigAndArgs(config, tree)
Sample.testNumberOfRunModes(config)
if config.runFromSampleMajorTxt:
Sample.runFromSampleMajorTxt()
elif config.runFromVCF or config.runFromVCF4:
Sample.runFromVCF()
elif config.runFromAblocks:
Customer.runFromAblocks()
else:
Sample.errAndLog('%sNo input genotypes specified. Exiting.\n\n' % utils.DASHES)
config.closeFiles()
@staticmethod
def setTreeConfigAndArgs(config, tree):
'enables Sample class to know about the tree instance, config, and args'
Sample.config = config
Sample.args = config.args
Sample.errAndLog = config.errAndLog
Sample.tree = tree
if Sample.args.writeHaplogroupsRealTime:
Sample.realTimeHaplogroupWritingMessage()
@staticmethod
def realTimeHaplogroupWritingMessage():
'emit a message for real-time haplogroup writing'
Sample.errAndLog(
'%sWill write haplogroups as they are called:\n' % utils.DASHES +
' %s\n\n' % Sample.config.haplogroupRealTimeFN +
'Note: This file includes DFS rank, so it can be sorted ex post facto with:\n' +
' sort -nk5 %s\n\n' % Sample.config.haplogroupRealTimeFN)
@staticmethod
def testNumberOfRunModes(config):
'consistency check the number of run modes'
numberOfRunModesSelected = (config.runFromSampleMajorTxt +
config.runFromVCF + config.runFromVCF4 + config.runFromAblocks)
if numberOfRunModesSelected > 1:
sys.exit('ERROR. Expecting no more than one run mode\n' +
' %d selected\n' % numberOfRunModesSelected)
#----------------------------------------------------------------------
# Run option 1: sample-major text data
#----------------------------------------------------------------------
@staticmethod
def runFromSampleMajorTxt():
'run pipeline on sample-major data'
Sample.processSampleMajorTxtandCallHaplogroups()
Sample.sortSampleList()
Sample.writeSampleList()
@staticmethod
def processSampleMajorTxtandCallHaplogroups():
'''
reads in sample major data, calling haplogroup for each line.
returns list of sample objects with genotype data purged.
assumed format:
row 1 = physical coordinates
column 1 = sample ID
'''
genoFN = Sample.args.dataFN
genoFile, genoReader = utils.getCSVreader(genoFN, delimiter='\t')
Sample.errAndLog('%sReading genotype data:\n %s\n\n' %
(utils.DASHES, genoFN))
# determine relevant physical coordinates and corresponding columns
allPositionsList = [int(position) for position in six.next(genoReader)[1:]]
columnPositionTupleList = list()
for column, position in enumerate(allPositionsList):
if position in Sample.tree.snpPosSet:
columnPositionTupleList.append((column, position))
# read genotypes, call haplogroups
for genoList in genoReader:
ID, genoList = genoList[0], genoList[1:]
if Sample.args.singleSampleID and ID != Sample.args.singleSampleID:
continue
sample = Sample(ID)
for column, position in columnPositionTupleList:
genotype = genoList[column]
if genotype != Sample.config.missingGenotype:
sample.addGeno(position, genotype)
sample.callHaplogroup()
Sample.sampleList.append(sample)
genoFile.close()
#----------------------------------------------------------------------
# Run option 2: snp-major txt data (VCF)
#----------------------------------------------------------------------
@staticmethod
def runFromVCF():
'run pipeline on snp-major data'
Sample.loadDataFromVCF()
if (Sample.args.writeHaplogroupsRealTime
or Sample.args.haplogroupToListGenotypesFor):
Sample.sortSampleList(sortByPrevHg=True)
for sample in Sample.sampleList:
sample.callHaplogroup()
Sample.sortSampleList()
Sample.writeSampleList()
@staticmethod
def loadDataFromVCF():
'constructs list of sample objects, each with a genotype dictionary'
vcfFN = Sample.args.dataFN
vcfFile, vcfReader = utils.getCSVreader(vcfFN, delimiter='\t')
Sample.setSampleListFromVCFheader(vcfFile)
ref_geno_set = {'0', '0/0'}
alt_geno_set = {'1', '1/1'}
Sample.errAndLog('%sReading genotype data...\n %s\n\n' %
(utils.DASHES, vcfFN))
for lineList in vcfReader:
chromosome, position = lineList[0], int(lineList[1])
if (chromosome in Sample.config.vcf_chrom_label_set and
position in Sample.tree.snpPosSet):
genoList = lineList[Sample.config.vcfStartCol:]
for sample in Sample.sampleList:
genotype = genoList[sample.sampleIndex].split(':')[0]
if genotype == Sample.config.missingGenotype:
continue
elif Sample.config.runFromVCF: # as opposed to .vcf4
ref, alt = lineList[3:5]
if genotype in ref_geno_set:
genotype = ref
elif genotype in alt_geno_set:
genotype = alt
sample.addGeno(position, genotype)
vcfFile.close()
@staticmethod
def setSampleListFromVCFheader(vcfFile):
'''
skips over VCF metadata,
checks that the next line contains the column names,
extracts sample IDs,
and construct a list of sample objects
'''
for line in vcfFile:
if not line.startswith('##'):
break
lineList = line.strip().split('\t')
Sample.validateVCFheader(lineList)
idList = lineList[Sample.config.vcfStartCol:]
for sampleIndex, ID in enumerate(idList):
if not Sample.args.singleSampleID or ID == Sample.args.singleSampleID:
sample = Sample(ID, sampleIndex)
Sample.sampleList.append(sample)
@staticmethod
def validateVCFheader(lineList):
'checks that the second element of a list is POS'
col2label = lineList[1]
if col2label != 'POS':
sys.exit('ERROR. Invalid VCF. Expected column 2 header to be POS.\n' +
' Instead found: %s' % col2label)
# sample list manipulation and writing
#----------------------------------------------------------------------
@staticmethod
def sortSampleList(sortByPrevHg=False):
'''
sorts sample list:
1. primarily by haplogroup
2. secondarily by ID
If a previously called haplogroup is available and sortByPrevHg is True,
the primary sort uses it. Otherwise, if a newly called haplogroup is
available, the primary sort uses that. Otherwise, sorting is by ID only.
'''
Sample.sampleList = sorted(Sample.sampleList, key=attrgetter('ID'))
if sortByPrevHg and Sample.config.compareToPrevCalls:
Sample.sampleList = sorted(Sample.sampleList,
key=attrgetter('prevCalledHaplogroupDFSrank'))
elif Sample.sampleList[0].haplogroupNode:
Sample.sampleList = sorted(Sample.sampleList,
key=attrgetter('haplogroupDFSrank'))
@staticmethod
def writeSampleList():
'writes haplogroup and other optional data for each sample'
Sample.reportCounts()
Sample.errAndLog('%sOutput\n\n' % utils.DASHES)
if Sample.config.suppressOutputAndLog:
Sample.errAndLog('None (suppressed).\n\n')
else:
Sample.writeHaplogroups() # uses str(sample)
if Sample.args.writeAncDerCounts:
Sample.writeAncDerCounts() # uses sample.strForCounts()
if Sample.args.writeHaplogroupPathsDetail:
Sample.writeHaplogroupPaths(include_SNPs=True) # uses sample.strHaplogroupPath()
elif Sample.args.writeHaplogroupPaths:
Sample.writeHaplogroupPaths() # uses sample.strHaplogroupPath()
if Sample.args.writeDerSNPs:
Sample.writeSNPs() # uses sample.strSNPs(ancestral)
if Sample.args.writeDerSNPsDetail:
Sample.writeSNPsDetail() # uses sample.strCompressed()
if Sample.args.writeAncSNPs:
Sample.writeSNPs(ancestral=True)
if Sample.args.writeAncSNPsDetail:
Sample.writeSNPsDetail(ancestral=True)
@staticmethod
def reportCounts():
'report number assigned and number assigned to root'
Sample.errAndLog('%sCalled haplogroups:\n\n' % utils.DASHES +
' %8d assigned\n' % Sample.numAssigned +
' %8d assigned to root haplogroup: %s\n\n' %
(Sample.numRootCalls, Sample.tree.root.haplogroup))
if Sample.numRootCalls > 0:
Sample.warnVariantsOnlyData()
@staticmethod
def warnVariantsOnlyData():
'warning for datasets that exclude sites with no variation in the sample'
Sample.errAndLog(
'WARNING. If the dataset does not include fixed reference sites,\n' +
' re-run with alternative root (e.g., with: -r A0-T).\n\n\n')
@staticmethod
def writeHaplogroups():
'writes haplogroup of each sample'
with open(Sample.config.haplogroupCallsFN, 'w') as haplogroupCallsFile:
for sample in Sample.sampleList:
haplogroupCallsFile.write('%s\n' % str(sample))
Sample.errAndLog(
'Wrote called haplogroups:\n' +
' %s\n\n' % Sample.config.haplogroupCallsFN)
@staticmethod
def writeAncDerCounts():
'''
writes counts of ancestral and derived alleles encountered
at each node visited (excluding nodes with zero of each)
'''
with open(Sample.config.countsAncDerFN, 'w') as countsAncDerFile:
for sample in Sample.sampleList:
for node, numAncestral, numDerived in sample.ancDerCountTupleList:
if numAncestral > 0 or numDerived > 0:
countsAncDerFile.write('%-8s %-20s %3d %3d\n' %
(sample.ID, node.label, numAncestral, numDerived))
countsAncDerFile.write('%s\n\n' % sample.strForCounts())
Sample.errAndLog('Wrote counts of ancestral and derived alleles encountered\n' +
'at each node visited (excluding nodes with zero of each):\n' +
' %s\n\n' % Sample.config.countsAncDerFN)
@staticmethod
def writeHaplogroupPaths(include_SNPs=False):
'writes haplogroup path for each sample'
with open(Sample.config.haplogroupPathsFN, 'w') as haplogroupPathsFile:
for sample in Sample.sampleList:
path = sample.strHaplogroupPath(include_SNPs)
haplogroupPathsFile.write('%s\n' % path)
snps_included_text = ' and a list thereof' if include_SNPs else ''
Sample.errAndLog('Wrote sequences of haplogroups from root to calls,\n' +
'with counts of derived SNPs observed%s:\n' % snps_included_text +
' %s\n\n' % Sample.config.haplogroupPathsFN)
@staticmethod
def writeSNPs(ancestral=False):
'''
for each sample, writes list of derived SNPs on path
or list of ancestral SNPs encountered in search
'''
if ancestral:
snpFN = Sample.config.ancSNPsFN
typeOfSNPs = 'ancestral SNPs encountered in search'
else:
snpFN = Sample.config.derSNPsFN
typeOfSNPs = 'derived SNPs on path'
with open(snpFN, 'w') as snpFile:
for sample in Sample.sampleList:
snpFile.write('%s\n' % sample.strSNPs(ancestral))
Sample.errAndLog('Wrote lists of %s:\n %s\n\n' % (typeOfSNPs, snpFN))
@staticmethod
def writeSNPsDetail(ancestral=False):
'''
for each sample, writes detailed information about each
derived SNP on path or about each ancestral SNP encountered in search
'''
if ancestral:
snpDetailFN = Sample.config.ancSNPsDetailFN
typeOfSNPs = 'ancestral SNP encountered in search'
else:
snpDetailFN = Sample.config.derSNPsDetailFN
typeOfSNPs = 'derived SNP on path'
with open(snpDetailFN, 'w') as snpDetailFile:
for sample in Sample.sampleList:
snpDetailFile.write('%s\n' % sample.strCompressed())
snpList = sample.ancSNPlist if ancestral else sample.derSNPlist
for snp in snpList:
snpDetailFile.write('%-8s %s\n' % (sample.ID, snp))
snpDetailFile.write('\n')
Sample.errAndLog('Wrote detailed information about each %s:\n' % typeOfSNPs +
' %s\n\n' % snpDetailFN)
#--------------------------------------------------------------------------
class Customer(Sample):
'a "customer" is any sample whose genotypes are stored as 23andMe ablocks'
numNoAblock, numNoGenotypes = 0, 0
noAblocksFile, noGenotypesFile = None, None
def __init__(self, customerTuple):
self.customerTuple = customerTuple
Sample.__init__(self, customerTuple.resid)
def setPrevCalledHaplogroup(self):
'''
called from Customer.__init__() via Sample.__init__() (if config.compareToPrevCalls)
for testing/comparison, sets previously called haplogroup from
original 23andMe algorithm. does not set corresponding DFS rank
since the nomenclature has changed substantially
'''
self.prevCalledHaplogroup = self.customerTuple.y_haplogroup
def loadAblockAndCallHaplogroup(self, purgeGenotypes=True):
'loads ablock, reads relevant genotypes, calls haplogroup'
try:
ablock = Sample.config.ablockDS.load_block(self.ID)
except:
Customer.numNoAblock += 1
if Customer.noAblocksFile:
Customer.noAblocksFile.write('%d\n' % self.ID)
return False
if self.readAblockGenotypes(ablock):
self.callHaplogroup()
return True
else:
Customer.numNoGenotypes += 1
if Customer.noGenotypesFile:
Customer.noGenotypesFile.write('%d\n' % self.ID)
return False
def readAblockGenotypes(self, ablock):
'pulls phylogenetically informative genotypes from ablock'
hasGenotypes = False
for platformVersion in range(1, Sample.config.maxPlatformVersionPlusOne):
if getattr(self.customerTuple, 'is_v%d' % platformVersion):
hasGenotypes = True
platformSNPlist = PlatformSNP.platformSNPlistDict[platformVersion]
for platformSNP in platformSNPlist:
genotype = platformSNP.getConsensusGenotypeFromAblock(ablock)
self.addGeno(platformSNP.position, genotype)
return hasGenotypes
def fixHaplogroupIfArtifact(self):
'fixes artifactual haplogroup assignments'
for calledHaplogroup, replacementHaplogroup in (
six.iteritems(Sample.config.ttamHgCallReplacementDict)):
if self.haplogroup == calledHaplogroup:
self.haplogroupNode = Sample.tree.hg2nodeDict[replacementHaplogroup]
break
#----------------------------------------------------------------------
# Run option 3: sample-major 23andMe ablock data
#----------------------------------------------------------------------
@staticmethod
def runFromAblocks():
'run pipeline on sample-major 23andMe ablock data'
Customer.openAuxiliaryOutputFiles()
Sample.errAndLog('%sProcessing 23andMe customer data...\n\n' % utils.DASHES)
customerTupleList = Customer.buildCustomerTupleList()
Sample.errAndLog('\n%sCalling haplogroups...\n\n' % (utils.DASHES) +
' Progress...\n')
for customerTuple in customerTupleList:
customer = Customer(customerTuple)
if customer.loadAblockAndCallHaplogroup(purgeGenotypes=True):
Sample.sampleList.append(customer)
Customer.emitProgress()
Customer.closeAuxiliaryFilesAndReportCounts()
Sample.sortSampleList()
Sample.writeSampleList()
@staticmethod
def openAuxiliaryOutputFiles():
'open auxiliary output files'
if not Sample.config.suppressOutputAndLog:
Customer.noAblocksFile = open(Sample.config.noAblocksFN, 'w')
Customer.noGenotypesFile = open(Sample.config.noGenotypesFN, 'w')
@classmethod
def buildCustomerTupleList(cls):
'builds a list of CustomerTuple instances'
if Sample.args.ablockDSname:
customerTupleList = cls.buildCustomerTupleListFromFile()
else:
customerTupleList = cls.buildCustomerTupleListFromMetadata()
return customerTupleList
@staticmethod
def buildCustomerTupleListFromFile():
'''
builds a list of CustomerTuple instances from a two-column file.
column 1: ID
column 2: comma-separated list of platforms for this individual
example: Sample314159 1,2,5
'''
utils.checkFileExistence(Sample.args.dataFN, 'Sample IDs')
Sample.errAndLog('Reading sample IDs:\n %s\n' % Sample.args.dataFN)
customerTupleList = list()
IDset = set()
with open(Sample.args.dataFN, 'r') as idFile:
for line in idFile:
tokenList = line.strip().split()
if len(tokenList) != 2:
sys.exit('ERROR. When specifying non-default ablock dataset,\n' +
'ID file must have 2 columns: ID, comma-separated list of integers\n' +
'indicating platform versions.\n')
ID, platformVersions = tokenList
IDset.add(ID)
tupleKwargsDict = {
'resid': ID,
'y_haplogroup': Sample.config.missingHaplogroup, # previous call; not needed
}
platformVersionsSet = set([int(i) for i in platformVersions.split(',')])
for i in range(1, Sample.config.maxPlatformVersionPlusOne):
tupleKwargsDict['is_v%d' % i] = i in platformVersionsSet
customerTuple = Sample.config.CustomerTuple(**tupleKwargsDict)
customerTupleList.append(customerTuple)
Sample.errAndLog(' %8d read\n' % len(customerTupleList))
Sample.errAndLog(' %8d unique\n\n' % len(IDset))
return customerTupleList
@staticmethod
def buildCustomerTupleListFromMetadata():
'builds a list of CustomerTuple instances from customer metadata.'
import numpy as np
import pandas as pd
Sample.errAndLog('Building customer mask ... ')
metaDS = Sample.config.customerMetaDS
metaColList = Sample.config.customerMetaColList
prevHapCol = Sample.config.customerPrevHaplogroupCol
metaDF = pd.DataFrame(metaDS.load(metaColList))[metaColList]
metaDF[prevHapCol] = (
metaDS.load([prevHapCol])[prevHapCol] if Sample.config.compareToPrevCalls
else Sample.config.missingHaplogroup)
mask, maskType = Customer.buildCustomerMask(metaDF, np, pd)
metaDF = metaDF[mask]
customerTupleList = list()
for row in metaDF.itertuples():
customerTupleList.append(Sample.config.CustomerTuple(*row[1:]))
Sample.errAndLog('Done.\n' +
' %8d %s customers to be processed\n' % (len(customerTupleList), maskType))
return customerTupleList
@staticmethod
def buildCustomerMask(metaDF, np, pd):
'if resids have been specified, use those. otherwise, use all males.'
residList = Customer.generateResidList()
if residList:
maskType = 'specified'
mask = np.in1d(metaDF[Sample.config.customerIDcol], residList)
else:
maskType = 'male'
sexDF = pd.DataFrame(Sample.config.customerMetaDS.load(Sample.config.customerSexColList))
mask = np.ones(len(sexDF), dtype=bool)
for column in sexDF.columns:
mask = mask & (sexDF[column] == 'M')
return mask, maskType
@staticmethod
def generateResidList():
'''
4 possibilities:
residList specified at config instantiation
-a -s RESID -> a single research ID has been specified
-i FILENAME.resid.txt -> read research IDs from file
-a -> return empty list to indicate no subsetting
'''
residList = list()
if Sample.config.residList:
residList = Sample.config.residList
Sample.errAndLog('Research ID list supplied.\n' +
' %8d resids (%d unique)\n\n' % (len(residList), len(set(residList))))
elif Sample.args.singleSampleID:
resid = Customer.generateResid(Sample.args.singleSampleID)
residList = [resid]
Sample.errAndLog('Will call haplogroup for:\n %d\n\n' % resid)
elif Sample.args.dataFN:
utils.checkFileExistence(Sample.args.dataFN, 'Research IDs')
Sample.errAndLog('Reading research IDs:\n %s\n' % Sample.args.dataFN)
with open(Sample.args.dataFN, 'r') as residFile:
for line in residFile:
ID = line.strip().split()[0]
residList.append(Customer.generateResid(ID))
Sample.errAndLog(' %8d read\n' % len(residList))
Sample.errAndLog(' %8d unique\n\n' % len(set(residList)))
return residList
@staticmethod
def generateResid(ID):
'converts ID to integer, exiting gracefully if not possible'
try:
resid = int(ID)
except ValueError:
sys.exit('\nERROR. Cannot convert ID to integer: %s' % ID)
return resid
@staticmethod
def emitProgress():
'emit a message indicating how many haplogroups have been assigned thus far'
if (Sample.numAssigned in Sample.config.callingProgressEarlySet
or Sample.numAssigned % Sample.config.callingProgressInterval == 0):
Sample.errAndLog(' %8d haplogroups assigned\n' % Sample.numAssigned)
@staticmethod
def closeAuxiliaryFilesAndReportCounts():
'close auxiliary files and report counts'
utils.closeFiles([Customer.noAblocksFile, Customer.noGenotypesFile])
messageA = '\n %8d resid(s) ignored | could not load ablock:\n' % Customer.numNoAblock
messageB = ' %s\n' % Sample.config.noAblocksFN
messageC = ' %8d resid(s) ignored | no genotypes:\n' % Customer.numNoGenotypes
messageD = ' %s\n\n' % Sample.config.noGenotypesFN
if Sample.config.suppressOutputAndLog:
Sample.errAndLog(messageA + messageC + '\n')
else:
Sample.errAndLog(messageA + messageB + messageC + messageD)