forked from kkrizanovic/RNAseqEval
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathreport.py
executable file
·460 lines (392 loc) · 21.7 KB
/
report.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
#! /usr/bin/python
class ReportType:
FASTA_REPORT = 0
MAPPING_REPORT = 1
ANNOTATION_REPORT = 2
TEMP_REPORT = 10 # Report used to temporarily store some data
class EvalReport:
def __init__(self, rtype = ReportType.FASTA_REPORT):
self.commandline = '' # Command which generated the report
# Basic information
self.rtype = rtype
# Reference information
self.chromlengths = {} # All chromosome lengths for the reference
self.reflength = 0
self.totalGeneLength = 0
# SAM file statistics
self.num_alignments = 0
self.num_unique_alignments = 0
self.num_multi_alignments = 0
self.num_possibly_split_alignements = 0 # Multiple alignments that are candidate split alignments
# Some aligners, such as BWA, do not generate split alignments
# using multiple alignments with clipping instead
self.num_split_alignments = 0
self.num_non_alignments = 0 # Alignments that do not have a CIGAR string
self.num_real_alignments = 0 # A final number of actual alignments with a correct CIGAR string
# To get this, alignments without a CIGAR string are dropped, several alignments of the
# same query are grouped into a split alignement and only the best alignment for each
# query is taken
self.num_real_split_alignments = 0
self.avg_mapping_quality = 0.0 # Not counting quality = 0
self.min_mapping_quality = 0
self.max_mapping_quality = 0
self.num_good_quality = 0 # Number of alignments with quality > 0
self.num_zero_quality = 0
# Maybe additional information should be considered
# i.e. avg indel length
self.num_match = -1
self.num_mismatch = -1
self.num_insert = -1
self.num_delete = -1
self.match_percentage = 0.0
self.mismatch_percentage = 0.0
self.insert_percentage = 0.0
self.delete_percentage = 0.0
# Information gained from annotation file
self.num_genes = 0
self.num_exons = 0
self.num_multiexon_genes = 0
self.max_exons_per_gene = 0 # ATM not used
self.min_gene_length = 0
self.max_gene_length = 0
self.avg_gene_length = 0.0
self.min_exon_length = 0
self.max_exon_length = 0
self.avg_exon_length = 0.0
# Information on groupped annotations / alternate splicings
self.num_annotation_groups = 0
self.num_alternate_spliced_genes = 0
self.max_spliced_alignments = 0
self.min_spliced_alignments = 0
self.max_spliced_exons = 0
self.min_spliced_exons = 0
# Advanced mapping information
self.allowed_inacc = 0
self.sum_read_length = 0
self.sum_bases_aligned = 0
self.percent_bases_aligned = 0.0
# Advanced mappin information for dfferent read categories
self.sum_read_length_correct = 0
self.sum_bases_aligned_correct = 0
self.percent_bases_aligned_correct = 0.0
self.sum_read_length_hitone = 0
self.sum_bases_aligned_hitone = 0
self.percent_bases_aligned_hitone = 0.0
self.num_genes_covered = 0 # How many genes from the annotation file were
# covered (intersected) by one or more alignments
self.num_missed_alignments = 0 # Alignements that missed genes, aligned outside genes
self.num_partial_alignments = 0 # partially overlapping genes
self.num_hit_alignments = 0 # completely inside genes
# Information on exons analogous to above information on genes
self.num_exons_covered = 0
self.num_exon_hit = 0
self.num_exon_partial = 0
self.num_exon_miss = 0
self.num_halfbases_hit = 0 # The number of reads whose alignment falling within an annotation
# covers at least half of the bases of the read
self.num_lowmatchcnt = 0 # The number of reads with a very low match count from extended CIGAR
# the number of matches is lower then the number of matches, inserts and deletes
self.num_partial_exon_miss = 0 # Then number of alignments that have a disjoined part completely missing an exon
# This disjoined part (partial alignment) needs to be greater than allowed_inaccuracy
# To be considered
self.num_almost_good = 0 # The number of alignments that would be contiguous, but have at least one partial alignment
# that missses all exons
self.num_extended_good = 0 # The number of alignments that are larger than the best match annotation (i.e. extend beyond
# annotation start or end), but otherwise match exon ends
self.num_multi_exon_alignments = 0 # alignments spanning multiple exons / this can be correct
self.num_multi_gene_alignments = 0 # Alignments that overlap more then one gene / this is definitely incorrect
self.num_inside_miss_alignments = 0 # Alignments that are inside genes, but missed all exons
self.num_bad_split_alignments = 0 # A number of alignments that are reported as split, but do not cover exons
# as expected
self.num_good_split_alignments = 0
# Statistical information on partial alignments
# TODO: have to see what would be interesting to put in here
self.avg_alignment_hit_perc = 0.0
self.avg_exon_hit_perc = 0.0
# Statistical information on split alignments
# NOTE: This variable already exists
# self.num_split_alignments
self.num_cover_all_exons = 0
self.num_cover_some_exons = 0
self.num_cover_no_exons = 0
self.num_multicover_exons = 0 # Exons covered by more than one part of a split alignment
self.num_equal_exons = 0 # Exons compeltely covered by a single alignment
self.num_partial_exons = 0 # Alignments partially covering an exon
self.num_undercover_alignments = 0
self.num_overcover_alignments = 0
self.num_possible_spliced_alignment = 0 # A number of alignments that are possibly spliced
# I.e. skipping an one or more exons between two covered exons
self.num_hit_all = 0 # A number of alignemnts that hit a series of exons without skipping any
# Each exon does not need to be covered correctly, only "hit"
# Should be fairly similar to Hit_all statistics for simulated reads
self.hit_all_percent = 0.0
# TODO: Some statistics about a precision of start and end exon points
self.num_good_starts = 0
self.num_good_ends = 0
# Final information that sums up the statistices
self.num_good_alignment = 0
self.num_bad_alignment = 0
self.good_alignment_percent = 0.0
self.bad_alignment_percent = 0.0
# Gene expression
# Looking at expressed genes, ones that overlap with at least one read in SAM file
# Storing them in a dictionary together with a number of hits for each exon in the gene
self.expressed_genes = {}
self.output_gene_expression = False
# Calculating gene coverage, how many bases of each gene and exon are covered by ready
# Bases covered multiple times are taken into account multiple times
# The structure of this dictionary is similar to expressed_genes above
# Each gene has one global counter (index 0), and one counter for each exon
self.gene_coverage = {}
# Alternate splicing
# Information on genes that have alternate splicing
self.alternate_splicing = {}
self.output_alternate_splicing = True
# Lists for saving qnames
# - qnames that overlap one exon
# - qnames that have contiguous alignment
# - qames that have incorrect alignment
self.hitone_names = []
self.hithalfbases_names = []
self.contig_names = []
self.incorr_names = []
self.unmapped_names = []
# Number of alignments after preprocessing - the number of alignments which are evaluated
self.num_evaluated_alignments = 0
# Potential new annotations
self.pot_new_annotations = []
self.alignments_with_pna = 0
self.detect_new_annotations = False
self.cna_count = {}
self.cna_readlist = {}
self.num_correct_new_annotations = 0
# DEBUGGING alignments and reads
self.reads2check = []
self.annotations2check = []
def chromosomes(self):
output = '\t\t'
for chrom in sorted(self.chromlengths.keys()):
chromlen = self.chromlengths[chrom]
output += "\t\t%s: %dbp\n" % (chrom, chromlen)
return output
def get_hitone_names(self):
report = ''
for name in self.hitone_names:
report += name + '\n'
return report
def get_hithalfbases_names(self):
report = ''
for name in self.hithalfbases_names:
report += name + '\n'
return report
def get_contig_names(self):
report = ''
for name in self.contig_names:
report += name + '\n'
return report
def get_incorr_names(self):
report = ''
for name in self.incorr_names:
report += name + '\n'
return report
def get_unmapped_names(self):
report = ''
for name in self.unmapped_names:
report += name + '\n'
return report
def getAnnotationReport(self):
report = ''
try:
for pna in self.pot_new_annotations:
name = pna.genename
reportline = "\nName: %s\nBased on:%s\nStrand: %s\nNumber of reads: %d\n" % (pna.genename, pna.source, pna.strand, self.cna_count[name])
reportline += "Type:%s\n" % pna.transcriptname
reportline += "Reads:\n"
for rname in self.cna_readlist[name]:
reportline += "\n" + rname
reportline += "\nItems:"
for item in pna.items:
reportline += " [%d, %d]" % (item.start, item.end)
reportline += "\n"
report += reportline
except Exception:
import pdb
pdb.set_trace()
return report
# New toString function for printing out reports
# Values reported are consistent with the benchmark paper!
def toString(self):
if self.rtype == ReportType.FASTA_REPORT:
report = """\n
Reference format: FASTA
General information:
Reference length = %d bp
Number of chromosomes = %d
Chromosomes:\n%s
Number of alignments (total / unique) = %d / %d
Alignments with / without CIGAR string = %d / %d
Mapping quality without zeroes (avg / min / max) = %.2f / %d / %d
Alignments with mapping quality (>0 / =0) = %d / %d
Number of matches / mismatches / inserts / deletes = %d / %d / %d / %d
Percentage of matches / mismatches / inserts / deletes = %.2f / %.2f / %.2f / %.2f
""" % (self.reflength, len(self.chromlengths), self.chromosomes(), \
self.num_alignments, self.num_unique_alignments, \
self.num_alignments - self.num_non_alignments, self.num_non_alignments, \
self.avg_mapping_quality, self.min_mapping_quality, self.max_mapping_quality, self.num_good_quality, self.num_zero_quality, \
self.num_match, self.num_mismatch, self.num_insert, self.num_delete, \
self.match_percentage, self.mismatch_percentage, self.insert_percentage, self.delete_percentage)
return report + '\n'
elif self.rtype == ReportType.MAPPING_REPORT:
report = """\n
Command Line: %s
Reference format: ANNOTATION
General information:
Reference length = %d bp
Number of chromosomes = %d
Chromosomes:
%s
Number of alignments in SAM file (total / unique) = %d / %d
Alignments with / without CIGAR string = %d / %d
Mapping quality without zeroes (avg / min / max) = %.2f / %d / %d
Alignments with mapping quality (>0 / =0) = %d / %d
Number of matches / mismatches / inserts / deletes = %d / %d / %d / %d
Percentage of matches / mismatches / inserts / deletes = %.2f / %.2f / %.2f / %.2f
""" % (self.commandline, self.reflength, len(self.chromlengths), self.chromosomes(), \
self.num_alignments, self.num_unique_alignments, \
self.num_alignments - self.num_non_alignments, self.num_non_alignments, \
self.avg_mapping_quality, self.min_mapping_quality, self.max_mapping_quality, self.num_good_quality, self.num_zero_quality, \
self.num_match, self.num_mismatch, self.num_insert, self.num_delete, \
self.match_percentage, self.mismatch_percentage, self.insert_percentage, self.delete_percentage)
report += """\n
Annotation statistics:
Total gene length = %d
Number of Transcripts / Exons (Multiexon transcripts) = %d / %d (%d)
Maximum number of exons in a gene = %d
Gene size (Min / Max / Avg) = %d / %d / %.2f
Exon size (Min / Max / Avg) = %d / %d / %.2f
""" % (self.totalGeneLength, self.num_genes, self.num_exons, self.num_multiexon_genes, self.max_exons_per_gene, \
self.min_gene_length, self.max_gene_length, self.avg_gene_length, \
self.min_exon_length, self.max_exon_length, self.avg_exon_length)
# report += """\n
# Grouped annotation (alternate splicing) statistics:
# Number of annotation groups (genes) = %d
# Number of genes with alternate splicing = %d
# Maximum / minimum number of alternate spliced alignments for a gene = %d / %d
# Maximum / minimum number of exons in spliced alignments = %d / %d
# """ % (self.num_annotation_groups, self.num_alternate_spliced_genes, \
# self.max_spliced_alignments, self.min_spliced_alignments, \
# self.max_spliced_exons, self.min_spliced_exons)
report += """\n
Mapping quality information:
Bases in reads (aligned / total) (percent) = (%d / %d) (%.2f%%)
Bases in reads (correct) (aligned / total) (percent) = (%d / %d) (%.2f%%)
Bases in reads (exon hit) (aligned / total) (percent) = (%d / %d) (%.2f%%)
Transcripts covered / missed / total = %d / %d / %d
Exons covered / missed / total = %d / %d / %d
Total number of evaluated alignments = %d
Alignments on transcript hit / missed = %d / %d
Alignments on exons hit / missed = %d / %d
Alignments for reads where more than 50%% bases falls within an annotation = %d
Alignments with low match count (lower than mismatch, insert and delete combined) = %d
Alignments hitting an exon (start / end / both) = %d / %d / %d
Alignments with a partial miss (not good / "almost "good) = %d / %d
"Extended good" alignments = %d
Contiguous / non contiguous alignments: %d (%.2f%%) / %d (%.2f%%)
Hit all for real reads: %d (%.2f%%)
""" % (self.sum_bases_aligned, self.sum_read_length, self.percent_bases_aligned, \
self.sum_bases_aligned_correct, self.sum_read_length_correct, self.percent_bases_aligned_correct, \
self.sum_bases_aligned_hitone, self.sum_read_length_hitone, self.percent_bases_aligned_hitone, \
self.num_genes_covered, self.num_genes - self.num_genes_covered, self.num_genes, \
self.num_exons_covered, self.num_exons - self.num_exons_covered, self.num_exons, \
self.num_evaluated_alignments, \
self.num_hit_alignments, self.num_missed_alignments, \
self.num_exon_hit, self.num_exon_miss, \
self.num_halfbases_hit, \
self.num_lowmatchcnt, \
self.num_good_starts, self.num_good_ends, self.num_equal_exons, \
self.num_partial_exon_miss, self.num_almost_good, \
self.num_extended_good, \
self.num_good_alignment, self.good_alignment_percent, self.num_bad_alignment, self.bad_alignment_percent, \
self.num_hit_all, self.hit_all_percent)
# Counting the number of expressed genes
# This has already been written in the report, but this way the value can be double checked
exp_gn_cnt = 0
for expression in self.expressed_genes.values():
if expression[0] > 0:
exp_gn_cnt += 1
if self.output_gene_expression:
report += """\n
Transcript/exon expression and coverage information:
Number of expressed Transcripts = %d
Expressed transcripts:
genename number_of_exons gene_hits / gene_covered_bases [(exon_hits / exon_covered_bases)]...
""" % exp_gn_cnt
if len(self.expressed_genes) != len(self.gene_coverage):
raise Exception('ERROR: Gene expression and gene coverage dictionaries do not match in length! (%d <> %d)' \
% (len(self.expressed_genes), len(self.gene_coverage)))
for genename in self.expressed_genes.keys():
numexons = len(self.expressed_genes[genename]) - 1
genehits = self.expressed_genes[genename][0]
gene_cov_bs = self.gene_coverage[genename][0]
if genehits > 0:
reportline = '%s %d %d %d' % (genename, numexons, genehits, gene_cov_bs)
for i in range(1, numexons+1):
exonhits = self.expressed_genes[genename][i]
exon_cov_bs = self.gene_coverage[genename][i]
reportline += ' (%d / %d)' % (exonhits, exon_cov_bs)
report += reportline + '\n'
if self.detect_new_annotations:
if len(self.pot_new_annotations) > 0:
report += """\n
Found %d potential new annotations with %d alignments
Number of new correct alignments for new annotations: %d correct
""" % (len(self.pot_new_annotations), self.alignments_with_pna, self.num_correct_new_annotations)
report += """\nDetailed report on annotations can be found in an '_annotations.report' file.\n"""
else:
report += """\n
Found NO potential new annotations:
"""
return report + '\n'
elif self.rtype == ReportType.ANNOTATION_REPORT:
report = """\n
Command Line: %s
Reference format: ANNOTATION
General information:
Reference length = %d bp
Number of chromosomes = %d
Chromosomes:
%s
""" % (self.commandline, self.reflength, len(self.chromlengths), self.chromosomes())
report += """\n
Annotation statistics:
Total gene length = %d
Number of Genes / Exons (Multiexon genes) = %d / %d (%d)
Maximum number of exons in a gene = %d
Gene size (Min / Max / Avg) = %d / %d / %.2f
Exon size (Min / Max / Avg) = %d / %d / %.2f
""" % (self.totalGeneLength, self.num_genes, self.num_exons, self.num_multiexon_genes, self.max_exons_per_gene, \
self.min_gene_length, self.max_gene_length, self.avg_gene_length, \
self.min_exon_length, self.max_exon_length, self.avg_exon_length)
# report += """\n
# Grouped annotation (alternate splicing) statistics:
# Number of annotation groups (genes) = %d
# Number of genes with alternate splicing = %d
# Maximum / minimum number of alternate spliced alignments for a gene = %d / %d
# """ % (self.num_annotation_groups, self.num_alternate_spliced_genes, \
# self.max_spliced_alignments, self.min_spliced_alignments)
if self.output_alternate_splicing:
report += """\n
Information on genes with alternate splicing:
Number of genes with alternate splicing = %d
Alternate splicing genes:
genename: [transcript name (number of exons)]...
""" % len(self.alternate_splicing)
for genename, alternate_splicing_info in self.alternate_splicing.items():
report += '%s: %s\n' % (genename, alternate_splicing_info)
return report + '\n'
elif self.rtype == ReportType.TEMP_REPORT:
return "\nTEMP report! Nothing to report!\n"
else:
return "\nERROR: Report not initialized!\n"
if __name__ == "__main__":
pass;