forked from kkrizanovic/pyhera
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathPAFutils.py
115 lines (93 loc) · 4.14 KB
/
PAFutils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#! /usr/bin/python
# To enable importing from samscripts submodule
import os, sys
SCRIPT_PATH = os.path.dirname(os.path.realpath(__file__))
sys.path.append(os.path.join(SCRIPT_PATH, 'samscripts/src'))
import utility_sam
###################################################################
### PAF file structure
# 1. QNAME: Query name
# 2. QLEN: Query length
# 3. QSTART: Query start (0-based)
# 4. QEND: Query end (0-based)
# 5. STRAND: Relative strand '+' or '-'
# 6. TNAME: Target name
# 7. TLEN: Target length
# 8. TSTART: Target start on original strand (0-based)
# 9. TEND: Target end on original strand (0-based)
# 10. TRM: Number of residue matches
# 11. ABL: Alignment block length
# 12. MQUAL: Mapping quality (0-255; 255 for missing)
# OPTIONAL:
# SAM-like key-value pairs
# If PAF is generated from an alignment, column 10 equals the number of sequence matches,
# and column 11 equals the total number of sequence matches, mismatches, insertions and
# deletions in the alignment. If alignment is not available, column 10 and 11 are still
# required but may be highly inaccurate.
###################################################################
# Reads a PAF file and return mappings as PAF lines
# Last element contains attributes in the form of 'TAG:TYPE:VALUE'
def load_paf(paf_file):
paf_lines = []
attributes = {}
with open(paf_file, 'rU') as pfile:
for line in pfile:
# Ignoring header lines (copied from GTF)
if line.startswith('#') or line.startswith('track') or line.startswith('browser'):
pass
else:
elements = line.split('\t') # splitting with tab as delimitters
elcount = len(elements)
pafline = {}
attributes = {}
pafline['QNAME'] = elements[0]
pafline['QLEN'] = int(elements[1])
pafline['QSTART'] = int(elements[2])
pafline['QEND'] = int(elements[3])
pafline['STRAND'] = elements[4]
pafline['TNAME'] = elements[5]
pafline['TLEN'] = int(elements[6])
pafline['TSTART'] = int(elements[7])
pafline['TEND'] = int(elements[8])
pafline['NRM'] = int(elements[9])
pafline['ABL'] = int(elements[10])
pafline['MQUAL'] = int(elements[11])
if elcount > 12:
for i in range(12, elcount):
element = elements[i]
att_list = element.split(':')
attributes['TAG'] = att_list[0]
attributes['TYPE'] = att_list[1]
attributes['VALUE'] = att_list[2]
pafline['ATTRIB'] = attributes
paf_lines.append(pafline)
pfile.close()
return paf_lines
# Reads a SAM file and return mappings as PAF lines
# SAM file elements are converted to PAF attributes
def load_sam(sam_file):
paf_lines = []
attributes = {}
# Loading SAM file into hash
# Keeping only SAM lines with regular CIGAR string, and sorting them according to position
qnames_with_multiple_alignments = {}
[sam_hash, sam_hash_num_lines, sam_hash_num_unique_lines] = utility_sam.HashSAMWithFilter(sam_file, qnames_with_multiple_alignments)
for (qname, sam_lines) in sam_hash.iteritems():
for samline in sam_lines:
pafline = {}
attributes = {}
# pafline['QNAME'] = samline.qname
# pafline['QLEN'] = int(elements[1])
# pafline['QSTART'] = int(elements[2])
# pafline['QEND'] = int(elements[3])
# pafline['STRAND'] = elements[4]
# pafline['TNAME'] = samline.rname
# pafline['TLEN'] = int(elements[6])
# pafline['TSTART'] = int(elements[7])
# pafline['TEND'] = int(elements[8])
# pafline['NRM'] = int(elements[9])
# pafline['ABL'] = int(elements[10])
# pafline['MQUAL'] = int(elements[11])
pafline['ATTRIB'] = attributes
paf_lines.append(pafline)
return paf_lines