This repository has been archived by the owner on Dec 4, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
cs2nt.c
196 lines (182 loc) · 5.92 KB
/
cs2nt.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include "bwtaln.h"
#include "stdaln.h"
#include "dbset.h"
/*
Here is a delicate example. ref_nt=ATTAAC(RBRBG), read_cs=RBBOG. If we
decode as ATTGAC(RBGOG), there are one color change and one nt change;
if we decode as ATTAAC(RBRBG), there are two color changes.
In DP, if color quality is smaller than COLOR_MM, we will use COLOR_MM
as the penalty; otherwise, we will use color quality as the
penalty. This means we always prefer two consistent color changes over
a nt change, but if a color has high quality, we may prefer one nt
change.
In the above example, the penalties of the two types of decoding are
q(B)+25 and q(B)+q(O), respectively. If q(O)>25, we prefer the first;
otherwise the second. Note that no matter what we choose, the fourth
base will get a low nt quality.
*/
#define COLOR_MM 19
#define NUCL_MM 25
static const int nst_ntnt2cs_table[] = { 4, 0, 0, 1, 0, 2, 3, 4, 0, 3, 2, 4, 1, 4, 4, 4 };
/*
{A,C,G,T,N} -> {0,1,2,3,4}
nt_ref[0..size]: nucleotide reference: 0/1/2/3/4
cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
nt_read[0..size]: nucleotide read sequence: 0/1/2/3 (returned)
btarray[0..4*size]: backtrack array (working space)
*/
void cs2nt_DP(int size, const uint8_t *nt_ref, const uint8_t *cs_read, uint8_t *nt_read, uint8_t *btarray)
{
int h[8], curr, last;
int x, y, xmin, hmin, k;
// h[0..3] and h[4..7] are the current and last best score array, depending on curr and last
// recursion: initial value
if (nt_ref[0] >= 4) memset(h, 0, sizeof(int) << 2);
else {
for (x = 0; x != 4; ++x) h[x] = NUCL_MM;
h[nt_ref[0]] = 0;
}
// recursion: main loop
curr = 1; last = 0;
for (k = 1; k <= size; ++k) {
for (x = 0; x != 4; ++x) {
int min = 0x7fffffff, ymin = 0;
for (y = 0; y != 4; ++y) {
int s = h[last<<2|y];
if ((cs_read[k-1]&0x3f) != 63 && cs_read[k-1]>>6 != nst_ntnt2cs_table[1<<x|1<<y])
s += ((cs_read[k-1]&0x3f) < COLOR_MM)? COLOR_MM : (cs_read[k-1]&0x3f); // color mismatch
if (nt_ref[k] < 4 && nt_ref[k] != x) s += NUCL_MM; // nt mismatch
if (s < min) {
min = s; ymin = y;
}
}
h[curr<<2|x] = min; btarray[k<<2|x] = ymin;
}
last = curr; curr = 1 - curr; // swap
}
// back trace
hmin = 0x7fffffff; xmin = 0;
for (x = 0; x != 4; ++x) {
if (h[last<<2|x] < hmin) {
hmin = h[last<<2|x]; xmin = x;
}
}
nt_read[size] = xmin;
for (k = size - 1; k >= 0; --k)
nt_read[k] = btarray[(k+1)<<2 | nt_read[k+1]];
}
/*
nt_read[0..size]: nucleotide read sequence: 0/1/2/3
cs_read[0..size-1]: color read+qual sequence: base<<6|qual; qual==63 for N
tarray[0..size*2-1]: temporary array
*/
uint8_t *cs2nt_nt_qual(int size, const uint8_t *nt_read, const uint8_t *cs_read, uint8_t *tarray)
{
int k, c1, c2;
uint8_t *t2array = tarray + size;
// get the color sequence of nt_read
c1 = nt_read[0];
for (k = 1; k <= size; ++k) {
c2 = nt_read[k]; // in principle, there is no 'N' in nt_read[]; just in case
tarray[k-1] = (c1 >= 4 || c2 >= 4)? 4 : nst_ntnt2cs_table[1<<c1 | 1<<c2];
c1 = c2;
}
for (k = 1; k != size; ++k) {
int q = 0;
if (tarray[k-1] == cs_read[k-1]>>6 && tarray[k] == cs_read[k]>>6) {
q = (int)(cs_read[k-1]&0x3f) + (int)(cs_read[k]&0x3f) + 10;
} else if (tarray[k-1] == cs_read[k-1]>>6) {
q = (int)(cs_read[k-1]&0x3f) - (int)(cs_read[k]&0x3f);
} else if (tarray[k] == cs_read[k]>>6) {
q = (int)(cs_read[k]&0x3f) - (int)(cs_read[k-1]&0x3f);
} // else, q = 0
if (q < 0) q = 0;
if (q > 60) q = 60;
t2array[k] = nt_read[k]<<6 | q;
if ((cs_read[k-1]&0x3f) == 63 || (cs_read[k]&0x3f) == 63) t2array[k] = 0;
}
return t2array + 1; // of size-2
}
// this function will be called when p->seq has been reversed by refine_gapped()
void bwa_cs2nt_core(bwa_seq_t *p, dbset_t *dbs)
{
uint8_t *ta, *nt_read, *btarray, *tarray, *nt_ref, *cs_read, *new_nt_read;
int i, len;
uint8_t *seq;
// set temporary arrays
if (p->type == BWA_TYPE_NO_MATCH) return;
len = p->len + p->n_gapo + p->n_gape + 100; // leave enough space
ta = (uint8_t*)malloc(len * 7);
nt_ref = ta;
cs_read = nt_ref + len;
nt_read = cs_read + len;
btarray = nt_read + len;
tarray = nt_read + len;
#define __gen_csbase(_cs, _i, _seq) do { \
int q = p->qual[p->strand? p->len - 1 - (_i) : (_i)] - 33; \
if (q > 60) q = 60; \
if (_seq[_i] > 3) q = 63; \
(_cs) = _seq[_i]<<6 | q; \
} while (0)
// generate len, nt_ref[] and cs_read
seq = p->strand? p->rseq : p->seq;
if (p->pos) {
dbset_extract_sequence(dbs, dbs->ntbns, nt_ref, p->pos-1, 1);
} else {
nt_ref[0] = 4;
}
if (p->cigar == 0) { // no gap or clipping
len = p->len;
dbset_extract_sequence(dbs, dbs->ntbns, nt_ref+1, p->pos, p->len);
for (i = 0; i < p->len; ++i) {
__gen_csbase(cs_read[i], i, seq);
}
} else {
int k, z;
bwtint_t x, y;
x = p->pos; y = 0;
for (k = z = 0; k < p->n_cigar; ++k) {
int l = __cigar_len(p->cigar[k]);
if (__cigar_op(p->cigar[k]) == FROM_M) {
dbset_extract_sequence(dbs, dbs->ntbns, nt_ref+z+1, x, l);
for (i = 0; i < l; ++i, ++x, ++y) {
__gen_csbase(cs_read[z], y, seq);
++z;
}
} else if (__cigar_op(p->cigar[k]) == FROM_I) {
for (i = 0; i < l; ++i, ++y) {
__gen_csbase(cs_read[z], y, seq);
nt_ref[z+1] = 4;
++z;
}
} else if (__cigar_op(p->cigar[k]) == FROM_S) y += l;
else x += l;
}
len = z;
}
cs2nt_DP(len, nt_ref, cs_read, nt_read, btarray);
new_nt_read = cs2nt_nt_qual(len, nt_read, cs_read, tarray);
// update p
p->len = p->full_len = len - 1;
for (i = 0; i < p->len; ++i) {
if ((new_nt_read[i]&0x3f) == 63) {
p->qual[i] = 33; seq[i] = 4;
} else {
p->qual[i] = (new_nt_read[i]&0x3f) + 33;
seq[i] = new_nt_read[i]>>6;
}
}
p->qual[p->len] = seq[p->len] = 0;
if (p->strand) {
memcpy(p->seq, seq, p->len);
seq_reverse(p->len, p->seq, 1);
seq_reverse(p->len, p->qual, 0);
} else {
memcpy(p->rseq, seq, p->len);
seq_reverse(p->len, p->rseq, 1);
}
free(ta);
}