-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsum_up_snv.awk
137 lines (118 loc) · 4.15 KB
/
sum_up_snv.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/awk -f
#
# Script for summarizing variation ant single specified position in SAM/BAM file.
# Tested on GNU Awk 5.0.1.
#
# Version 0.2.a
# 2024-03-27
#
# == Usage ==
# 1
# Basic usage:
# samtools view <BAM/SAM> | awk -f sum_up_snv.awk pos=<POS>
# Example:
# samtools view my_mapping.bam | awk -f sum_up_snv.awk pos=1067
# Verbose: for each read print a tab-separated table having three columns: read_name, base, base_quality
# samtools view my_mapping.bam | awk -f sum_up_snv.awk pos=1067 verbose=1
# 2
# Faster -- specify reference sequence name:
# samtools view <BAM/SAM> <RNAME> | awk -f sum_up_snv.awk pos=<POS>
# Example:
# samtools view my_mapping.bam chr1 | awk -f sum_up_snv.awk pos=1067
# 3
# Even faster -- specify single-base region:
# samtools view <BAM/SAM> <RNAME>:<POS>-<POS> | awk -f sum_up_snv.awk pos=<POS>
# Example:
# samtools view my_mapping.bam chr1:1067-1067 | awk -f sum_up_snv.awk pos=1067
BEGIN{
cigar_operations="MIDHS";
a_count = 0;
t_count = 0;
g_count = 0;
c_count = 0;
del_count = 0;
for (ascii_code = 0; ascii_code < 256; ascii_code++) {
ord[sprintf("%c", ascii_code)] = ascii_code;
}
}
{
# $4 -- alignment start pos; $10 -- read sequence.
# If read does not map to `pos` -- omit it immidiately.
if ($4 > pos || $4 + length($10) <= pos) {
next; # go to next line in bam/sam file
}
start_aln = $4;
seq = toupper($10);
cigar = $6;
len_cigar = length(cigar);
# Position "in reference axis"
ref_pos = start_aln - 1;
# Position "in read axis"
read_pos = 0;
# Current CIGAR operation, e.g. '163M'
curr_cirar = "";
# Length of `curr_cirar`, e.g. '163'
curr_operation_len = "";
# Go through CIGAR string in order to take into account all indels and hard/soft clips
for (i = 1; i <= len_cigar; i++) {
# Current character of CIGAR string
chr = substr(cigar, i, 1);
# Update CIGAR
curr_cirar = sprintf("%s%s", curr_cirar, chr);
# If `chr` is a letter -- it's a CIGAR operation
if ( index(cigar_operations, chr) != 0 ) {
curr_operation_len = substr(curr_cirar, 1, length(curr_cirar)-1);
# Go further "in reference axis"
if (chr == "M" || chr == "D") {
ref_pos += curr_operation_len;
}
# Go further "in read axis"
if (chr != "D" && chr != "H") {
read_pos += curr_operation_len;
}
# If we reach our `pos`
if (ref_pos >= pos) {
# Not a deletion (not "*") but a base
if (chr != "D") {
base_pos = read_pos - (ref_pos-pos);
base = substr(seq, base_pos, 1);
if (verbose) {
qual_encoded = substr($11, base_pos, 1);
qual = ord[qual_encoded] - 33;
printf("%s\t%s\t%d\n", $1, base, qual);
}
if (base == "A") {
a_count++;
} else if (base == "T") {
t_count++;
} else if (base == "G") {
g_count++;
} else if (base == "C") {
c_count++;
}
} else {
# Deletion -- "*"
del_count++;
if (verbose) {
printf("%s\t*\t-\n", $1)
}
}
break;
}
# Clear CIGAR
curr_cirar = "";
curr_operation_len = "";
}
}
}
END{
cov = a_count + t_count + g_count + c_count + del_count;
printf("Position %d:\n", pos)
printf("Coverage: %d\n", cov)
cov_ratio = 100 / cov;
printf("A:%5d | %5.1f%%\n", a_count, a_count * cov_ratio);
printf("T:%5d | %5.1f%%\n", t_count, t_count * cov_ratio);
printf("G:%5d | %5.1f%%\n", g_count, g_count * cov_ratio);
printf("C:%5d | %5.1f%%\n", c_count, c_count * cov_ratio);
printf("*:%5d | %5.1f%%\n", del_count, del_count * cov_ratio);
}