-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathseqstat.c
125 lines (112 loc) · 3.98 KB
/
seqstat.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
/* File: seqstat.c
* Author: Richard Durbin ([email protected])
* Copyright (C) Richard Durbin, Cambridge University, 2018
*-------------------------------------------------------------------
* Description:
* Exported functions:
* HISTORY:
* Last edited: Sep 28 00:51 2024 (rd109)
* * May 19 09:12 2024 (rd109): renamed composition to seqstat, for consistency
* Created: Sun Nov 11 17:21:40 2018 (rd109)
*-------------------------------------------------------------------
*/
#include "seqio.h"
#include "array.h"
#include <stdbool.h>
#include <ctype.h>
#include <math.h>
static int lengthBins = 20 ;
void usage (void)
{
fprintf (stderr, "Usage: seqstat [opts] <filename>\n") ;
fprintf (stderr, " will read fasta, fastq, bam/sam/cram, 1code, custom-binary. Use filename '-' for stdin (not 1code binary)\n") ;
fprintf (stderr, " options:\n") ;
fprintf (stderr, " -b : show base counts\n") ;
fprintf (stderr, " -q : show quality counts\n") ;
fprintf (stderr, " -t : show time and memory used\n") ;
fprintf (stderr, " -l : show length distribution in up to %d quadratic bins\n", lengthBins) ;
exit (0) ;
}
int main (int argc, char *argv[])
{
--argc ; ++argv ;
U64 *totBase = 0, *totQual = 0 ;
bool isTime = false ;
Array lengthCount = 0, lengthSum = 0 ;
if (!argc) usage () ;
while (argc && **argv == '-' && strcmp (*argv, "-"))
if (!strcmp (*argv, "-b")) { totBase = new0 (256, U64) ; --argc ; ++argv ; }
else if (!strcmp (*argv, "-q")) { totQual = new0 (256, U64) ; --argc ; ++argv ; }
else if (!strcmp (*argv, "-t")) { isTime = true ; --argc ; ++argv ; }
else if (!strcmp (*argv, "-l"))
{ lengthCount = arrayCreate (10000, int) ;
lengthSum = arrayCreate (10000, U64) ;
--argc ; ++argv ;
}
else usage () ;
if (isTime) timeUpdate (stdout) ;
SeqIO *si = seqIOopenRead (*argv, 0, true) ;
if (!si) die ("failed to open sequence file %s\n", *argv) ;
U64 lenMin = 0, lenMax = 0, totLen = 0 ;
int i ;
while (seqIOread (si))
{ char *s = sqioSeq(si), *e = s + si->seqLen ;
if (totBase) while (s < e) ++totBase[(int)*s++] ;
totLen += si->seqLen ;
if (si->seqLen > lenMax) lenMax = si->seqLen ;
if (!lenMin || si->seqLen < lenMin) lenMin = si->seqLen ;
if (lengthCount)
{ i = 10.*sqrt((double)si->seqLen) ;
++array(lengthCount, i, int) ; array(lengthSum, i, U64) += si->seqLen ;
}
if (totQual && si->isQual)
{ char *q = sqioQual(si), *e = q + si->seqLen ;
while (q < e) ++totQual[(int)*q++] ;
}
}
printf ("%s file, %llu sequences >= 0, %llu total, %.2f average, %llu min, %llu max\n",
seqIOtypeName[si->type], si->nSeq, totLen, totLen / (double) si->nSeq, lenMin, lenMax) ;
if (totBase)
{ printf ("bases\n") ;
for (i = 0 ; i < 256 ; ++i)
if (totBase[i])
{ if (isprint(i))
printf (" %c %llu %4.1f %%\n", i, totBase[i], totBase[i]*100.0/totLen) ;
else
printf (" unprint-%d %llu %4.1f %%\n", i, totBase[i], totBase[i]*100.0/totLen) ;
}
free (totBase) ;
}
if (totQual && si->isQual)
{ printf ("qualities\n") ;
U64 sum = 0 ;
for (i = 0 ; i < 256 ; ++i)
{ sum += totQual[i] ;
if (totQual[i]) printf (" %3d %llu %4.1f %% %5.1f %%\n",
i, totQual[i], totQual[i]*100.0/totLen, sum*100.0/totLen) ;
}
free (totQual) ;
}
if (lengthSum)
{ if (lenMin < lenMax)
{ U64 tot50 = 0 ;
for (i = 0 ; i < arrayMax(lengthSum) && tot50 < 0.5*totLen ; ++i)
tot50 += arr(lengthSum, i, U64) ;
printf ("approximate N50 %ld\n", ((long)i*(long)(i+1))/100) ;
printf ("length distribution (quadratic bins)\n") ;
int s = 0 ;
int d = arrayMax(lengthCount) / 20 ;
for (i = 0 ; i < arrayMax(lengthCount) ; ++i)
{ s += arr(lengthCount, i, int) ;
if (s && !((arrayMax(lengthCount)-1-i) % d))
{ printf (" %lld\t%d\n", (i*(U64)i)/100, s) ;
s = 0 ;
}
}
}
arrayDestroy (lengthCount) ; arrayDestroy (lengthSum) ;
}
if (isTime) timeTotal (stdout) ;
seqIOclose (si) ;
}
/****************/