Skip to content

Commit

Permalink
* added bam_verbose global variable
Browse files Browse the repository at this point in the history
 * fixed a bug when grouping reads in mpileup (due to recent changes)
 * do not validate SAM header
 * estimate genotype frequency
 * added HWE test back, but with a different method
  • Loading branch information
Heng Li committed Apr 8, 2011
1 parent e6ca3f8 commit 50706be
Show file tree
Hide file tree
Showing 10 changed files with 114 additions and 18 deletions.
2 changes: 1 addition & 1 deletion bam.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
#include "kstring.h"
#include "sam_header.h"

int bam_is_be = 0;
int bam_is_be = 0, bam_verbose = 2;
char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";

/**************************
Expand Down
8 changes: 7 additions & 1 deletion bam.h
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
@copyright Genome Research Ltd.
*/

#define BAM_VERSION "0.1.14 (r933:176)"
#define BAM_VERSION "0.1.14 (r947:199)"

#include <stdint.h>
#include <stdlib.h>
Expand Down Expand Up @@ -264,6 +264,12 @@ typedef struct __bam_iter_t *bam_iter_t;
*/
extern int bam_is_be;

/*!
@abstract Verbose level between 0 and 3; 0 is supposed to disable all
debugging information, though this may not have been implemented.
*/
extern int bam_verbose;

/*! @abstract Table for converting a nucleotide character to the 4-bit encoding. */
extern unsigned char bam_nt16_table[256];

Expand Down
2 changes: 1 addition & 1 deletion bam2depth.c
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ int main_depth(int argc, char *argv[])
while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position
if (pos < beg || pos >= end) continue; // out of range; skip
if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip
printf("%s\t%d", h->target_name[tid], pos+1);
fputs(h->target_name[tid], stdout); printf("\t%d", pos+1);
for (i = 0; i < n; ++i) {
int j, m = 0;
for (j = 0; j < n_plp[i]; ++j) {
Expand Down
4 changes: 2 additions & 2 deletions bam_plcmd.c
Original file line number Diff line number Diff line change
Expand Up @@ -623,7 +623,7 @@ static void group_smpl(mplp_pileup_t *m, bam_sample_t *sm, kstring_t *buf,
if (id < 0) id = bam_smpl_rg2smid(sm, fn[i], 0, buf);
if (id < 0 || id >= m->n) {
assert(q); // otherwise a bug
fprintf(stderr, "[%s] Read group %s used in file %s but not defined in the header.\n", __func__, (char*)q+1, fn[i]);
fprintf(stderr, "[%s] Read group %s used in file %s but absent from the header or an alignment missing read group.\n", __func__, (char*)q+1, fn[i]);
exit(1);
}
if (m->n_plp[id] == m->m_plp[id]) {
Expand Down Expand Up @@ -742,7 +742,7 @@ static int mpileup(mplp_conf_t *conf, int n, char **fn)
fprintf(stderr, "(%s) Max depth is above 1M. Potential memory hog!\n", __func__);
if (max_depth * sm->n < 8000) {
max_depth = 8000 / sm->n;
fprintf(stderr, "<%s> Set max per-sample depth to %d\n", __func__, max_depth);
fprintf(stderr, "<%s> Set max per-file depth to %d\n", __func__, max_depth);
}
max_indel_depth = conf->max_indel_depth * sm->n;
bam_mplp_set_maxcnt(iter, max_depth);
Expand Down
29 changes: 27 additions & 2 deletions bcftools/call1.c
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,23 @@ void *bed_read(const char *fn);
void bed_destroy(void *_h);
int bed_overlap(const void *_h, const char *chr, int beg, int end);

static double test_hwe(const double g[3])
{
extern double kf_gammaq(double p, double x);
double fexp, chi2, f[3], n;
int i;
n = g[0] + g[1] + g[2];
fexp = (2. * g[2] + g[1]) / (2. * n);
if (fexp > 1. - 1e-10) fexp = 1. - 1e-10;
if (fexp < 1e-10) fexp = 1e-10;
f[0] = n * (1. - fexp) * (1. - fexp);
f[1] = n * 2. * fexp * (1. - fexp);
f[2] = n * fexp * fexp;
for (i = 0, chi2 = 0.; i < 3; ++i)
chi2 += (g[i] - f[i]) * (g[i] - f[i]) / f[i];
return kf_gammaq(.5, chi2 / 2.);
}

typedef struct {
double p[4];
int mq, depth, is_tested, d[4];
Expand Down Expand Up @@ -117,7 +134,11 @@ static int update_bcf1(int n_smpl, bcf1_t *b, const bcf_p1aux_t *pa, const bcf_p
kputs(b->info, &s);
if (b->info[0]) kputc(';', &s);
// ksprintf(&s, "AF1=%.4lg;AFE=%.4lg;CI95=%.4lg,%.4lg", 1.-pr->f_em, 1.-pr->f_exp, pr->cil, pr->cih);
ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih);
ksprintf(&s, "AF1=%.4g;CI95=%.4g,%.4g;G3=%.4g,%.4g,%.4g", 1.-pr->f_em, pr->cil, pr->cih, pr->g[2], pr->g[1], pr->g[0]);
if (n_smpl > 5) {
double hwe = test_hwe(pr->g);
if (hwe < 0.1) ksprintf(&s, ";HWE=%.4g", hwe);
}
if (has_I16) ksprintf(&s, ";DP4=%d,%d,%d,%d;MQ=%d", a.d[0], a.d[1], a.d[2], a.d[3], a.mq);
fq = pr->p_ref_folded < 0.5? -4.343 * log(pr->p_ref_folded) : 4.343 * log(pr->p_var_folded);
if (fq < -999) fq = -999;
Expand Down Expand Up @@ -216,6 +237,10 @@ static void write_header(bcf_hdr_t *h)
kputs("##INFO=<ID=FQ,Number=1,Type=Float,Description=\"Phred probability of all samples being the same\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=AF1,"))
kputs("##INFO=<ID=AF1,Number=1,Type=Float,Description=\"Max-likelihood estimate of the site allele frequency of the first ALT allele\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=G3,"))
kputs("##INFO=<ID=G3,Number=3,Type=Float,Description=\"ML estimate of genotype frequencies\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=HWE,"))
kputs("##INFO=<ID=HWE,Number=1,Type=Float,Description=\"Chi^2 based HWE test P-value based on G3\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=CI95,"))
kputs("##INFO=<ID=CI95,Number=2,Type=Float,Description=\"Equal-tail Bayesian credible interval of the site allele frequency at the 95% level\">\n", &str);
if (!strstr(str.s, "##INFO=<ID=PV4,"))
Expand Down Expand Up @@ -454,7 +479,7 @@ int bcfview(int argc, char *argv[])
kstring_t s;
s.m = s.l = 0; s.s = 0;
if (*b->info) kputc(';', &s);
ksprintf(&s, "NEIR=%.3f;NEIF=%.3f,%.3f", r2, f[0]+f[2], f[0]+f[1]);
ksprintf(&s, "NEIR=%.3f;NEIF4=%.3f,%.3f,%.3f,%.3f", r2, f[0], f[1], f[2], f[3]);
bcf_append_info(b, s.s, s.l);
free(s.s);
}
Expand Down
28 changes: 26 additions & 2 deletions bcftools/prob1.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
KSTREAM_INIT(gzFile, gzread, 16384)

#define MC_MAX_EM_ITER 16
#define MC_EM_EPS 1e-4
#define MC_EM_EPS 1e-5
#define MC_DEF_INDEL 0.15

unsigned char seq_nt4_table[256] = {
Expand Down Expand Up @@ -224,6 +224,24 @@ static double mc_freq_iter(double f0, const bcf_p1aux_t *ma, int beg, int end)
return f;
}

static double mc_gtfreq_iter(double g[3], const bcf_p1aux_t *ma, int beg, int end)
{
double err, gg[3];
int i;
gg[0] = gg[1] = gg[2] = 0.;
for (i = beg; i < end; ++i) {
double *pdg, sum, tmp[3];
pdg = ma->pdg + i * 3;
tmp[0] = pdg[0] * g[0]; tmp[1] = pdg[1] * g[1]; tmp[2] = pdg[2] * g[2];
sum = (tmp[0] + tmp[1] + tmp[2]) * (end - beg);
gg[0] += tmp[0] / sum; gg[1] += tmp[1] / sum; gg[2] += tmp[2] / sum;
}
err = fabs(gg[0] - g[0]) > fabs(gg[1] - g[1])? fabs(gg[0] - g[0]) : fabs(gg[1] - g[1]);
err = err > fabs(gg[2] - g[2])? err : fabs(gg[2] - g[2]);
g[0] = gg[0]; g[1] = gg[1]; g[2] = gg[2];
return err;
}

int bcf_p1_call_gt(const bcf_p1aux_t *ma, double f0, int k)
{
double sum, g[3];
Expand Down Expand Up @@ -533,6 +551,13 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
}
}
}
{ // compute g[3]
rst->g[0] = (1. - rst->f_em) * (1. - rst->f_em);
rst->g[1] = 2. * rst->f_em * (1. - rst->f_em);
rst->g[2] = rst->f_em * rst->f_em;
for (i = 0; i < MC_MAX_EM_ITER; ++i)
if (mc_gtfreq_iter(rst->g, ma, 0, ma->n) < MC_EM_EPS) break;
}
{ // estimate equal-tail credible interval (95% level)
int l, h;
double p;
Expand All @@ -546,7 +571,6 @@ int bcf_p1_cal(const bcf1_t *b, bcf_p1aux_t *ma, bcf_p1rst_t *rst)
h = i;
rst->cil = (double)(ma->M - h) / ma->M; rst->cih = (double)(ma->M - l) / ma->M;
}
rst->g[0] = rst->g[1] = rst->g[2] = -1.;
rst->cmp[0] = rst->cmp[1] = rst->cmp[2] = rst->p_chi2 = -1.0;
if (rst->p_var > 0.1) // skip contrast2() if the locus is a strong non-variant
rst->p_chi2 = contrast2(ma, rst->cmp);
Expand Down
12 changes: 6 additions & 6 deletions sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux)
append_header_text(fp->header, textheader->text, textheader->l_text);
bam_header_destroy(textheader);
}
if (fp->header->n_targets == 0)
if (fp->header->n_targets == 0 && bam_verbose >= 1)
fprintf(stderr, "[samopen] no @SQ lines in the header.\n");
} else fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
} else if (bam_verbose >= 2) fprintf(stderr, "[samopen] SAM header is present: %d sequences.\n", fp->header->n_targets);
}
} else if (strchr(mode, 'w')) { // write
fp->header = bam_header_dup((const bam_header_t*)aux);
Expand Down Expand Up @@ -93,10 +93,10 @@ samfile_t *samopen(const char *fn, const char *mode, const void *aux)
sam_header_parse(alt);
alt->l_text = 0; alt->text = 0;
// check if there are @SQ lines in the header
fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw);
fwrite(fp->header->text, 1, fp->header->l_text, fp->x.tamw); // FIXME: better to skip the trailing NULL
if (alt->n_targets) { // then write the header text without dumping ->target_{name,len}
if (alt->n_targets != fp->header->n_targets)
fprintf(stderr, "[samopen] inconsistent number of target sequences.\n");
if (alt->n_targets != fp->header->n_targets && bam_verbose >= 1)
fprintf(stderr, "[samopen] inconsistent number of target sequences. Output the text header.\n");
} else { // then dump ->target_{name,len}
for (i = 0; i < fp->header->n_targets; ++i)
fprintf(fp->x.tamw, "@SQ\tSN:%s\tLN:%d\n", fp->header->target_name[i], fp->header->target_len[i]);
Expand Down Expand Up @@ -168,7 +168,7 @@ char *samfaipath(const char *fn_ref)
if (access(fn_ref, R_OK) == -1) {
fprintf(stderr, "[samfaipath] fail to read file %s.\n", fn_ref);
} else {
fprintf(stderr, "[samfaipath] build FASTA index...\n");
if (bam_verbose >= 3) fprintf(stderr, "[samfaipath] build FASTA index...\n");
if (fai_build(fn_ref) == -1) {
fprintf(stderr, "[samfaipath] fail to build FASTA index.\n");
free(fn_list); fn_list = 0;
Expand Down
3 changes: 2 additions & 1 deletion sam_header.c
Original file line number Diff line number Diff line change
Expand Up @@ -563,6 +563,7 @@ void *sam_header_parse2(const char *headerText)
const char *text;
char *buf=NULL;
size_t nbuf = 0;
int tovalidate = 0;

if ( !headerText )
return 0;
Expand All @@ -571,7 +572,7 @@ void *sam_header_parse2(const char *headerText)
while ( (text=nextline(&buf, &nbuf, text)) )
{
hline = sam_header_line_parse(buf);
if ( hline && sam_header_line_validate(hline) )
if ( hline && (!tovalidate || sam_header_line_validate(hline)) )
// With too many (~250,000) reference sequences the header parsing was too slow with list_append.
hlines = list_append_to_end(hlines, hline);
else
Expand Down
40 changes: 40 additions & 0 deletions sam_view.c
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "sam_header.h"
#include "sam.h"
#include "faidx.h"
#include "kstring.h"
#include "khash.h"
KHASH_SET_INIT_STR(rg)

Expand Down Expand Up @@ -68,6 +69,37 @@ static inline int __g_skip_aln(const bam_header_t *h, const bam1_t *b)
return 0;
}

static char *drop_rg(char *hdtxt, rghash_t h, int *len)
{
char *p = hdtxt, *q, *r, *s;
kstring_t str;
memset(&str, 0, sizeof(kstring_t));
while (1) {
int toprint = 0;
q = strchr(p, '\n');
if (q == 0) q = p + strlen(p);
if (q - p < 3) break; // the line is too short; then stop
if (strncmp(p, "@RG\t", 4) == 0) {
int c;
khint_t k;
if ((r = strstr(p, "\tID:")) != 0) {
r += 4;
for (s = r; *s != '\0' && *s != '\n' && *s != '\t'; ++s);
c = *s; *s = '\0';
k = kh_get(rg, h, r);
*s = c;
if (k != kh_end(h)) toprint = 1;
}
} else toprint = 1;
if (toprint) {
kputsn(p, q - p, &str); kputc('\n', &str);
}
p = q + 1;
}
*len = str.l;
return str.s;
}

// callback function for bam_fetch() that prints nonskipped records
static int view_func(const bam1_t *b, void *data)
{
Expand Down Expand Up @@ -164,6 +196,14 @@ int main_samview(int argc, char *argv[])
ret = 1;
goto view_end;
}
if (g_rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for...
char *tmp;
int l;
tmp = drop_rg(in->header->text, g_rghash, &l);
free(in->header->text);
in->header->text = tmp;
in->header->l_text = l;
}
if (!is_count && (out = samopen(fn_out? fn_out : "-", out_mode, in->header)) == 0) {
fprintf(stderr, "[main_samview] fail to open \"%s\" for writing.\n", fn_out? fn_out : "standard output");
ret = 1;
Expand Down
4 changes: 2 additions & 2 deletions sample.c
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ int bam_smpl_add(bam_sample_t *sm, const char *fn, const char *txt)
p = q > r? q : r;
++n;
}
// if (n == 0) add_pair(sm, sm2id, fn, fn);
add_pair(sm, sm2id, fn, fn);
if (n == 0) add_pair(sm, sm2id, fn, fn);
// add_pair(sm, sm2id, fn, fn);
free(buf.s);
return 0;
}
Expand Down

0 comments on commit 50706be

Please sign in to comment.