Skip to content

Commit

Permalink
* This revision is SERIOUSLY BUGGY. Please NOT use it.
Browse files Browse the repository at this point in the history
 * Start to incorporate header parsing from Petr Danecek
  • Loading branch information
Heng Li committed Oct 24, 2009
1 parent bb9a823 commit d0e30ee
Show file tree
Hide file tree
Showing 12 changed files with 123 additions and 224 deletions.
7 changes: 4 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
CC= gcc
CFLAGS= -g -Wall -O2 #-m64 #-arch ppc
CFLAGS= -g -Wall #-O2 #-m64 #-arch ppc
DFLAGS= -D_FILE_OFFSET_BITS=64 -D_USE_KNETFILE -D_CURSES_LIB=1
LOBJS= bgzf.o kstring.o bam_aux.o bam.o bam_import.o sam.o bam_index.o \
bam_pileup.o bam_lpileup.o bam_md.o glf.o razf.o faidx.o knetfile.o \
bam_sort.o
bam_sort.o sam_header.o
AOBJS= bam_tview.o bam_maqcns.o bam_plcmd.o sam_view.o \
bam_rmdup.o bam_rmdupse.o bam_mate.o bam_stat.o bam_color.o \
bamtk.o kaln.o
Expand Down Expand Up @@ -45,7 +45,7 @@ bgzip:bgzip.o bgzf.o
$(CC) $(CFLAGS) -o $@ bgzf.o bgzip.o -lz

razip.o:razf.h
bam.o:bam.h razf.h bam_endian.h kstring.h
bam.o:bam.h razf.h bam_endian.h kstring.h sam_header.h
sam.o:sam.h bam.h
bam_import.o:bam.h kseq.h khash.h razf.h
bam_pileup.o:bam.h razf.h ksort.h
Expand All @@ -57,6 +57,7 @@ bam_maqcns.o:bam.h ksort.h bam_maqcns.h
bam_sort.o:bam.h ksort.h razf.h
bam_md.o:bam.h faidx.h
glf.o:glf.h
sam_header.o:sam_header.h khash.h

faidx.o:faidx.h razf.h khash.h
faidx_main.o:faidx.h razf.h
Expand Down
13 changes: 10 additions & 3 deletions bam.c
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "bam.h"
#include "bam_endian.h"
#include "kstring.h"
#include "sam_header.h"

int bam_is_be = 0;
char *bam_flag2char_table = "pPuUrR12sfd\0\0\0\0\0";
Expand Down Expand Up @@ -59,10 +60,9 @@ void bam_header_destroy(bam_header_t *header)
free(header->target_len);
}
free(header->text);
#ifndef BAM_NO_HASH
if (header->rg2lib) bam_strmap_destroy(header->rg2lib);
if (header->dict) sam_header_free(header->dict);
if (header->rg2lib) sam_tbl_destroy(header->rg2lib);
bam_destroy_header_hash(header);
#endif
free(header);
}

Expand Down Expand Up @@ -291,3 +291,10 @@ void bam_view1(const bam_header_t *header, const bam1_t *b)
printf("%s\n", s);
free(s);
}

const char *bam_get_library(const bam_header_t *header, const bam1_t *b)
{
const uint8_t *rg;
rg = bam_aux_get(b, "RG");
return (rg == 0)? 0 : sam_tbl_get(header->rg2lib, (const char*)(rg + 1));
}
5 changes: 4 additions & 1 deletion bam.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ typedef gzFile bamFile;
@field n_targets number of reference sequences
@field target_name names of the reference sequences
@field target_len lengths of the referene sequences
@field dict header dictionary
@field hash hash table for fast name lookup
@field rg2lib hash table for @RG-ID -> LB lookup
@field l_text length of the plain text in the header
Expand All @@ -85,7 +86,7 @@ typedef struct {
int32_t n_targets;
char **target_name;
uint32_t *target_len;
void *hash, *rg2lib;
void *dict, *hash, *rg2lib;
int l_text;
char *text;
} bam_header_t;
Expand Down Expand Up @@ -437,6 +438,8 @@ extern "C" {

char *bam_format1_core(const bam_header_t *header, const bam1_t *b, int of);

const char *bam_get_library(const bam_header_t *header, const bam1_t *b);

/*! @typedef
@abstract Structure for one alignment covering the pileup position.
@field b pointer to the alignment
Expand Down
67 changes: 0 additions & 67 deletions bam_aux.c
Original file line number Diff line number Diff line change
Expand Up @@ -180,70 +180,3 @@ char *bam_aux2Z(const uint8_t *s)
if (type == 'Z' || type == 'H') return (char*)s;
else return 0;
}

/******************
* rg2lib related *
******************/

int bam_strmap_put(void *rg2lib, const char *rg, const char *lib)
{
int ret;
khint_t k;
khash_t(r2l) *h = (khash_t(r2l)*)rg2lib;
char *key;
if (h == 0) return 1;
key = strdup(rg);
k = kh_put(r2l, h, key, &ret);
if (ret) kh_val(h, k) = strdup(lib);
else {
fprintf(stderr, "[bam_rg2lib_put] duplicated @RG ID: %s\n", rg);
free(key);
}
return 0;
}

const char *bam_strmap_get(const void *rg2lib, const char *rg)
{
const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib;
khint_t k;
if (h == 0) return 0;
k = kh_get(r2l, h, rg);
if (k != kh_end(h)) return (const char*)kh_val(h, k);
else return 0;
}

void *bam_strmap_dup(const void *rg2lib)
{
const khash_t(r2l) *h = (const khash_t(r2l)*)rg2lib;
khash_t(r2l) *g;
khint_t k, l;
int ret;
if (h == 0) return 0;
g = kh_init(r2l);
for (k = kh_begin(h); k < kh_end(h); ++k) {
if (kh_exist(h, k)) {
char *key = strdup(kh_key(h, k));
l = kh_put(r2l, g, key, &ret);
kh_val(g, l) = strdup(kh_val(h, k));
}
}
return g;
}

void *bam_strmap_init()
{
return (void*)kh_init(r2l);
}

void bam_strmap_destroy(void *rg2lib)
{
khash_t(r2l) *h = (khash_t(r2l)*)rg2lib;
khint_t k;
if (h == 0) return;
for (k = kh_begin(h); k < kh_end(h); ++k) {
if (kh_exist(h, k)) {
free((char*)kh_key(h, k)); free(kh_val(h, k));
}
}
kh_destroy(r2l, h);
}
104 changes: 18 additions & 86 deletions bam_import.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#endif
#include "kstring.h"
#include "bam.h"
#include "sam_header.h"
#include "kseq.h"
#include "khash.h"

Expand Down Expand Up @@ -172,104 +173,35 @@ static inline void append_text(bam_header_t *header, kstring_t *str)

int sam_header_parse_rg(bam_header_t *h)
{
kstring_t *rgid, *rglib;
char *p, *q, *s, *r;
int n = 0;

// free
if (h == 0) return 0;
bam_strmap_destroy(h->rg2lib); h->rg2lib = 0;
if (h->l_text < 3) return 0;
// parse @RG lines
h->rg2lib = bam_strmap_init();
rgid = calloc(1, sizeof(kstring_t));
rglib = calloc(1, sizeof(kstring_t));
s = h->text;
while ((s = strstr(s, "@RG")) != 0) {
if (rgid->l && rglib->l) {
bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
++n;
}
rgid->l = rglib->l = 0;
s += 3;
r = s;
if ((p = strstr(s, "ID:")) != 0) {
q = p + 3;
for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
kputsn(q, p - q, rgid);
} else {
fprintf(stderr, "[bam_header_parse] missing ID tag in @RG lines.\n");
break;
}
if (r < p) r = p;
if ((p = strstr(s, "LB:")) != 0) {
q = p + 3;
for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
kputsn(q, p - q, rglib);
} else {
fprintf(stderr, "[bam_header_parse] missing LB tag in @RG lines.\n");
break;
}
if (r < p) r = p;
}
if (rgid->l && rglib->l) {
bam_strmap_put(h->rg2lib, rgid->s, rglib->s);
++n;
}
free(rgid->s); free(rgid);
free(rglib->s); free(rglib);
if (n == 0) {
bam_strmap_destroy(h->rg2lib);
h->rg2lib = 0;
}
return n;
if (h->dict == 0) h->dict = sam_header_parse2(h->text);
if (h->rg2lib) h->rg2lib = sam_header2tbl(h->dict, "RG", "ID", "LB");
return sam_tbl_size(h->rg2lib);
}

int sam_header_parse(bam_header_t *h)
{
void *tbl;
char **tmp;
int i;
char *s, *p, *q, *r;

// free
free(h->target_len); free(h->target_name);
h->n_targets = 0; h->target_len = 0; h->target_name = 0;
if (h->l_text < 3) return 0;
// count number of @SQ
s = h->text;
while ((s = strstr(s, "@SQ")) != 0) {
++h->n_targets;
s += 3;
if (h->dict == 0) h->dict = sam_header_parse2(h->text);
tbl = sam_header2tbl(h->dict, "SQ", "SN", "LN");
h->n_targets = sam_tbl_size(tbl);
if (h->n_targets == 0) {
sam_tbl_destroy(tbl);
return 0;
}
if (h->n_targets == 0) return 0;
h->target_len = (uint32_t*)calloc(h->n_targets, 4);
h->target_name = (char**)calloc(h->n_targets, sizeof(void*));
// parse @SQ lines
i = 0;
s = h->text;
while ((s = strstr(s, "@SQ")) != 0) {
s += 3;
r = s;
if ((p = strstr(s, "SN:")) != 0) {
q = p + 3;
for (p = q; *p && *p != '\t' && *p != '\r' && *p != '\n'; ++p);
h->target_name[i] = (char*)calloc(p - q + 1, 1);
strncpy(h->target_name[i], q, p - q);
} else goto header_err_ret;
if (r < p) r = p;
if ((p = strstr(s, "LN:")) != 0) h->target_len[i] = strtol(p + 3, 0, 10);
else goto header_err_ret;
if (r < p) r = p;
s = r + 3;
++i;
}
sam_header_parse_rg(h);
tmp = (char**)calloc(h->n_targets, sizeof(void*));
sam_tbl_pair(tbl, h->target_name, tmp);
for (i = 0; i < h->n_targets; ++i)
h->target_len[i] = atoi(tmp[i]);
free(tmp);
sam_tbl_destroy(tbl);
return h->n_targets;

header_err_ret:
fprintf(stderr, "[bam_header_parse] missing SN or LN tag in @SQ lines.\n");
free(h->target_len); free(h->target_name);
h->n_targets = 0; h->target_len = 0; h->target_name = 0;
return 0;
}

bam_header_t *sam_header_read(tamFile fp)
Expand Down
4 changes: 1 addition & 3 deletions bam_rmdup.c
Original file line number Diff line number Diff line change
Expand Up @@ -127,11 +127,9 @@ void bam_rmdup_core(samfile_t *in, samfile_t *out)
} else if (c->isize > 0) { // paired, head
uint64_t key = (uint64_t)c->pos<<32 | c->isize;
const char *lib;
const uint8_t *rg;
lib_aux_t *q;
int ret;
rg = bam_aux_get(b, "RG");
lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1));
lib = bam_get_library(in->header, b);
q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
++q->n_checked;
k = kh_put(pos, q->best_hash, key, &ret);
Expand Down
4 changes: 1 addition & 3 deletions bam_rmdupse.c
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,11 @@ void bam_rmdupse_core(samfile_t *in, samfile_t *out, int force_se)
push_queue(queue, b, endpos, score);
} else {
const char *lib;
const uint8_t *rg;
lib_aux_t *q;
besthash_t *h;
uint32_t key;
int ret;
rg = bam_aux_get(b, "RG");
lib = (rg == 0)? 0 : bam_strmap_get(in->header->rg2lib, (char*)(rg + 1));
lib = bam_get_library(in->header, b);
q = lib? get_aux(aux, lib) : get_aux(aux, "\t");
++q->n_checked;
h = (c->flag&BAM_FREVERSE)? q->rght : q->left;
Expand Down
11 changes: 0 additions & 11 deletions kaln.c
Original file line number Diff line number Diff line change
Expand Up @@ -182,17 +182,6 @@ typedef struct {
int M, I, D;
} dpscore_t;

/* build score profile for accelerating alignment, in theory */
static void aln_init_score_array(uint8_t *seq, int len, int row, int *score_matrix, int **s_array)
{
int *tmp, *tmp2, i, k;
for (i = 0; i != row; ++i) {
tmp = score_matrix + i * row;
tmp2 = s_array[i];
for (k = 0; k != len; ++k)
tmp2[k] = tmp[seq[k]];
}
}
/***************************
* banded global alignment *
***************************/
Expand Down
1 change: 0 additions & 1 deletion sam.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ bam_header_t *bam_header_dup(const bam_header_t *h0)
h->target_len[i] = h0->target_len[i];
h->target_name[i] = strdup(h0->target_name[i]);
}
if (h0->rg2lib) h->rg2lib = bam_strmap_dup(h0->rg2lib);
return h;
}
static void append_header_text(bam_header_t *header, char* text, int len)
Expand Down
Loading

0 comments on commit d0e30ee

Please sign in to comment.