diff --git a/cram/cram_decode.c b/cram/cram_decode.c index dba7cf89c..6f1528dc0 100644 --- a/cram/cram_decode.c +++ b/cram/cram_decode.c @@ -1087,6 +1087,25 @@ static inline int add_md_char(cram_slice *s, int decode_md, char c, int32_t *md_ return -1; } +static int add_cigar(cram_slice *s, uint64_t cig_len, int cig_op) { + int nlen = 1 + cig_len/(1<<28); + if (s->ncigar + nlen >= s->cigar_alloc) { + s->cigar_alloc = (s->ncigar + nlen) < 1024 ? 1024 : (s->ncigar + nlen)*2; + uint32_t *c2 = realloc(s->cigar, s->cigar_alloc * sizeof(*s->cigar)); + if (!c2) + return -1; + s->cigar = c2; + } + + while (cig_len) { + int sub_len = cig_len < (1<<28) ? cig_len : (1<<28)-1; + s->cigar[s->ncigar++] = (sub_len<<4) + cig_op; + cig_len -= sub_len; + } + + return 0; +} + /* * Internal part of cram_decode_slice(). * Generates the sequence, quality and cigar components. @@ -1097,13 +1116,10 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int has_MD, int has_NM) { int prev_pos = 0, f, r = 0, out_sz = 1; int seq_pos = 1; - int cig_len = 0; + uint64_t cig_len = 0; int64_t ref_pos = cr->apos; int32_t fn, i32; enum cigar_op cig_op = BAM_CMATCH; - uint32_t *cigar = s->cigar; - uint32_t ncigar = s->ncigar; - uint32_t cigar_alloc = s->cigar_alloc; uint32_t nm = 0; int32_t md_dist = 0; int orig_aux = 0; @@ -1134,7 +1150,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } ref_pos--; // count from 0 - cr->cigar = ncigar; + cr->cigar = s->ncigar; if (!(ds & (CRAM_FC | CRAM_FP))) goto skip_cigar; @@ -1143,13 +1159,6 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int32_t pos = 0; char op; - if (ncigar+2 >= cigar_alloc) { - cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; - if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) - return -1; - s->cigar = cigar; - } - if (ds & CRAM_FC) { if (!c->comp_hdr->codecs[DS_FC]) return -1; r |= c->comp_hdr->codecs[DS_FC]->decode(s, @@ -1231,13 +1240,15 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } #ifdef USE_X if (cig_len && cig_op != BAM_CBASE_MATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } cig_op = BAM_CBASE_MATCH; #else if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } cig_op = BAM_CMATCH; @@ -1258,7 +1269,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int have_sc = 0; if (cig_len) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } switch (CRAM_MAJOR_VERS(fd->version)) { @@ -1297,7 +1309,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (have_sc) { if (r) return r; - cigar[ncigar++] = (out_sz2<<4) + BAM_CSOFT_CLIP; + if (add_cigar(s, out_sz2, BAM_CSOFT_CLIP) < 0) + goto err; cig_op = BAM_CSOFT_CLIP; seq_pos += out_sz2; } @@ -1308,7 +1321,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, unsigned char base; #ifdef USE_X if (cig_len && cig_op != BAM_CBASE_MISMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_BS) { @@ -1323,7 +1337,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, #else int ref_base; if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_BS) { @@ -1365,7 +1380,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'D': { // Deletion; DL if (cig_len && cig_op != BAM_CDEL) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_DL) { @@ -1419,7 +1435,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int32_t out_sz2 = 1; if (cig_len && cig_op != BAM_CINS) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } @@ -1441,7 +1458,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'i': { // Insertion (single base); BA if (cig_len && cig_op != BAM_CINS) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_BA) { @@ -1463,7 +1481,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int32_t len = 1; if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } @@ -1513,7 +1532,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, int32_t len = 1; if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } @@ -1537,12 +1557,14 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'B': { // Read base; BA, QS #ifdef USE_X if (cig_len && cig_op != BAM_CBASE_MISMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } #else if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } #endif @@ -1601,7 +1623,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'H': { // hard clip; HC if (cig_len && cig_op != BAM_CHARD_CLIP) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_HC) { @@ -1618,7 +1641,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'P': { // padding; PD if (cig_len && cig_op != BAM_CPAD) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_PD) { @@ -1635,7 +1659,8 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, case 'N': { // Ref skip; RS if (cig_len && cig_op != BAM_CREF_SKIP) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } if (ds & CRAM_RS) { @@ -1722,21 +1747,17 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, ref_pos += cr->len - seq_pos + 1; } - if (ncigar+1 >= cigar_alloc) { - cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; - if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) - return -1; - s->cigar = cigar; - } #ifdef USE_X if (cig_len && cig_op != BAM_CBASE_MATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } cig_op = BAM_CBASE_MATCH; #else if (cig_len && cig_op != BAM_CMATCH) { - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; cig_len = 0; } cig_op = BAM_CMATCH; @@ -1752,17 +1773,11 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } if (cig_len) { - if (ncigar >= cigar_alloc) { - cigar_alloc = cigar_alloc ? cigar_alloc*2 : 1024; - if (!(cigar = realloc(cigar, cigar_alloc * sizeof(*cigar)))) - return -1; - s->cigar = cigar; - } - - cigar[ncigar++] = (cig_len<<4) + cig_op; + if (add_cigar(s, cig_len, cig_op) < 0) + goto err; } - cr->ncigar = ncigar - cr->cigar; + cr->ncigar = s->ncigar - cr->cigar; cr->aend = ref_pos; //printf("2: %.*s %d .. %d\n", cr->name_len, DSTRING_STR(name_ds) + cr->name, cr->apos, ref_pos); @@ -1787,10 +1802,6 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, } } - s->cigar = cigar; - s->cigar_alloc = cigar_alloc; - s->ncigar = ncigar; - if (cr->cram_flags & CRAM_FLAG_NO_SEQ) cr->len = 0; @@ -1834,6 +1845,7 @@ static int cram_decode_seq(cram_fd *fd, cram_container *c, cram_slice *s, hts_log_error("CRAM CIGAR extends beyond slice reference extents"); return -1; + err: block_err: return -1; } diff --git a/cram/cram_encode.c b/cram/cram_encode.c index 6c52c5892..8cbf9391c 100644 --- a/cram/cram_encode.c +++ b/cram/cram_encode.c @@ -2773,6 +2773,10 @@ static int process_one_read(cram_fd *fd, cram_container *c, c->num_bases += cr->len; cr->apos = bam_pos(b)+1; + if (cr->apos >= UINT32_MAX) { + hts_log_error("Sequence position out of valid range"); + goto err; + } if (c->pos_sorted) { if (cr->apos < s->last_apos) { c->pos_sorted = 0; diff --git a/header.c b/header.c index 5a4a77d3d..e34168aac 100644 --- a/header.c +++ b/header.c @@ -47,7 +47,6 @@ typedef khash_t(rm) rmhash_t; static int sam_hdr_link_pg(sam_hdr_t *bh); static int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap); -static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...); #define MAX_ERROR_QUOTE 320 // Prevent over-long error messages @@ -182,14 +181,10 @@ static int sam_hrecs_update_hashes(sam_hrecs_t *hrecs, if (hrecs->ref[nref].ty == NULL) { // Attach header line to existing stub entry. hrecs->ref[nref].ty = h_type; - // Check lengths match; correct if not. - if (len != hrecs->ref[nref].len) { - char tmp[32]; - snprintf(tmp, sizeof(tmp), "%" PRIhts_pos, - hrecs->ref[nref].len); - if (sam_hrecs_update(hrecs, h_type, "LN", tmp, NULL) < 0) - return -1; - } + // The old hrecs length may be incorrect as it is initially + // populated from h->target_len which has a max 32-bit size. + // We always take our latest data as canonical. + hrecs->ref[nref].len = len; if (sam_hrecs_add_ref_altnames(hrecs, nref, altnames) < 0) return -1; @@ -749,6 +744,23 @@ static int sam_hrecs_rebuild_lines(const sam_hrecs_t *hrecs, kstring_t *ks) { return 0; } +/* + * Reconstructs a kstring from the header hash table. + * Returns 0 on success + * -1 on failure + */ +int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) { + ks->l = 0; + + if (!hrecs->h || !hrecs->h->size) { + return kputsn("", 0, ks) >= 0 ? 0 : -1; + } + if (sam_hrecs_rebuild_lines(hrecs, ks) != 0) + return -1; + + return 0; +} + static int sam_hrecs_parse_lines(sam_hrecs_t *hrecs, const char *hdr, size_t len) { size_t i, lno; @@ -916,6 +928,8 @@ int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, // hrecs->refs_changed is the first ref that has been updated, so ones // before that can be skipped. int i; + khint_t k; + khash_t(s2i) *long_refs = bh->sdict; for (i = refs_changed; i < hrecs->nref; i++) { if (i >= bh->n_targets || strcmp(bh->target_name[i], hrecs->ref[i].name) != 0) { @@ -927,13 +941,38 @@ int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, } if (hrecs->ref[i].len < UINT32_MAX) { bh->target_len[i] = hrecs->ref[i].len; + + if (!long_refs) + continue; + + // Check if we have an old length, if so remove it. + k = kh_get(s2i, long_refs, bh->target_name[i]); + if (k < kh_end(long_refs)) + kh_del(s2i, long_refs, k); } else { bh->target_len[i] = UINT32_MAX; + + if (!long_refs) { + if (!(bh->sdict = long_refs = kh_init(s2i))) + return -1; + } + + // Add / update length + int absent; + k = kh_put(s2i, long_refs, bh->target_name[i], &absent); + if (absent < 0) + return -1; + kh_val(long_refs, k) = hrecs->ref[i].len; } } // Free up any names that have been removed for (; i < bh->n_targets; i++) { + if (long_refs) { + k = kh_get(s2i, long_refs, bh->target_name[i]); + if (k < kh_end(long_refs)) + kh_del(s2i, long_refs, k); + } free(bh->target_name[i]); } @@ -941,7 +980,11 @@ int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs, return 0; } -static int rebuild_target_arrays(sam_hdr_t *bh) { +/*! Rebuild own target_name and target_len arrays from internal hrecs + * + * @return 0 on success; -1 on failure + */ +static int sam_hdr_rebuild_target_arrays(sam_hdr_t *bh) { if (!bh || !bh->hrecs) return -1; @@ -1102,7 +1145,7 @@ int sam_hdr_fill_hrecs(sam_hdr_t *bh) { bh->hrecs = hrecs; - if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) + if (sam_hdr_rebuild_target_arrays(bh) != 0) return -1; return 0; @@ -1197,11 +1240,9 @@ int sam_hdr_rebuild(sam_hdr_t *bh) { if (!(hrecs = bh->hrecs)) return bh->text ? 0 : -1; - if (hrecs->refs_changed >= 0) { - if (rebuild_target_arrays(bh) < 0) { - hts_log_error("Header target array rebuild has failed"); - return -1; - } + if (sam_hdr_rebuild_target_arrays(bh) < 0) { + hts_log_error("Header target array rebuild has failed"); + return -1; } /* If header text wasn't changed or header is empty, don't rebuild it. */ @@ -1258,7 +1299,7 @@ int sam_hdr_add_lines(sam_hdr_t *bh, const char *lines, size_t len) { if (sam_hrecs_parse_lines(hrecs, lines, len) != 0) return -1; - if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) + if (sam_hdr_rebuild_target_arrays(bh) != 0) return -1; hrecs->dirty = 1; @@ -1293,7 +1334,7 @@ int sam_hdr_add_line(sam_hdr_t *bh, const char *type, ...) { va_end(args); if (ret == 0) { - if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) + if (sam_hdr_rebuild_target_arrays(bh) != 0) return -1; if (hrecs->dirty) @@ -1388,7 +1429,7 @@ int sam_hdr_remove_line_id(sam_hdr_t *bh, const char *type, const char *ID_key, int ret = sam_hrecs_remove_line(hrecs, type, type_found); if (ret == 0) { - if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) + if (sam_hdr_rebuild_target_arrays(bh) != 0) return -1; if (hrecs->dirty) @@ -1428,7 +1469,7 @@ int sam_hdr_remove_line_pos(sam_hdr_t *bh, const char *type, int position) { int ret = sam_hrecs_remove_line(hrecs, type, type_found); if (ret == 0) { - if (hrecs->refs_changed >= 0 && rebuild_target_arrays(bh) != 0) + if (sam_hdr_rebuild_target_arrays(bh) != 0) return -1; if (hrecs->dirty) @@ -1558,7 +1599,7 @@ int sam_hdr_update_line(sam_hdr_t *bh, const char *type, ret = sam_hrecs_update_hashes(hrecs, TYPEKEY(type), ty); if (!ret && hrecs->refs_changed >= 0) - ret = rebuild_target_arrays(bh); + ret = sam_hdr_rebuild_target_arrays(bh); if (!ret && hrecs->dirty) redact_header_text(bh); @@ -1905,23 +1946,6 @@ int sam_hdr_remove_tag_id(sam_hdr_t *bh, return ret; } -/* - * Reconstructs a kstring from the header hash table. - * Returns 0 on success - * -1 on failure - */ -int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks) { - ks->l = 0; - - if (!hrecs->h || !hrecs->h->size) { - return kputsn("", 0, ks) >= 0 ? 0 : -1; - } - if (sam_hrecs_rebuild_lines(hrecs, ks) != 0) - return -1; - - return 0; -} - /* * Looks up a reference sequence by name and returns the numerical ID. * Returns -1 if unknown reference; -2 if header could not be parsed. @@ -2485,25 +2509,6 @@ int sam_hrecs_vupdate(sam_hrecs_t *hrecs, sam_hrec_type_t *type, va_list ap) { return 0; } -/* - * Adds or updates tag key,value pairs in a header line. - * Eg for adding M5 tags to @SQ lines or updating sort order for the - * @HD line. - * - * Specify multiple key,value pairs ending in NULL. - * - * Returns 0 on success - * -1 on failure - */ -static int sam_hrecs_update(sam_hrecs_t *hrecs, sam_hrec_type_t *type, ...) { - va_list args; - int res; - va_start(args, type); - res = sam_hrecs_vupdate(hrecs, type, args); - va_end(args); - return res; -} - /* * Looks for a specific key in a single sam header line identified by *type. * If prev is non-NULL it also fills this out with the previous tag, to diff --git a/header.h b/header.h index 810a3dda1..c6fbd7f5f 100644 --- a/header.h +++ b/header.h @@ -249,7 +249,7 @@ sam_hrecs_t *sam_hrecs_new(void); */ sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *hrecs); -/*! Update sam_hdr_t target_name and target_len arrays +/*! Update sam_hdr_t target_name and target_len arrays from source sam_hrecs_t * * sam_hdr_t and sam_hrecs_t are specified separately so that sam_hdr_dup * can use it to construct target arrays from the source header. diff --git a/test/test.pl b/test/test.pl index ca0d766c1..69cc7b5e1 100755 --- a/test/test.pl +++ b/test/test.pl @@ -645,6 +645,12 @@ sub test_view testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.sam.gz"; testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_"; + # CRAM disabled for now as the positions cannot be 32-bit. (These tests are useful for + # checking SQ headers only.) + # testv $opts, "./test_view $tv_args -C -o no_ref -p longrefs/longref.tmp.cram longrefs/longref.sam"; + # testv $opts, "./test_view $tv_args -p longrefs/longref.tmp.sam_ longrefs/longref.tmp.cram"; + # testv $opts, "./compare_sam.pl longrefs/longref.sam longrefs/longref.tmp.sam_"; + # Build index and compare with on-the-fly one made earlier. test_compare $opts, "$$opts{path}/test_index -c longrefs/longref.tmp.sam.gz", "longrefs/longref.tmp.sam.gz.csi.otf", "longrefs/longref.tmp.sam.gz.csi", gz=>1;