From 1f3a88b3a243a0d5443e727f21c4746c1bdfc65a Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Thu, 18 Apr 2024 10:23:41 +1000 Subject: [PATCH 1/7] attempt to collapse in m6anet style --- src/eventalign.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++ src/f5c.c | 4 ++ src/f5c.h | 1 + src/f5cmisc.h | 5 ++ src/meth_main.c | 19 +++++++ 5 files changed, 167 insertions(+) diff --git a/src/eventalign.c b/src/eventalign.c index efc764ac..956dde21 100644 --- a/src/eventalign.c +++ b/src/eventalign.c @@ -2161,6 +2161,144 @@ char *emit_event_alignment_tsv(uint32_t strand_idx, return sp->s; } +//cleanup unused function param shit +char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, + const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings, + const std::vector& alignments, + int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse, + int64_t read_index, char* read_name, char *ref_name,float sample_rate, float *rawptr) +{ + + kstring_t str; + kstring_t *sp = &str; + str_init(sp, sizeof(char)*alignments.size()*120); + + size_t n_collapse = 1; + for(size_t i = 0; i < alignments.size(); i+=n_collapse) { + + const event_alignment_t& ea = alignments[i]; + + // basic information + if (!print_read_names) + { + sprintf_append(sp, "%s\t%d\t%s\t%ld\t%c\t", + ref_name, //ea.ref_name.c_str(), + ea.ref_position, + ea.ref_kmer, + (long)read_index, + 't'); //"tc"[ea.strand_idx]); + } + else + { + sprintf_append(sp, "%s\t%d\t%s\t%s\t%c\t", + ref_name, //ea.ref_name.c_str(), + ea.ref_position, + ea.ref_kmer, + read_name, //sr.read_name.c_str(), + 't'); //"tc"[ea.strand_idx]); + } + + // event information + uint64_t length = 0; + float event_mean = 0; + float event_stdv = 0; + float event_duration = 0; + + // if(strcmp(ea.ref_kmer,ea.model_kmer)==0){ //the ref and model kmers should match + // uint64_t len_curr = (uint64_t)((et->event)[ea.event_idx].length); + // length += len_curr; + // event_mean += get_fully_scaled_level((et->event)[ea.event_idx].mean,scalings) * len_curr; + // event_stdv += (et->event)[ea.event_idx].stdv * len_curr; + // event_duration += get_duration_seconds(et, ea.event_idx, sample_rate) * len_curr; + // } + + uint32_t rank = get_kmer_rank(ea.model_kmer, kmer_size); + float model_mean = 0.0; + float model_stdv = 0.0; + // unscaled model parameters + if(ea.hmm_state != 'B') { + model_t model1 = model[rank]; + model_mean = model1.level_mean; + model_stdv = model1.level_stdv; + } + + //uint64_t start_idx = (et->event)[ea.event_idx].start; //inclusive + //uint64_t end_idx = (et->event)[ea.event_idx].start + (uint64_t)((et->event)[ea.event_idx].length); //non-inclusive + + + n_collapse = 0; + while (i + n_collapse < alignments.size() && ea.ref_position == alignments[i+n_collapse].ref_position){ + assert(strcmp(ea.ref_kmer,alignments[i+n_collapse].ref_kmer)==0); + // if(strcmp(ea.model_kmer,alignments[i+n_collapse].model_kmer)!=0){ //TODO: NNNN kmers must be handled + // fprintf(stderr, "model kmer does not match! %s vs %s\n",ea.model_kmer,alignments[i+n_collapse].model_kmer); + // } + const event_alignment_t& ea_curr = alignments[i+n_collapse]; + + if(strcmp(ea_curr.ref_kmer,ea_curr.model_kmer)==0){ //the ref and model kmers should match + uint64_t len_curr = (uint64_t)((et->event)[ea_curr.event_idx].length); + length += len_curr; + event_mean += get_fully_scaled_level((et->event)[ea_curr.event_idx].mean,scalings) * len_curr; + event_stdv += (et->event)[ea_curr.event_idx].stdv * len_curr; + event_duration += get_duration_seconds(et, ea_curr.event_idx, sample_rate) * len_curr; + } + + n_collapse++; + } + event_mean /= length; + event_stdv /= length; + event_duration /= length; + + // if(scale_events) { + + // // scale reads to the model + // event_mean = get_fully_scaled_level(event_mean, scalings); + + // // unscaled model parameters + // if(ea.hmm_state != 'B') { + // model_t model1 = model[rank]; + // model_mean = model1.level_mean; + // model_stdv = model1.level_stdv; + // } + // } else { + + // // scale model to the reads + // if(ea.hmm_state != 'B') { + + // model_t model1 = get_scaled_gaussian_from_pore_model_state(model, scalings, rank); + // model_mean = model1.level_mean; + // model_stdv = model1.level_stdv; + // } + // } + + float standard_level = (event_mean - model_mean) / (sqrt(scalings.var) * model_stdv); + sprintf_append(sp, "%d\t%.2f\t%.3f\t%.5f\t", ea.event_idx, event_mean, event_stdv, event_duration); + sprintf_append(sp, "%s\t%.2f\t%.2f\t%.2f", ea.model_kmer, + model_mean, + model_stdv, + standard_level); + + // if(write_signal_index) { + // sprintf_append(sp, "\t%lu\t%lu", start_idx, end_idx); + // } + + // if(write_samples) { + // std::vector samples = get_scaled_samples(rawptr, start_idx, end_idx, scalings); + // std::stringstream sample_ss; + // std::copy(samples.begin(), samples.end(), std::ostream_iterator(sample_ss, ",")); + + // // remove trailing comma + // std::string sample_str = sample_ss.str(); + // sample_str.resize(sample_str.size() - 1); + // sprintf_append(sp, "\t%s", sample_str.c_str()); + // } + sprintf_append(sp, "\n"); + } + + + //str_free(sp); //freeing is later done in free_db_tmp() + return sp->s; +} + char *emit_event_alignment_paf(const event_table* et, int64_t len_raw_signal, int64_t ref_len, uint32_t kmer_size, scalings_t scalings, const std::vector& alignments, bam1_t* bam_record, char* read_name, char *ref_name, int8_t rna) diff --git a/src/f5c.c b/src/f5c.c index 319ca4c8..8baec8c1 100644 --- a/src/f5c.c +++ b/src/f5c.c @@ -863,6 +863,7 @@ void eventalign_single(core_t* core, db_t* db, int32_t i){ int8_t write_signal_index = (core->opt.flag & F5C_PRINT_SIGNAL_INDEX) ? 1 : 0; int8_t sam_output = (core->opt.flag & F5C_SAM) ? 1 : 0; int8_t paf_output = (core->opt.flag & F5C_PAF) ? 1 : 0; + int8_t m6anet_output = (core->opt.flag & F5C_M6ANET) ? 1 : 0; int8_t rna = (core->opt.flag & F5C_RNA) ? 1 : 0; if(paf_output){ @@ -872,6 +873,9 @@ void eventalign_single(core_t* core, db_t* db, int32_t i){ int8_t sam_out_version = core->opt.sam_out_version; int64_t ref_len = core->m_hdr->target_len[db->bam_rec[i]->core.tid]; db->event_alignment_result_str[i] = emit_event_alignment_sam(qname, core->m_hdr, db->bam_rec[i], *event_alignment_result, sam_out_version, &(db->et[i]), db->sig[i]->nsample, ref_len, rna, db->scalings[i]); + } else if (m6anet_output){ + db->event_alignment_result_str[i] = emit_event_alignment_tsv_m6anet(0,&(db->et[i]),core->model,core->kmer_size, db->scalings[i],*event_alignment_result, print_read_names, scale_events, write_samples, write_signal_index, collapse_events, + db->read_idx[i], qname, contig, db->sig[i]->sample_rate, db->sig[i]->rawptr); } else { db->event_alignment_result_str[i] = emit_event_alignment_tsv(0,&(db->et[i]),core->model,core->kmer_size, db->scalings[i],*event_alignment_result, print_read_names, scale_events, write_samples, write_signal_index, collapse_events, db->read_idx[i], qname, contig, db->sig[i]->sample_rate, db->sig[i]->rawptr); diff --git a/src/f5c.h b/src/f5c.h index a3dbae51..51e157ae 100644 --- a/src/f5c.h +++ b/src/f5c.h @@ -57,6 +57,7 @@ #define F5C_COLLAPSE_EVENTS 0x20000 //collapse events #define F5C_R10 0x40000 //r10 #define F5C_PAF 0x80000 //paf (eventalign only) +#define F5C_M6ANET 0x100000 //m6anet (eventalign only) /************************************************************* * flags for a read status (related to db_t->read_stat_flag) * diff --git a/src/f5cmisc.h b/src/f5cmisc.h index 821857c4..54387868 100644 --- a/src/f5cmisc.h +++ b/src/f5cmisc.h @@ -80,6 +80,11 @@ char *emit_event_alignment_tsv(uint32_t strand_idx, const std::vector& alignments, int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse, int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples); +char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, + const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings, + const std::vector& alignments, + int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse, + int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples); void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t write_samples, int8_t write_signal_index); void emit_sam_header(samFile* fp, const bam_hdr_t* hdr); char *emit_event_alignment_sam(char* read_name, bam_hdr_t* base_hdr, bam1_t* base_record, diff --git a/src/meth_main.c b/src/meth_main.c index 4d9e0e58..2b2f7838 100644 --- a/src/meth_main.c +++ b/src/meth_main.c @@ -105,6 +105,7 @@ static struct option long_options[] = { {"pore",required_argument,0,0}, //46 pore {"paf",no_argument,0,'c'}, //47 if print in paf format (only for eventalign) {"sam-out-version",required_argument,0,0}, //48 specify the version of the sam output for eventalign (eventalign only) + {"m6anet",no_argument,0,0}, //49 m6anet output (eventalign only) {0, 0, 0, 0}}; @@ -447,6 +448,12 @@ int meth_main(int argc, char* argv[], int8_t mode) { exit(EXIT_FAILURE); } is_sam_out_version_set = 1; + } else if (c == 0 && longindex == 49){ //m6anet output + if(mode!=1){ + ERROR("%s","Option --m6anet is available only in eventalign"); + exit(EXIT_FAILURE); + } + yes_or_no(&opt, F5C_M6ANET, longindex, "yes", 1); } } @@ -513,6 +520,7 @@ int meth_main(int argc, char* argv[], int8_t mode) { fprintf(fp_help," --paf write output in PAF format\n"); fprintf(fp_help," --sam write output in SAM format\n"); fprintf(fp_help," --sam-out-version INT sam output version (set 1 to revert to old nanopolish style format) [%d]\n",opt.meth_out_version); + fprintf(fp_help," --m6anet write output in m6anet format\n"); fprintf(fp_help," --print-read-names print read names instead of indexes\n"); fprintf(fp_help," --scale-events scale events to the model, rather than vice-versa\n"); @@ -573,16 +581,27 @@ int meth_main(int argc, char* argv[], int8_t mode) { int8_t write_signal_index = (core->opt.flag & F5C_PRINT_SIGNAL_INDEX) ? 1 : 0; int8_t sam_output = (core->opt.flag & F5C_SAM) ? 1 : 0 ; int8_t paf_output = (core->opt.flag & F5C_PAF) ? 1 : 0 ; + int8_t m6anet_output = (core->opt.flag & F5C_M6ANET) ? 1 : 0 ; if(sam_output && paf_output){ ERROR("%s","-c and --sam cannot be used together"); exit(EXIT_FAILURE); } + if(sam_output && m6anet_output){ + ERROR("%s","-c and --m6anet cannot be used together"); + exit(EXIT_FAILURE); + } + if(paf_output && m6anet_output){ + ERROR("%s","--paf and --m6anet cannot be used together"); + exit(EXIT_FAILURE); + } if(sam_output){ emit_sam_header(core->sam_output, core->m_hdr); } else if (paf_output){ //none + } else if (m6anet_output){ + emit_event_alignment_tsv_header(stdout, 0, 0, 0); } else{ emit_event_alignment_tsv_header(stdout, print_read_names, write_samples, write_signal_index); } From 9d6892901a4af5ea60cb1af0dac19da71b5ace19 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Thu, 18 Apr 2024 10:34:26 +1000 Subject: [PATCH 2/7] signal-index option now --- src/eventalign.c | 51 ++++++++++++++++-------------------------------- src/meth_main.c | 2 +- 2 files changed, 18 insertions(+), 35 deletions(-) diff --git a/src/eventalign.c b/src/eventalign.c index 956dde21..dd2273d1 100644 --- a/src/eventalign.c +++ b/src/eventalign.c @@ -2222,8 +2222,8 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, model_stdv = model1.level_stdv; } - //uint64_t start_idx = (et->event)[ea.event_idx].start; //inclusive - //uint64_t end_idx = (et->event)[ea.event_idx].start + (uint64_t)((et->event)[ea.event_idx].length); //non-inclusive + uint64_t start_idx = (et->event)[ea.event_idx].start; //inclusive + uint64_t end_idx = (et->event)[ea.event_idx].start + (uint64_t)((et->event)[ea.event_idx].length); //non-inclusive n_collapse = 0; @@ -2248,27 +2248,7 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, event_stdv /= length; event_duration /= length; - // if(scale_events) { - - // // scale reads to the model - // event_mean = get_fully_scaled_level(event_mean, scalings); - // // unscaled model parameters - // if(ea.hmm_state != 'B') { - // model_t model1 = model[rank]; - // model_mean = model1.level_mean; - // model_stdv = model1.level_stdv; - // } - // } else { - - // // scale model to the reads - // if(ea.hmm_state != 'B') { - - // model_t model1 = get_scaled_gaussian_from_pore_model_state(model, scalings, rank); - // model_mean = model1.level_mean; - // model_stdv = model1.level_stdv; - // } - // } float standard_level = (event_mean - model_mean) / (sqrt(scalings.var) * model_stdv); sprintf_append(sp, "%d\t%.2f\t%.3f\t%.5f\t", ea.event_idx, event_mean, event_stdv, event_duration); @@ -2277,20 +2257,23 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, model_stdv, standard_level); - // if(write_signal_index) { - // sprintf_append(sp, "\t%lu\t%lu", start_idx, end_idx); - // } + if(write_signal_index) { + if(n_collapse > 1){ + uint64_t start_idx1 = start_idx; + uint64_t end_idx1 = end_idx; - // if(write_samples) { - // std::vector samples = get_scaled_samples(rawptr, start_idx, end_idx, scalings); - // std::stringstream sample_ss; - // std::copy(samples.begin(), samples.end(), std::ostream_iterator(sample_ss, ",")); + const event_alignment_t& ea2 = alignments[i+n_collapse-1]; + uint64_t start_idx2 = (et->event)[ea2.event_idx].start; + uint64_t end_idx2 = (et->event)[ea2.event_idx].start + (uint64_t)((et->event)[ea2.event_idx].length); + + //min + start_idx = start_idx1 < start_idx2 ? start_idx1 : start_idx2; + //max + end_idx = end_idx1 > end_idx2 ? end_idx1 : end_idx2; + } + sprintf_append(sp, "\t%lu\t%lu", start_idx, end_idx); + } - // // remove trailing comma - // std::string sample_str = sample_ss.str(); - // sample_str.resize(sample_str.size() - 1); - // sprintf_append(sp, "\t%s", sample_str.c_str()); - // } sprintf_append(sp, "\n"); } diff --git a/src/meth_main.c b/src/meth_main.c index 2b2f7838..1cc964dc 100644 --- a/src/meth_main.c +++ b/src/meth_main.c @@ -601,7 +601,7 @@ int meth_main(int argc, char* argv[], int8_t mode) { } else if (paf_output){ //none } else if (m6anet_output){ - emit_event_alignment_tsv_header(stdout, 0, 0, 0); + emit_event_alignment_tsv_header(stdout, print_read_names, 0, write_signal_index); } else{ emit_event_alignment_tsv_header(stdout, print_read_names, write_samples, write_signal_index); } From e16132aa28da69c09042fcd88496b29306d207db Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Wed, 24 Apr 2024 14:26:47 +1000 Subject: [PATCH 3/7] update the output format --- src/eventalign.c | 63 +++++++++++++++++++++++++++++++----------------- src/f5cmisc.h | 1 + src/meth_main.c | 2 +- 3 files changed, 43 insertions(+), 23 deletions(-) diff --git a/src/eventalign.c b/src/eventalign.c index dd2273d1..83f5929c 100644 --- a/src/eventalign.c +++ b/src/eventalign.c @@ -1659,6 +1659,20 @@ void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t w fprintf(fp, "\n"); } +// Note: reference_kmer will be same as the model_kmer in this mode +// Also, event_level_mean and event_stdv are always scaled to the model (--scale-events is implicit) +void emit_event_alignment_tsv_m6anet_header(FILE* fp, int8_t print_read_names, int8_t write_signal_index) +{ + fprintf(fp, "%s\t%s\t%s\t%s\t", "contig", "position", "reference_kmer", + (print_read_names? "read_name" : "read_index")); + fprintf(fp, "%s\t%s\t%s\t", "event_level_mean", "event_stdv", "event_length"); + + if(write_signal_index) { + fprintf(fp, "\t%s\t%s", "start_idx", "end_idx"); + } + + fprintf(fp, "\n"); +} typedef struct { uint64_t start_raw; @@ -2161,6 +2175,13 @@ char *emit_event_alignment_tsv(uint32_t strand_idx, return sp->s; } +// algo: +// 1. Remove rows in eventalign.tsv where reference kmer does not match model kmer +// 2. For each row in eventalign.tsv, length of each event = end_idx - start_idx +// 3. Group by each read and each reference position +// - collapsed_event_level_mean = sum(event_level_mean * length) / sum(length) +// - collapsed_event_stdv = sum(event_stdv * length) / sum(length) +// - collapsed_event_duration = sum(event_duration * length) / sum(length) //cleanup unused function param shit char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings, @@ -2181,21 +2202,20 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, // basic information if (!print_read_names) { - sprintf_append(sp, "%s\t%d\t%s\t%ld\t%c\t", + sprintf_append(sp, "%s\t%d\t%s\t%ld\t", ref_name, //ea.ref_name.c_str(), ea.ref_position, ea.ref_kmer, - (long)read_index, - 't'); //"tc"[ea.strand_idx]); + (long)read_index); } else { - sprintf_append(sp, "%s\t%d\t%s\t%s\t%c\t", + sprintf_append(sp, "%s\t%d\t%s\t%s\t", ref_name, //ea.ref_name.c_str(), ea.ref_position, ea.ref_kmer, - read_name, //sr.read_name.c_str(), - 't'); //"tc"[ea.strand_idx]); + read_name //sr.read_name.c_str(), + ); } // event information @@ -2212,15 +2232,15 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, // event_duration += get_duration_seconds(et, ea.event_idx, sample_rate) * len_curr; // } - uint32_t rank = get_kmer_rank(ea.model_kmer, kmer_size); - float model_mean = 0.0; - float model_stdv = 0.0; - // unscaled model parameters - if(ea.hmm_state != 'B') { - model_t model1 = model[rank]; - model_mean = model1.level_mean; - model_stdv = model1.level_stdv; - } + // uint32_t rank = get_kmer_rank(ea.model_kmer, kmer_size); + // float model_mean = 0.0; + // float model_stdv = 0.0; + // // unscaled model parameters + // if(ea.hmm_state != 'B') { + // model_t model1 = model[rank]; + // model_mean = model1.level_mean; + // model_stdv = model1.level_stdv; + // } uint64_t start_idx = (et->event)[ea.event_idx].start; //inclusive uint64_t end_idx = (et->event)[ea.event_idx].start + (uint64_t)((et->event)[ea.event_idx].length); //non-inclusive @@ -2249,13 +2269,12 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, event_duration /= length; - - float standard_level = (event_mean - model_mean) / (sqrt(scalings.var) * model_stdv); - sprintf_append(sp, "%d\t%.2f\t%.3f\t%.5f\t", ea.event_idx, event_mean, event_stdv, event_duration); - sprintf_append(sp, "%s\t%.2f\t%.2f\t%.2f", ea.model_kmer, - model_mean, - model_stdv, - standard_level); + // float standard_level = (event_mean - model_mean) / (sqrt(scalings.var) * model_stdv); + sprintf_append(sp, "%.2f\t%.3f\t%.5f\t", event_mean, event_stdv, event_duration); + // sprintf_append(sp, "%s\t%.2f\t%.2f\t%.2f", ea.model_kmer, + // model_mean, + // model_stdv, + // standard_level); if(write_signal_index) { if(n_collapse > 1){ diff --git a/src/f5cmisc.h b/src/f5cmisc.h index 54387868..694867cb 100644 --- a/src/f5cmisc.h +++ b/src/f5cmisc.h @@ -86,6 +86,7 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse, int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples); void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t write_samples, int8_t write_signal_index); +void emit_event_alignment_tsv_m6anet_header(FILE* fp, int8_t print_read_names, int8_t write_signal_index); void emit_sam_header(samFile* fp, const bam_hdr_t* hdr); char *emit_event_alignment_sam(char* read_name, bam_hdr_t* base_hdr, bam1_t* base_record, const std::vector& alignments, int8_t sam_out_version, diff --git a/src/meth_main.c b/src/meth_main.c index 1cc964dc..7d6401a8 100644 --- a/src/meth_main.c +++ b/src/meth_main.c @@ -601,7 +601,7 @@ int meth_main(int argc, char* argv[], int8_t mode) { } else if (paf_output){ //none } else if (m6anet_output){ - emit_event_alignment_tsv_header(stdout, print_read_names, 0, write_signal_index); + emit_event_alignment_tsv_m6anet_header(stdout, print_read_names, write_signal_index); } else{ emit_event_alignment_tsv_header(stdout, print_read_names, write_samples, write_signal_index); } From 3c03d2dbee73dc84b61ed9cbb26f6bf7abcd25fb Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Sun, 4 Aug 2024 20:27:33 +1000 Subject: [PATCH 4/7] update slow5lib and workflows --- .github/workflows/f5c-x86_64.yml | 35 +++++ slow5lib/Makefile | 4 +- slow5lib/include/slow5/slow5.h | 4 + slow5lib/include/slow5/slow5_defs.h | 2 +- slow5lib/include/slow5/slow5_mt.h | 2 - slow5lib/src/slow5.c | 94 ++++++++---- slow5lib/src/slow5_byte.h | 229 ++++++++++++++++++++++++++++ slow5lib/src/slow5_idx.c | 28 +++- slow5lib/src/slow5_idx.h | 4 +- slow5lib/src/slow5_press.c | 10 ++ 10 files changed, 365 insertions(+), 47 deletions(-) create mode 100644 slow5lib/src/slow5_byte.h diff --git a/.github/workflows/f5c-x86_64.yml b/.github/workflows/f5c-x86_64.yml index b7acd4e7..e01c3621 100644 --- a/.github/workflows/f5c-x86_64.yml +++ b/.github/workflows/f5c-x86_64.yml @@ -86,6 +86,19 @@ jobs: run: autoreconf --install && ./scripts/install-hts.sh && ./configure --disable-hdf5 && make -j8 disable_hdf5=1 - name: test run: test/test_slow5.sh + ubuntu_24: + name: Ubuntu 24 + runs-on: ubuntu-24.04 + steps: + - uses: actions/checkout@v2 + with: + submodules: recursive + - name: install packages + run: sudo apt-get update && sudo apt-get install -y zlib1g-dev + - name: build + run: autoreconf --install && ./scripts/install-hts.sh && ./configure --disable-hdf5 && make -j8 disable_hdf5=1 + - name: test + run: test/test_slow5.sh ubuntu_14_cuda_6_5: name: Ubuntu 14 cuda 6.5 runs-on: ubuntu-20.04 @@ -128,6 +141,28 @@ jobs: run: autoreconf --install && CC=gcc CXX=g++ CFLAGS="-Wno-implicit-function-declaration" ./scripts/install-hdf5.sh && ./scripts/install-hts.sh && ./configure --enable-localhdf5 && make -j8 - name: test run: make test + os_x_13: + name: OSX 13 + runs-on: macos-13 + steps: + - uses: actions/checkout@v2 + - name: install packages + run: brew install hdf5 autoconf automake + - name: build + run: autoreconf --install && ./scripts/install-hts.sh && ./configure && make -j8 + - name: test + run: make test + os_x_14: + name: OSX 14 + runs-on: macos-14 + steps: + - uses: actions/checkout@v2 + - name: install packages + run: brew install hdf5 autoconf automake + - name: build + run: autoreconf --install && ./scripts/install-hts.sh && ./configure && make -j8 disable_hdf5=1 + - name: test + run: test/test_slow5.sh ubuntu_arm: name: ubuntu arm runs-on: ubuntu-latest diff --git a/slow5lib/Makefile b/slow5lib/Makefile index c1913b58..7e7d95fd 100644 --- a/slow5lib/Makefile +++ b/slow5lib/Makefile @@ -55,10 +55,10 @@ $(SHAREDLIB): $(OBJ) $(SVBLIB) $(SVBLIB): make -C $(SVB) no_simd=$(no_simd) libstreamvbyte.a -$(BUILD_DIR)/slow5.o: src/slow5.c src/slow5_extra.h src/slow5_idx.h src/slow5_misc.h src/klib/ksort.h $(SLOW5_H) +$(BUILD_DIR)/slow5.o: src/slow5.c src/slow5_extra.h src/slow5_idx.h src/slow5_misc.h src/klib/ksort.h src/slow5_byte.h $(SLOW5_H) $(CC) $(CFLAGS) $(CPPFLAGS) $< -c -fpic -o $@ -$(BUILD_DIR)/slow5_idx.o: src/slow5_idx.c src/slow5_idx.h src/slow5_extra.h src/slow5_misc.h $(SLOW5_H) +$(BUILD_DIR)/slow5_idx.o: src/slow5_idx.c src/slow5_idx.h src/slow5_extra.h src/slow5_misc.h src/slow5_byte.h $(SLOW5_H) $(CC) $(CFLAGS) $(CPPFLAGS) $< -c -fpic -o $@ $(BUILD_DIR)/slow5_misc.o: src/slow5_misc.c src/slow5_misc.h include/slow5/slow5_error.h diff --git a/slow5lib/include/slow5/slow5.h b/slow5lib/include/slow5/slow5.h index b6ee89eb..dc944863 100755 --- a/slow5lib/include/slow5/slow5.h +++ b/slow5lib/include/slow5/slow5.h @@ -624,6 +624,10 @@ void slow5_set_log_level(enum slow5_log_level_opt log_level); //sets a global variable, so not thread safe void slow5_set_exit_condition(enum slow5_exit_condition_opt exit_condition); +//experimental +/* no error messages printed and not exit when a requested read ID is not found in index*/ +// being tested, do not use until added to documentation +void slow5_set_skip_rid(); //get the list of hdr data keys in sorted order (only the returned pointer must be freed, not the ones inside - subjet to change) //len is the numberof elements diff --git a/slow5lib/include/slow5/slow5_defs.h b/slow5lib/include/slow5/slow5_defs.h index 845df541..8fb27e68 100644 --- a/slow5lib/include/slow5/slow5_defs.h +++ b/slow5lib/include/slow5/slow5_defs.h @@ -44,7 +44,7 @@ The API documentation is available at https://hasindu2008.github.io/slow5tools/ */ // library version -#define SLOW5_LIB_VERSION "1.2.0-beta" +#define SLOW5_LIB_VERSION "1.2.0" // maximum file version supported by this library - independent of slow5 library version above // if updating change all 4 below diff --git a/slow5lib/include/slow5/slow5_mt.h b/slow5lib/include/slow5/slow5_mt.h index 51c66be7..4fc37df8 100644 --- a/slow5lib/include/slow5/slow5_mt.h +++ b/slow5lib/include/slow5/slow5_mt.h @@ -12,8 +12,6 @@ extern "C" { *** Easy Multi-thread API ************************************************************************* **************************************************************************************************/ -/**************** This API is still in beta stage, there could be bugs *************************/ - /* This is a easy multi-thread API that can fetch a batch of slow5 records using multiple threads in parallel. This API uses a fork-join thread model. It is not meant to be used by a programmer who has the expertise to write multi-threaded code and use the slow5 low-level API directly. diff --git a/slow5lib/src/slow5.c b/slow5lib/src/slow5.c index 6a57948b..79bfcea3 100644 --- a/slow5lib/src/slow5.c +++ b/slow5lib/src/slow5.c @@ -41,9 +41,9 @@ SOFTWARE. #include "slow5_extra.h" #include "slow5_idx.h" #include "slow5_misc.h" +#include "slow5_byte.h" #include "klib/ksort.h" - /* IMPORTANT: The comments in this are NOT the API documentation The API documentation is available at https://hasindu2008.github.io/slow5tools/ The comments here are for internal use and do not rely on them. Open a GitHub issue for any questions. @@ -86,6 +86,8 @@ static inline slow5_file_t *slow5_open_append(const char *filename, enum slow5_ enum slow5_log_level_opt slow5_log_level = SLOW5_LOG_INFO; enum slow5_exit_condition_opt slow5_exit_condition = SLOW5_EXIT_OFF; +int8_t slow5_bigend = 0; +int8_t slow5_skip_rid = 0; __thread int slow5_errno_intern = 0; @@ -207,7 +209,6 @@ struct slow5_file *slow5_init(FILE *fp, const char *pathname, enum slow5_fmt for * allocate memory for header, SLOW5 file structure and populate file number and offset * * errors - * SLOW5_ERR_OTH big endian machine * SLOW5_ERR_ARG fp is NULL * SLOW5_ERR_UNK format could not be determined from extension * SLOW5_ERR_MEM memory allocation failed @@ -216,9 +217,10 @@ struct slow5_file *slow5_init(FILE *fp, const char *pathname, enum slow5_fmt for struct slow5_file *slow5_init_empty(FILE *fp, const char *pathname, enum slow5_fmt format) { if (slow5_is_big_endian()) { - SLOW5_ERROR("%s","Big endian machine detected. slow5lib only support little endian at this time. Please open a github issue stating your machine spec ."); - slow5_errno = SLOW5_ERR_OTH; - return NULL; + SLOW5_WARNING("%s","Big endian machine detected and the support is experimental. Report issues at ."); + slow5_bigend = 1; + //slow5_errno = SLOW5_ERR_OTH; + //return NULL; } // pathname allowed to be NULL at this point if (!fp) { @@ -304,7 +306,6 @@ struct slow5_file *slow5_open(const char *pathname, const char *mode) { * slow5_close() should be called when finished with the structure. * * On error, NULL is returned and slow5_errno is set to indicate the error. - * SLOW5_ERR_OTH Big endian is not supported. * SLOW5_ERR_ARG The pathname or mode provided was NULL. * SLOW5_ERR_IO The file could not be opened. See errno for details. * slow5_init errors @@ -318,9 +319,10 @@ struct slow5_file *slow5_open(const char *pathname, const char *mode) { */ struct slow5_file *slow5_open_with(const char *pathname, const char *mode, enum slow5_fmt format) { if (slow5_is_big_endian()) { - SLOW5_ERROR_EXIT("%s", "Big endian machine detected. slow5lib only supports little endian at this time. Please open a github issue stating your machine spec ."); - slow5_errno = SLOW5_ERR_OTH; - return NULL; + SLOW5_WARNING("%s","Big endian machine detected and the support is experimental. Report issues at ."); + slow5_bigend = 1; + //slow5_errno = SLOW5_ERR_OTH; + //return NULL; } if (!pathname || !mode) { if (!pathname) { @@ -818,7 +820,7 @@ struct slow5_hdr *slow5_hdr_init(FILE *fp, enum slow5_fmt format, slow5_press_me } else if (fread(&record_method, sizeof record_method, 1, fp) != 1) { SLOW5_ERROR("Malformed blow5 header. Failed to read the record compression method.%s", feof(fp) ? " EOF reached." : ""); goto err_fread; - } else if (fread(&header->num_read_groups, sizeof header->num_read_groups, 1, fp) != 1) { + } else if (SLOW5_FREAD(&header->num_read_groups, sizeof header->num_read_groups, 1, fp) != 1) { SLOW5_ERROR("Malformed blow5 header. Failed to read the number of read groups.%s", feof(fp) ? " EOF reached." : ""); goto err_fread; } else if (slow5_signal_press_version_cmp(header->version) >= 0 && fread(&signal_method, sizeof signal_method, 1, fp) != 1) { @@ -829,7 +831,7 @@ struct slow5_hdr *slow5_hdr_init(FILE *fp, enum slow5_fmt format, slow5_press_me free(header); slow5_errno = SLOW5_ERR_IO; return NULL; - } else if (fread(&header_size, sizeof header_size, 1, fp) != 1) { + } else if (SLOW5_FREAD(&header_size, sizeof header_size, 1, fp) != 1) { SLOW5_ERROR("Malformed blow5 header. Failed to read the ascii header size.%s", feof(fp) ? " EOF reached." : ""); goto err_fread; } @@ -1000,7 +1002,7 @@ void *slow5_hdr_to_mem(struct slow5_hdr *header, enum slow5_fmt format, slow5_pr len += sizeof version->patch; memcpy(mem + len, &record_comp, sizeof record_comp); len += sizeof record_comp; - memcpy(mem + len, &header->num_read_groups, sizeof header->num_read_groups); + SLOW5_MEMCPY(mem + len, &header->num_read_groups, sizeof header->num_read_groups); len += sizeof header->num_read_groups; memcpy(mem + len, &signal_comp, sizeof signal_comp); len += sizeof signal_comp; @@ -1144,7 +1146,7 @@ void *slow5_hdr_to_mem(struct slow5_hdr *header, enum slow5_fmt format, slow5_pr mem[len] = '\0'; } else if (format == SLOW5_FORMAT_BINARY) { //write the header size in bytes (which was skipped previously) header_size = len - (SLOW5_BINARY_HDR_SIZE_OFFSET + sizeof header_size); - memcpy(mem + SLOW5_BINARY_HDR_SIZE_OFFSET, &header_size, sizeof header_size); + SLOW5_MEMCPY(mem + SLOW5_BINARY_HDR_SIZE_OFFSET, &header_size, sizeof header_size); } if (n != NULL) { @@ -2522,6 +2524,9 @@ int slow5_get(const char *read_id, struct slow5_rec **read, struct slow5_file *s size_t bytes; char *mem; if (!(mem = (char *)slow5_get_mem(read_id, &bytes, s5p))) { + if(slow5_errno == SLOW5_ERR_NOTFOUND && slow5_skip_rid == 1){ + return slow5_errno; + } SLOW5_EXIT_IF_ON_ERR(); return slow5_errno; } @@ -2816,7 +2821,7 @@ int slow5_rec_parse(char *read_mem, size_t read_size, const char *read_id, struc case COL_read_id: size = sizeof read->read_id_len; - memcpy(&read->read_id_len, read_mem + offset, size); + SLOW5_MEMCPY(&read->read_id_len, read_mem + offset, size); offset += size; size = read->read_id_len * sizeof *read->read_id; @@ -2842,37 +2847,37 @@ int slow5_rec_parse(char *read_mem, size_t read_size, const char *read_id, struc case COL_read_group: size = sizeof read->read_group; - memcpy(&read->read_group, read_mem + offset, size); + SLOW5_MEMCPY(&read->read_group, read_mem + offset, size); offset += size; break; case COL_digitisation: size = sizeof read->digitisation; - memcpy(&read->digitisation, read_mem + offset, size); + SLOW5_MEMCPY(&read->digitisation, read_mem + offset, size); offset += size; break; case COL_offset: size = sizeof read->offset; - memcpy(&read->offset, read_mem + offset, size); + SLOW5_MEMCPY(&read->offset, read_mem + offset, size); offset += size; break; case COL_range: size = sizeof read->range; - memcpy(&read->range, read_mem + offset, size); + SLOW5_MEMCPY(&read->range, read_mem + offset, size); offset += size; break; case COL_sampling_rate: size = sizeof read->sampling_rate; - memcpy(&read->sampling_rate, read_mem + offset, size); + SLOW5_MEMCPY(&read->sampling_rate, read_mem + offset, size); offset += size; break; case COL_len_raw_signal: size = sizeof read->len_raw_signal; - memcpy(&read->len_raw_signal, read_mem + offset, size); + SLOW5_MEMCPY(&read->len_raw_signal, read_mem + offset, size); offset += size; break; @@ -2902,6 +2907,7 @@ int slow5_rec_parse(char *read_mem, size_t read_size, const char *read_id, struc } memcpy(read->raw_signal, read_mem + offset, size); + SLOW5_BYTE_SWAP_ARRAY(read->raw_signal, read->len_raw_signal); offset += size; if (signal_method != SLOW5_COMPRESS_NONE) { @@ -3094,7 +3100,7 @@ static int slow5_rec_aux_parse(char *tok, char *read_mem, uint64_t offset, size_ if (SLOW5_IS_PTR(aux_meta->types[i])) { /* Type is an array */ size = sizeof len; - memcpy(&len, read_mem + offset, size); + SLOW5_MEMCPY(&len, read_mem + offset, size); offset += size; } @@ -3121,6 +3127,15 @@ static int slow5_rec_aux_parse(char *tok, char *read_mem, uint64_t offset, size_ return -1; } memcpy(data, read_mem + offset, bytes); + if(slow5_bigend){ + if(len == 1){ + SLOW5_BYTE_SWAP_VOID(data,bytes); + } else if (len > 1 && bytes/len==2){ + SLOW5_BYTE_SWAP_ARRAY_VOID(data, 2, len); + } else if (len > 1 && bytes/len>2){ + SLOW5_ENDIAN_ERROR("multi-byte array"); + } + } offset += bytes; } @@ -3243,6 +3258,7 @@ void *slow5_get_next_mem(size_t *n, const struct slow5_file *s5p) { } goto err; } + SLOW5_BYTE_SWAP(&bytes_tmp); bytes = bytes_tmp; mem = (char *) malloc(bytes); @@ -3936,19 +3952,19 @@ void *slow5_rec_to_mem(struct slow5_rec *read, struct slow5_aux_meta *aux_meta, mem = (char *) malloc(cap * sizeof *mem); SLOW5_MALLOC_CHK(mem); - memcpy(mem + curr_len, &read->read_id_len, sizeof read->read_id_len); + SLOW5_MEMCPY(mem + curr_len, &read->read_id_len, sizeof read->read_id_len); curr_len += sizeof read->read_id_len; memcpy(mem + curr_len, read->read_id, read->read_id_len * sizeof *read->read_id); curr_len += read->read_id_len * sizeof *read->read_id; - memcpy(mem + curr_len, &read->read_group, sizeof read->read_group); + SLOW5_MEMCPY(mem + curr_len, &read->read_group, sizeof read->read_group); curr_len += sizeof read->read_group; - memcpy(mem + curr_len, &read->digitisation, sizeof read->digitisation); + SLOW5_MEMCPY(mem + curr_len, &read->digitisation, sizeof read->digitisation); curr_len += sizeof read->digitisation; - memcpy(mem + curr_len, &read->offset, sizeof read->offset); + SLOW5_MEMCPY(mem + curr_len, &read->offset, sizeof read->offset); curr_len += sizeof read->offset; - memcpy(mem + curr_len, &read->range, sizeof read->range); + SLOW5_MEMCPY(mem + curr_len, &read->range, sizeof read->range); curr_len += sizeof read->range; - memcpy(mem + curr_len, &read->sampling_rate, sizeof read->sampling_rate); + SLOW5_MEMCPY(mem + curr_len, &read->sampling_rate, sizeof read->sampling_rate); curr_len += sizeof read->sampling_rate; size_t bytes_raw_sig = read->len_raw_signal * sizeof *read->raw_signal; @@ -3968,9 +3984,9 @@ void *slow5_rec_to_mem(struct slow5_rec *read, struct slow5_aux_meta *aux_meta, read->raw_signal = (int16_t *) raw_sig_svb; } - memcpy(mem + curr_len, &read->len_raw_signal, sizeof read->len_raw_signal); + SLOW5_MEMCPY(mem + curr_len, &read->len_raw_signal, sizeof read->len_raw_signal); curr_len += sizeof read->len_raw_signal; - memcpy(mem + curr_len, read->raw_signal, bytes_raw_sig); + memcpy(mem + curr_len, read->raw_signal, bytes_raw_sig); SLOW5_BYTE_SWAP_ARRAY_VOID(mem + curr_len, sizeof *read->raw_signal, read->len_raw_signal); curr_len += bytes_raw_sig; // Auxiliary fields @@ -3998,8 +4014,7 @@ void *slow5_rec_to_mem(struct slow5_rec *read, struct slow5_aux_meta *aux_meta, mem = (char *) realloc(mem, cap); SLOW5_MALLOC_CHK(mem); } - - memcpy(mem + curr_len, &aux_data.len, sizeof aux_data.len); + SLOW5_MEMCPY(mem + curr_len, &aux_data.len, sizeof aux_data.len); curr_len += sizeof aux_data.len; } else if (curr_len + aux_data.bytes >= cap) { // Realloc if necessary @@ -4010,6 +4025,15 @@ void *slow5_rec_to_mem(struct slow5_rec *read, struct slow5_aux_meta *aux_meta, if (aux_data.len != 0) { memcpy(mem + curr_len, aux_data.data, aux_data.bytes); + if(slow5_bigend){ + if(aux_data.len==1){ + SLOW5_BYTE_SWAP_VOID(mem + curr_len, aux_data.bytes); + } else if (aux_data.len > 1 && aux_data.bytes/aux_data.len==2) { + SLOW5_BYTE_SWAP_ARRAY_VOID(mem + curr_len, 2, aux_data.len); + } else if (aux_data.len > 1 && aux_data.bytes/aux_data.len>2) { + SLOW5_ENDIAN_ERROR("multi-byte array"); + } + } } curr_len += aux_data.bytes; @@ -4031,7 +4055,7 @@ void *slow5_rec_to_mem(struct slow5_rec *read, struct slow5_aux_meta *aux_meta, uint8_t *comp_mem_full = (uint8_t *) malloc(sizeof record_size + record_size); SLOW5_MALLOC_CHK(comp_mem_full); // Copy size of compressed record - memcpy(comp_mem_full, &record_size, sizeof record_size); + SLOW5_MEMCPY(comp_mem_full, &record_size, sizeof record_size); // Copy compressed record memcpy(comp_mem_full + sizeof record_size, comp_mem, record_size); free(comp_mem); @@ -4604,6 +4628,12 @@ void slow5_set_exit_condition(enum slow5_exit_condition_opt exit_condition) { slow5_exit_condition = exit_condition; } +/* no error messages printed and not exit when a requested read ID is not found in index*/ +// being tested, do not use until added to documentation +void slow5_set_skip_rid(){ + slow5_skip_rid = 1; +} + /* * is slow5 file at end? seek back, read and find out * return 0 if not at end and file pointer left unchanged diff --git a/slow5lib/src/slow5_byte.h b/slow5lib/src/slow5_byte.h new file mode 100644 index 00000000..624fdde8 --- /dev/null +++ b/slow5lib/src/slow5_byte.h @@ -0,0 +1,229 @@ +/** + * @file slow5_byte.h + * @brief SLOW5 byte handling functions + * @author Hasindu Gamaarachchi (hasindu@garvan.org.au) + * @date 05/05/2024 + */ + +/* +MIT License + +Copyright (c) 2020 Hasindu Gamaarachchi + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. +*/ + + +#ifndef SLOW5_BYTE_H +#define SLOW5_BYTE_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif /* _cplusplus */ + +/* This is for internal use only - do not use any of the following directly*/ + + +static inline void slow5_byte_swap(void *dest, const void *src, size_t size){ + if(size==2){ + + // uint16_t tmp = ((uint16_t*)src)[0]; + // tmp = (tmp << 8) | (tmp >> 8); + // ((uint16_t*)dest)[0] = tmp; + + char tmp0 = ((char*)src)[0]; + char tmp1 = ((char*)src)[1]; + ((char*)dest)[0] = tmp1; + ((char*)dest)[1] = tmp0; + + } else if(size==4){ + + // uint32_t tmp = ((uint32_t*)src)[0]; + // tmp = (tmp >> 24) | ((tmp << 8) & 0x00FF0000) | ((tmp >> 8) & 0x0000FF00) | (tmp << 24); + // ((uint32_t*)dest)[0] = tmp; + + char tmp0 = ((char*)src)[0]; + char tmp1 = ((char*)src)[1]; + char tmp2 = ((char*)src)[2]; + char tmp3 = ((char*)src)[3]; + ((char*)dest)[0] = tmp3; + ((char*)dest)[1] = tmp2; + ((char*)dest)[2] = tmp1; + ((char*)dest)[3] = tmp0; + + } else if(size==8){ + + // uint64_t tmp = ((uint64_t*)src)[0]; + // tmp = (tmp >> 56) | ((tmp << 40) & 0x00FF000000000000) | ((tmp << 24) & 0x0000FF0000000000) | + // ((tmp << 8) & 0x000000FF00000000) | ((tmp >> 8) & 0x00000000FF000000) | + // ((tmp >> 24) & 0x0000000000FF0000) | ((tmp >> 40) & 0x000000000000FF00) | (tmp << 56); + // ((uint64_t*)dest)[0] = tmp; + + char tmp0 = ((char*)src)[0]; + char tmp1 = ((char*)src)[1]; + char tmp2 = ((char*)src)[2]; + char tmp3 = ((char*)src)[3]; + char tmp4 = ((char*)src)[4]; + char tmp5 = ((char*)src)[5]; + char tmp6 = ((char*)src)[6]; + char tmp7 = ((char*)src)[7]; + ((char*)dest)[0] = tmp7; + ((char*)dest)[1] = tmp6; + ((char*)dest)[2] = tmp5; + ((char*)dest)[3] = tmp4; + ((char*)dest)[4] = tmp3; + ((char*)dest)[5] = tmp2; + ((char*)dest)[6] = tmp1; + ((char*)dest)[7] = tmp0; + + } +} + + +static inline int slow5_fwrite_bigend(const void *ptr, size_t size, size_t nitems, FILE *stream) { + + if(!(size==1 || size==2 || size==4 || size==8)){ + fprintf(stderr,"ERROR: slow5_fwrite_bigend is only implemented for data types of size %zu\n",size); + return -1; + } + if(nitems!=1){ + fprintf(stderr,"ERROR: slow5_fwrite_bigend is only implemented for nitems=1\n"); + return -1; + } + + if(size == 1){ + return fwrite(ptr, size, nitems, stream); + } else { + void *ptr_copy = malloc(size*nitems); + if(ptr_copy==NULL){ + fprintf(stderr,"[%s::ERROR]\033[1;31m malloc failled\033[0m\n", __func__); + fprintf(stderr,"At %s:%d\n", __FILE__, __LINE__-2); + return -1; + } + slow5_byte_swap(ptr_copy, ptr, size); + int ret = fwrite(ptr_copy, size, nitems, stream); + free(ptr_copy); + return ret; + } +} + +static inline int slow5_fread_bigend(void *ptr, size_t size, size_t nitems, FILE *stream) { + + if(!(size==1 || size==2 || size==4 || size==8)){ + fprintf(stderr,"[%s::ERROR]\033[1;31m slow5_fwrite_bigend is only implemented for data types of size %zu\033[0m\n",__func__,size); + fprintf(stderr,"At %s:%d\n", __FILE__, __LINE__-2); + return -1; + } + if(nitems!=1){ + fprintf(stderr,"[%s::ERROR]\033[1;31m slow5_fwrite_bigend is only implemented for nitems=1\033[0m\n", __func__); + fprintf(stderr,"At %s:%d\n", __FILE__, __LINE__-2); + return -1; + } + + if(size == 1){ + return fread(ptr, size, nitems, stream); + } else { + void *ptr_copy = malloc(size*nitems); + if(ptr_copy==NULL){ + fprintf(stderr,"[%s::ERROR]\033[1;31m malloc failled\033[0m\n", __func__); + fprintf(stderr,"At %s:%d\n", __FILE__, __LINE__-2); + return -1; + } + size_t ret = fread(ptr_copy, size, nitems, stream); + slow5_byte_swap(ptr, ptr_copy, size); + free(ptr_copy); + return ret; + } +} + +#define SLOW5_FWRITE(ptr, size, nitems, stream) \ + ( (slow5_bigend) ? (slow5_fwrite_bigend((ptr), (size), (nitems), (stream))) : (fwrite((ptr), (size), (nitems), (stream))) ) + +#define SLOW5_FREAD(ptr, size, nitems, stream) \ + ( (slow5_bigend) ? (slow5_fread_bigend((ptr), (size), (nitems), (stream))) : (fread((ptr), (size), (nitems), (stream))) ) + +static inline void *slow5_memcpy_bigend(void *dest, const void * src, size_t size) { + if(!(size==1 || size==2 || size==4 || size==8)){ + fprintf(stderr,"[%s::ERROR]\033[1;31m slow5_fwrite_bigend is only implemented for data types of size %zu\033[0m\n",__func__,size); + fprintf(stderr,"At %s:%d\n", __FILE__, __LINE__-2); + exit(EXIT_FAILURE); + } + slow5_byte_swap((dest), (src), (size)); + return (dest); +} + +#define SLOW5_MEMCPY(dest, src, size) \ + ( (slow5_bigend) ? (slow5_memcpy_bigend((dest), (src), (size))) : (memcpy((dest), (src), (size))) ) + + +#define SLOW5_BYTE_SWAP(ptr) \ + if(slow5_bigend){ \ + int size = sizeof(*(ptr)); \ + slow5_byte_swap((ptr), (ptr), size); \ + } + +#define SLOW5_BYTE_SWAP_VOID(ptr,size) \ + if(slow5_bigend){ \ + slow5_byte_swap((ptr), (ptr), size); \ + } + +#define SLOW5_BYTE_SWAP_ARRAY(ptr, nitems) \ + if(slow5_bigend){ \ + int size = sizeof(*(ptr)); \ + for(size_t i=0; i<(nitems); i++){ \ + slow5_byte_swap((ptr)+i, (ptr)+i, size); \ + } \ + } + +#define SLOW5_BYTE_SWAP_ARRAY_VOID(ptr, size, nitems) \ + if(slow5_bigend){ \ + if((size)!=2){ \ + fprintf(stderr,"[%s::ERROR]\033[1;31m Big Endian is not supported for the feature. Open an issue please.\033[0m\n", __func__); \ + fprintf(stderr,"At %s:%d\n", __FILE__, __LINE__-2); \ + exit(EXIT_FAILURE); \ + } \ + for(int64_t i=0; i<(nitems); i++){ \ + char *thisptr = ((char *)(ptr))+i*(size); \ + slow5_byte_swap(thisptr, thisptr, (size)); \ + } \ + } + + +#define SLOW5_ENDIAN_ERROR(MSG) \ + if(slow5_bigend){ \ + fprintf(stderr,"[%s::ERROR]\033[1;31m Big Endian is not supported for this" MSG "feature. Open an issue please.\033[0m\n", __func__); \ + fprintf(stderr,"At %s:%d\n", __FILE__, __LINE__-2); \ + exit(EXIT_FAILURE); \ + }\ + +#define SLOW5_ENDIAN_WARN \ + if(slow5_bigend){ \ + fprintf(stderr,"[%s::ERROR]\033[1;31m Big Endian is not supported for this multi-byte array feature. Don't trust any results. Open an issue please.\033[0m\n", __func__); \ + fprintf(stderr,"At %s:%d\n", __FILE__, __LINE__); \ + }\ + + +#ifdef __cplusplus +} +#endif /* _cplusplus */ + +#endif /* slow5_byte.h */ diff --git a/slow5lib/src/slow5_idx.c b/slow5lib/src/slow5_idx.c index d9aa2700..be389e20 100644 --- a/slow5lib/src/slow5_idx.c +++ b/slow5lib/src/slow5_idx.c @@ -6,10 +6,13 @@ //#include "slow5.h" #include "slow5_extra.h" #include "slow5_misc.h" +#include "slow5_byte.h" //#include "slow5_error.h" extern enum slow5_log_level_opt slow5_log_level; extern enum slow5_exit_condition_opt slow5_exit_condition; +extern int8_t slow5_bigend; +extern int8_t slow5_skip_rid; #define BUF_INIT_CAP (20*1024*1024) #define SLOW5_INDEX_BUF_INIT_CAP (64) // 2^6 TODO is this too little? @@ -270,6 +273,7 @@ static int slow5_idx_build(struct slow5_idx *index, struct slow5_file *s5p) { } return slow5_errno; } + SLOW5_BYTE_SWAP(&record_size); //if bigendian size = sizeof record_size + record_size; @@ -388,10 +392,16 @@ int slow5_idx_write(struct slow5_idx *index, struct slow5_version version) { struct slow5_rec_idx read_index = kh_value(index->hash, pos); slow5_rid_len_t read_id_len = strlen(index->ids[i]); - if (fwrite(&read_id_len, sizeof read_id_len, 1, index->fp) != 1 || - fwrite(index->ids[i], sizeof *index->ids[i], read_id_len, index->fp) != read_id_len || - fwrite(&read_index.offset, sizeof read_index.offset, 1, index->fp) != 1 || - fwrite(&read_index.size, sizeof read_index.size, 1, index->fp) != 1) { + if (SLOW5_FWRITE(&read_id_len, sizeof read_id_len, 1, index->fp) != 1){ + return SLOW5_ERR_IO; + } + if (fwrite(index->ids[i], sizeof *index->ids[i], read_id_len, index->fp) != read_id_len){ + return SLOW5_ERR_IO; + } + if (SLOW5_FWRITE(&read_index.offset, sizeof read_index.offset, 1, index->fp) != 1 ){ + return SLOW5_ERR_IO; + } + if (SLOW5_FWRITE(&read_index.size, sizeof read_index.size, 1, index->fp) != 1) { return SLOW5_ERR_IO; } } @@ -434,7 +444,7 @@ int slow5_idx_read(struct slow5_idx *index) { while (1) { slow5_rid_len_t read_id_len; - if (fread(&read_id_len, sizeof read_id_len, 1, index->fp) != 1) { + if (SLOW5_FREAD(&read_id_len, sizeof read_id_len, 1, index->fp) != 1) { SLOW5_ERROR("Malformed slow5 index. Failed to read the read ID length.%s", feof(index->fp) ? " Missing index end of file marker." : ""); if (feof(index->fp)) { slow5_errno = SLOW5_ERR_TRUNC; @@ -472,8 +482,8 @@ int slow5_idx_read(struct slow5_idx *index) { uint64_t offset; uint64_t size; - if (fread(&offset, sizeof offset, 1, index->fp) != 1 || - fread(&size, sizeof size, 1, index->fp) != 1) { + if (SLOW5_FREAD(&offset, sizeof offset, 1, index->fp) != 1 || + SLOW5_FREAD(&size, sizeof size, 1, index->fp) != 1) { return SLOW5_ERR_IO; } @@ -526,7 +536,9 @@ int slow5_idx_get(struct slow5_idx *index, const char *read_id, struct slow5_rec khint_t pos = kh_get(slow5_s2i, index->hash, read_id); if (pos == kh_end(index->hash)) { - SLOW5_ERROR("Read ID '%s' was not found.", read_id) + if (slow5_skip_rid == 0) { + SLOW5_ERROR("Read ID '%s' was not found.", read_id) + } ret = -1; } else if (read_index) { *read_index = kh_value(index->hash, pos); diff --git a/slow5lib/src/slow5_idx.h b/slow5lib/src/slow5_idx.h index 05a328ed..77d9d6a4 100644 --- a/slow5lib/src/slow5_idx.h +++ b/slow5lib/src/slow5_idx.h @@ -11,8 +11,8 @@ extern "C" { #endif /* -IMPORTANT: The low-level API is not yet finalised or documented and is only for internal use. -If anyone is interested, please open a GitHub issue, rather than trying to figure out from the code. +IMPORTANT: These are functions for internal use. +If anyone is interested in getting any of these functions exposed, please open a GitHub issue. Function prototypes can be changed without notice or completely removed. So do NOT use these functions in your code. these functions are used by slow5tools and pyslow5 - so any change to a function here means slow5tools and pyslow5 must be fixed. */ diff --git a/slow5lib/src/slow5_press.c b/slow5lib/src/slow5_press.c index adef8684..7f5d200a 100644 --- a/slow5lib/src/slow5_press.c +++ b/slow5lib/src/slow5_press.c @@ -17,6 +17,7 @@ extern enum slow5_log_level_opt slow5_log_level; extern enum slow5_exit_condition_opt slow5_exit_condition; +extern int8_t slow5_bigend; /* zlib */ static int zlib_init_deflate(z_stream *strm); @@ -1052,6 +1053,11 @@ static uint8_t *ptr_compress_svb(const uint32_t *ptr, size_t count, size_t *n) { /* return NULL on malloc error, n cannot be NULL */ static uint8_t *ptr_compress_svb_zd(const int16_t *ptr, size_t count, size_t *n) { + + if(slow5_bigend){ + SLOW5_ERROR_EXIT("%s","Compression of SVB-ZD on big-endian architectures is not supported yet."); + } + uint32_t length = count / sizeof *ptr; int32_t *in = (int32_t *) malloc(length * sizeof *in); if (!in) { @@ -1109,6 +1115,10 @@ static uint32_t *ptr_depress_svb(const uint8_t *ptr, size_t count, size_t *n) { /* return NULL on malloc error, n cannot be NULL */ static int16_t *ptr_depress_svb_zd(const uint8_t *ptr, size_t count, size_t *n) { + if(slow5_bigend){ + SLOW5_ERROR_EXIT("%s","Decompression of SVB-ZD on big-endian architectures is not supported yet."); + } + uint32_t *diff = ptr_depress_svb(ptr, count, n); if (!diff) { return NULL; From 1935adb1160d8a922dc6b481bef4d2ba8bfb6cea Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Fri, 9 Aug 2024 19:38:05 +1000 Subject: [PATCH 5/7] fix the spamming warning due to the missing reads from split read basecalling --- src/f5c.c | 1 + src/f5cio.c | 5 +++-- src/meth_main.c | 12 +++++++++++- src/resquiggle.c | 18 ++++++++++++++++-- 4 files changed, 31 insertions(+), 5 deletions(-) diff --git a/src/f5c.c b/src/f5c.c index 0f396be4..dbc6f5c6 100644 --- a/src/f5c.c +++ b/src/f5c.c @@ -247,6 +247,7 @@ core_t* init_core(const char* bamfilename, const char* fastafile, STDERR("Error in loading SLOW5 index for %s\n",slow5file); exit(EXIT_FAILURE); } + slow5_set_skip_rid(); if(drna_detect(core->sf)) { opt.flag |= F5C_RNA; diff --git a/src/f5cio.c b/src/f5cio.c index ec772368..274ebbb7 100644 --- a/src/f5cio.c +++ b/src/f5cio.c @@ -433,9 +433,10 @@ void read_slow5_single(core_t* core, db_t* db, int i){ if(record==NULL || len <0){ //todo : should we free if len<0 - db->bad_fast5_file++; + //db->bad_fast5_file++; + __sync_fetch_and_add(&db->bad_fast5_file,1); if (core->opt.flag & F5C_SKIP_UNREADABLE) { - WARNING("Slow5 record for read [%s] is unavailable/unreadable and will be skipped", qname.c_str()); + //WARNING("Slow5 record for read [%s] is unavailable/unreadable and will be skipped", qname.c_str()); db->sig[i]->nsample = 0; db->sig[i]->rawptr = NULL; } else { diff --git a/src/meth_main.c b/src/meth_main.c index 7d6401a8..126905f1 100644 --- a/src/meth_main.c +++ b/src/meth_main.c @@ -826,7 +826,17 @@ int meth_main(int argc, char* argv[], int8_t mode) { if(core->qc_fail_reads + core->failed_calibration_reads + core->failed_alignment_reads > core->total_reads/2){ WARNING("%ld out of %ld reads failed. Double check --pore and --rna options.",(long)core->qc_fail_reads + core->failed_calibration_reads + core->failed_alignment_reads,(long)core->total_reads); } - + if(core->bad_fast5_file > core->total_reads/10.0){ + WARNING("%ld out of %ld reads missing in FAST5/SLOW5.",(long)core->bad_fast5_file,(long)core->total_reads); + } + if(core->qc_fail_reads + core->failed_calibration_reads + core->failed_alignment_reads + core->bad_fast5_file >= core->total_reads){ + ERROR("all %ld out of %ld reads failed. Check the input files.",(long)core->total_reads,(long)core->total_reads); + exit(EXIT_FAILURE); + } + if(core->total_reads==0){ + ERROR("%s","0 reads processed. Check the input files."); + exit(EXIT_FAILURE); + } //free the core data structure free_core(core,opt); diff --git a/src/resquiggle.c b/src/resquiggle.c index c5965820..e1de676d 100644 --- a/src/resquiggle.c +++ b/src/resquiggle.c @@ -95,6 +95,7 @@ core_t* init_core_rsq(opt_t opt, const char *slow5file, double realtime0) { STDERR("Error in loading SLOW5 index for %s\n",slow5file); exit(EXIT_FAILURE); } + slow5_set_skip_rid(); if(drna_detect(core->sf)) { core->opt.flag |= F5C_RNA; opt.flag |= F5C_RNA; @@ -483,9 +484,10 @@ static void read_slow5_single(core_t* core, db_t* db, int i){ MALLOC_CHK(db->sig[i]); if(record==NULL || len <0){ //todo : should we free if len<0 - db->bad_fast5_file++; + __sync_fetch_and_add(&db->bad_fast5_file,1); + //db->bad_fast5_file++; if (core->opt.flag & F5C_SKIP_UNREADABLE) { - WARNING("Slow5 record for read [%s] is unavailable/unreadable and will be skipped", db->read_id[i]); + //WARNING("Slow5 record for read [%s] is unavailable/unreadable and will be skipped", db->read_id[i]); db->sig[i]->nsample = 0; db->sig[i]->rawptr = NULL; } else { @@ -785,9 +787,21 @@ int resquiggle_main(int argc, char **argv) { fprintf(stderr,"\n"); + //todo duplicate code from meth_main.c - can be modularised if(core->qc_fail_reads + core->failed_calibration_reads + core->failed_alignment_reads > core->total_reads/2){ WARNING("%ld out of %ld reads failed. Double check --pore and --rna options.",(long)core->qc_fail_reads + core->failed_calibration_reads + core->failed_alignment_reads,(long)core->total_reads); } + if(core->bad_fast5_file > core->total_reads/10.0){ + WARNING("%ld out of %ld reads missing in SLOW5.",(long)core->bad_fast5_file,(long)core->total_reads); + } + if(core->qc_fail_reads + core->failed_calibration_reads + core->failed_alignment_reads + core->bad_fast5_file >= core->total_reads){ + ERROR("all %ld out of %ld reads failed. Check the input files.",(long)core->total_reads,(long)core->total_reads); + exit(EXIT_FAILURE); + } + if(core->total_reads==0){ + ERROR("%s","0 reads processed. Check the input files."); + exit(EXIT_FAILURE); + } kseq_destroy(seq); gzclose(fp); From 1395b323ee7d9f52e29582ee8b7b0e5651bbcbbc Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Fri, 9 Aug 2024 19:40:59 +1000 Subject: [PATCH 6/7] do not bother with hdf5 on macos 14 --- .github/workflows/f5c-x86_64.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/f5c-x86_64.yml b/.github/workflows/f5c-x86_64.yml index e01c3621..7c831f6a 100644 --- a/.github/workflows/f5c-x86_64.yml +++ b/.github/workflows/f5c-x86_64.yml @@ -160,7 +160,7 @@ jobs: - name: install packages run: brew install hdf5 autoconf automake - name: build - run: autoreconf --install && ./scripts/install-hts.sh && ./configure && make -j8 disable_hdf5=1 + run: autoreconf --install && ./scripts/install-hts.sh && ./configure --disable-hdf5 - name: test run: test/test_slow5.sh ubuntu_arm: From 281aae91ac5acaf75ef5e01784e5ace3c856d440 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Fri, 9 Aug 2024 19:50:56 +1000 Subject: [PATCH 7/7] do not bother with hdf5 on macos 14 --- .github/workflows/f5c-x86_64.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/f5c-x86_64.yml b/.github/workflows/f5c-x86_64.yml index 7c831f6a..40a5ba5f 100644 --- a/.github/workflows/f5c-x86_64.yml +++ b/.github/workflows/f5c-x86_64.yml @@ -160,7 +160,7 @@ jobs: - name: install packages run: brew install hdf5 autoconf automake - name: build - run: autoreconf --install && ./scripts/install-hts.sh && ./configure --disable-hdf5 + run: autoreconf --install && ./scripts/install-hts.sh && ./configure --disable-hdf5 && make -j8 - name: test run: test/test_slow5.sh ubuntu_arm: