From 1f3a88b3a243a0d5443e727f21c4746c1bdfc65a Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Thu, 18 Apr 2024 10:23:41 +1000 Subject: [PATCH 1/3] attempt to collapse in m6anet style --- src/eventalign.c | 138 +++++++++++++++++++++++++++++++++++++++++++++++ src/f5c.c | 4 ++ src/f5c.h | 1 + src/f5cmisc.h | 5 ++ src/meth_main.c | 19 +++++++ 5 files changed, 167 insertions(+) diff --git a/src/eventalign.c b/src/eventalign.c index efc764ac..956dde21 100644 --- a/src/eventalign.c +++ b/src/eventalign.c @@ -2161,6 +2161,144 @@ char *emit_event_alignment_tsv(uint32_t strand_idx, return sp->s; } +//cleanup unused function param shit +char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, + const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings, + const std::vector& alignments, + int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse, + int64_t read_index, char* read_name, char *ref_name,float sample_rate, float *rawptr) +{ + + kstring_t str; + kstring_t *sp = &str; + str_init(sp, sizeof(char)*alignments.size()*120); + + size_t n_collapse = 1; + for(size_t i = 0; i < alignments.size(); i+=n_collapse) { + + const event_alignment_t& ea = alignments[i]; + + // basic information + if (!print_read_names) + { + sprintf_append(sp, "%s\t%d\t%s\t%ld\t%c\t", + ref_name, //ea.ref_name.c_str(), + ea.ref_position, + ea.ref_kmer, + (long)read_index, + 't'); //"tc"[ea.strand_idx]); + } + else + { + sprintf_append(sp, "%s\t%d\t%s\t%s\t%c\t", + ref_name, //ea.ref_name.c_str(), + ea.ref_position, + ea.ref_kmer, + read_name, //sr.read_name.c_str(), + 't'); //"tc"[ea.strand_idx]); + } + + // event information + uint64_t length = 0; + float event_mean = 0; + float event_stdv = 0; + float event_duration = 0; + + // if(strcmp(ea.ref_kmer,ea.model_kmer)==0){ //the ref and model kmers should match + // uint64_t len_curr = (uint64_t)((et->event)[ea.event_idx].length); + // length += len_curr; + // event_mean += get_fully_scaled_level((et->event)[ea.event_idx].mean,scalings) * len_curr; + // event_stdv += (et->event)[ea.event_idx].stdv * len_curr; + // event_duration += get_duration_seconds(et, ea.event_idx, sample_rate) * len_curr; + // } + + uint32_t rank = get_kmer_rank(ea.model_kmer, kmer_size); + float model_mean = 0.0; + float model_stdv = 0.0; + // unscaled model parameters + if(ea.hmm_state != 'B') { + model_t model1 = model[rank]; + model_mean = model1.level_mean; + model_stdv = model1.level_stdv; + } + + //uint64_t start_idx = (et->event)[ea.event_idx].start; //inclusive + //uint64_t end_idx = (et->event)[ea.event_idx].start + (uint64_t)((et->event)[ea.event_idx].length); //non-inclusive + + + n_collapse = 0; + while (i + n_collapse < alignments.size() && ea.ref_position == alignments[i+n_collapse].ref_position){ + assert(strcmp(ea.ref_kmer,alignments[i+n_collapse].ref_kmer)==0); + // if(strcmp(ea.model_kmer,alignments[i+n_collapse].model_kmer)!=0){ //TODO: NNNN kmers must be handled + // fprintf(stderr, "model kmer does not match! %s vs %s\n",ea.model_kmer,alignments[i+n_collapse].model_kmer); + // } + const event_alignment_t& ea_curr = alignments[i+n_collapse]; + + if(strcmp(ea_curr.ref_kmer,ea_curr.model_kmer)==0){ //the ref and model kmers should match + uint64_t len_curr = (uint64_t)((et->event)[ea_curr.event_idx].length); + length += len_curr; + event_mean += get_fully_scaled_level((et->event)[ea_curr.event_idx].mean,scalings) * len_curr; + event_stdv += (et->event)[ea_curr.event_idx].stdv * len_curr; + event_duration += get_duration_seconds(et, ea_curr.event_idx, sample_rate) * len_curr; + } + + n_collapse++; + } + event_mean /= length; + event_stdv /= length; + event_duration /= length; + + // if(scale_events) { + + // // scale reads to the model + // event_mean = get_fully_scaled_level(event_mean, scalings); + + // // unscaled model parameters + // if(ea.hmm_state != 'B') { + // model_t model1 = model[rank]; + // model_mean = model1.level_mean; + // model_stdv = model1.level_stdv; + // } + // } else { + + // // scale model to the reads + // if(ea.hmm_state != 'B') { + + // model_t model1 = get_scaled_gaussian_from_pore_model_state(model, scalings, rank); + // model_mean = model1.level_mean; + // model_stdv = model1.level_stdv; + // } + // } + + float standard_level = (event_mean - model_mean) / (sqrt(scalings.var) * model_stdv); + sprintf_append(sp, "%d\t%.2f\t%.3f\t%.5f\t", ea.event_idx, event_mean, event_stdv, event_duration); + sprintf_append(sp, "%s\t%.2f\t%.2f\t%.2f", ea.model_kmer, + model_mean, + model_stdv, + standard_level); + + // if(write_signal_index) { + // sprintf_append(sp, "\t%lu\t%lu", start_idx, end_idx); + // } + + // if(write_samples) { + // std::vector samples = get_scaled_samples(rawptr, start_idx, end_idx, scalings); + // std::stringstream sample_ss; + // std::copy(samples.begin(), samples.end(), std::ostream_iterator(sample_ss, ",")); + + // // remove trailing comma + // std::string sample_str = sample_ss.str(); + // sample_str.resize(sample_str.size() - 1); + // sprintf_append(sp, "\t%s", sample_str.c_str()); + // } + sprintf_append(sp, "\n"); + } + + + //str_free(sp); //freeing is later done in free_db_tmp() + return sp->s; +} + char *emit_event_alignment_paf(const event_table* et, int64_t len_raw_signal, int64_t ref_len, uint32_t kmer_size, scalings_t scalings, const std::vector& alignments, bam1_t* bam_record, char* read_name, char *ref_name, int8_t rna) diff --git a/src/f5c.c b/src/f5c.c index 319ca4c8..8baec8c1 100644 --- a/src/f5c.c +++ b/src/f5c.c @@ -863,6 +863,7 @@ void eventalign_single(core_t* core, db_t* db, int32_t i){ int8_t write_signal_index = (core->opt.flag & F5C_PRINT_SIGNAL_INDEX) ? 1 : 0; int8_t sam_output = (core->opt.flag & F5C_SAM) ? 1 : 0; int8_t paf_output = (core->opt.flag & F5C_PAF) ? 1 : 0; + int8_t m6anet_output = (core->opt.flag & F5C_M6ANET) ? 1 : 0; int8_t rna = (core->opt.flag & F5C_RNA) ? 1 : 0; if(paf_output){ @@ -872,6 +873,9 @@ void eventalign_single(core_t* core, db_t* db, int32_t i){ int8_t sam_out_version = core->opt.sam_out_version; int64_t ref_len = core->m_hdr->target_len[db->bam_rec[i]->core.tid]; db->event_alignment_result_str[i] = emit_event_alignment_sam(qname, core->m_hdr, db->bam_rec[i], *event_alignment_result, sam_out_version, &(db->et[i]), db->sig[i]->nsample, ref_len, rna, db->scalings[i]); + } else if (m6anet_output){ + db->event_alignment_result_str[i] = emit_event_alignment_tsv_m6anet(0,&(db->et[i]),core->model,core->kmer_size, db->scalings[i],*event_alignment_result, print_read_names, scale_events, write_samples, write_signal_index, collapse_events, + db->read_idx[i], qname, contig, db->sig[i]->sample_rate, db->sig[i]->rawptr); } else { db->event_alignment_result_str[i] = emit_event_alignment_tsv(0,&(db->et[i]),core->model,core->kmer_size, db->scalings[i],*event_alignment_result, print_read_names, scale_events, write_samples, write_signal_index, collapse_events, db->read_idx[i], qname, contig, db->sig[i]->sample_rate, db->sig[i]->rawptr); diff --git a/src/f5c.h b/src/f5c.h index a3dbae51..51e157ae 100644 --- a/src/f5c.h +++ b/src/f5c.h @@ -57,6 +57,7 @@ #define F5C_COLLAPSE_EVENTS 0x20000 //collapse events #define F5C_R10 0x40000 //r10 #define F5C_PAF 0x80000 //paf (eventalign only) +#define F5C_M6ANET 0x100000 //m6anet (eventalign only) /************************************************************* * flags for a read status (related to db_t->read_stat_flag) * diff --git a/src/f5cmisc.h b/src/f5cmisc.h index 821857c4..54387868 100644 --- a/src/f5cmisc.h +++ b/src/f5cmisc.h @@ -80,6 +80,11 @@ char *emit_event_alignment_tsv(uint32_t strand_idx, const std::vector& alignments, int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse, int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples); +char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, + const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings, + const std::vector& alignments, + int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse, + int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples); void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t write_samples, int8_t write_signal_index); void emit_sam_header(samFile* fp, const bam_hdr_t* hdr); char *emit_event_alignment_sam(char* read_name, bam_hdr_t* base_hdr, bam1_t* base_record, diff --git a/src/meth_main.c b/src/meth_main.c index 4d9e0e58..2b2f7838 100644 --- a/src/meth_main.c +++ b/src/meth_main.c @@ -105,6 +105,7 @@ static struct option long_options[] = { {"pore",required_argument,0,0}, //46 pore {"paf",no_argument,0,'c'}, //47 if print in paf format (only for eventalign) {"sam-out-version",required_argument,0,0}, //48 specify the version of the sam output for eventalign (eventalign only) + {"m6anet",no_argument,0,0}, //49 m6anet output (eventalign only) {0, 0, 0, 0}}; @@ -447,6 +448,12 @@ int meth_main(int argc, char* argv[], int8_t mode) { exit(EXIT_FAILURE); } is_sam_out_version_set = 1; + } else if (c == 0 && longindex == 49){ //m6anet output + if(mode!=1){ + ERROR("%s","Option --m6anet is available only in eventalign"); + exit(EXIT_FAILURE); + } + yes_or_no(&opt, F5C_M6ANET, longindex, "yes", 1); } } @@ -513,6 +520,7 @@ int meth_main(int argc, char* argv[], int8_t mode) { fprintf(fp_help," --paf write output in PAF format\n"); fprintf(fp_help," --sam write output in SAM format\n"); fprintf(fp_help," --sam-out-version INT sam output version (set 1 to revert to old nanopolish style format) [%d]\n",opt.meth_out_version); + fprintf(fp_help," --m6anet write output in m6anet format\n"); fprintf(fp_help," --print-read-names print read names instead of indexes\n"); fprintf(fp_help," --scale-events scale events to the model, rather than vice-versa\n"); @@ -573,16 +581,27 @@ int meth_main(int argc, char* argv[], int8_t mode) { int8_t write_signal_index = (core->opt.flag & F5C_PRINT_SIGNAL_INDEX) ? 1 : 0; int8_t sam_output = (core->opt.flag & F5C_SAM) ? 1 : 0 ; int8_t paf_output = (core->opt.flag & F5C_PAF) ? 1 : 0 ; + int8_t m6anet_output = (core->opt.flag & F5C_M6ANET) ? 1 : 0 ; if(sam_output && paf_output){ ERROR("%s","-c and --sam cannot be used together"); exit(EXIT_FAILURE); } + if(sam_output && m6anet_output){ + ERROR("%s","-c and --m6anet cannot be used together"); + exit(EXIT_FAILURE); + } + if(paf_output && m6anet_output){ + ERROR("%s","--paf and --m6anet cannot be used together"); + exit(EXIT_FAILURE); + } if(sam_output){ emit_sam_header(core->sam_output, core->m_hdr); } else if (paf_output){ //none + } else if (m6anet_output){ + emit_event_alignment_tsv_header(stdout, 0, 0, 0); } else{ emit_event_alignment_tsv_header(stdout, print_read_names, write_samples, write_signal_index); } From 9d6892901a4af5ea60cb1af0dac19da71b5ace19 Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Thu, 18 Apr 2024 10:34:26 +1000 Subject: [PATCH 2/3] signal-index option now --- src/eventalign.c | 51 ++++++++++++++++-------------------------------- src/meth_main.c | 2 +- 2 files changed, 18 insertions(+), 35 deletions(-) diff --git a/src/eventalign.c b/src/eventalign.c index 956dde21..dd2273d1 100644 --- a/src/eventalign.c +++ b/src/eventalign.c @@ -2222,8 +2222,8 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, model_stdv = model1.level_stdv; } - //uint64_t start_idx = (et->event)[ea.event_idx].start; //inclusive - //uint64_t end_idx = (et->event)[ea.event_idx].start + (uint64_t)((et->event)[ea.event_idx].length); //non-inclusive + uint64_t start_idx = (et->event)[ea.event_idx].start; //inclusive + uint64_t end_idx = (et->event)[ea.event_idx].start + (uint64_t)((et->event)[ea.event_idx].length); //non-inclusive n_collapse = 0; @@ -2248,27 +2248,7 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, event_stdv /= length; event_duration /= length; - // if(scale_events) { - - // // scale reads to the model - // event_mean = get_fully_scaled_level(event_mean, scalings); - // // unscaled model parameters - // if(ea.hmm_state != 'B') { - // model_t model1 = model[rank]; - // model_mean = model1.level_mean; - // model_stdv = model1.level_stdv; - // } - // } else { - - // // scale model to the reads - // if(ea.hmm_state != 'B') { - - // model_t model1 = get_scaled_gaussian_from_pore_model_state(model, scalings, rank); - // model_mean = model1.level_mean; - // model_stdv = model1.level_stdv; - // } - // } float standard_level = (event_mean - model_mean) / (sqrt(scalings.var) * model_stdv); sprintf_append(sp, "%d\t%.2f\t%.3f\t%.5f\t", ea.event_idx, event_mean, event_stdv, event_duration); @@ -2277,20 +2257,23 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, model_stdv, standard_level); - // if(write_signal_index) { - // sprintf_append(sp, "\t%lu\t%lu", start_idx, end_idx); - // } + if(write_signal_index) { + if(n_collapse > 1){ + uint64_t start_idx1 = start_idx; + uint64_t end_idx1 = end_idx; - // if(write_samples) { - // std::vector samples = get_scaled_samples(rawptr, start_idx, end_idx, scalings); - // std::stringstream sample_ss; - // std::copy(samples.begin(), samples.end(), std::ostream_iterator(sample_ss, ",")); + const event_alignment_t& ea2 = alignments[i+n_collapse-1]; + uint64_t start_idx2 = (et->event)[ea2.event_idx].start; + uint64_t end_idx2 = (et->event)[ea2.event_idx].start + (uint64_t)((et->event)[ea2.event_idx].length); + + //min + start_idx = start_idx1 < start_idx2 ? start_idx1 : start_idx2; + //max + end_idx = end_idx1 > end_idx2 ? end_idx1 : end_idx2; + } + sprintf_append(sp, "\t%lu\t%lu", start_idx, end_idx); + } - // // remove trailing comma - // std::string sample_str = sample_ss.str(); - // sample_str.resize(sample_str.size() - 1); - // sprintf_append(sp, "\t%s", sample_str.c_str()); - // } sprintf_append(sp, "\n"); } diff --git a/src/meth_main.c b/src/meth_main.c index 2b2f7838..1cc964dc 100644 --- a/src/meth_main.c +++ b/src/meth_main.c @@ -601,7 +601,7 @@ int meth_main(int argc, char* argv[], int8_t mode) { } else if (paf_output){ //none } else if (m6anet_output){ - emit_event_alignment_tsv_header(stdout, 0, 0, 0); + emit_event_alignment_tsv_header(stdout, print_read_names, 0, write_signal_index); } else{ emit_event_alignment_tsv_header(stdout, print_read_names, write_samples, write_signal_index); } From e16132aa28da69c09042fcd88496b29306d207db Mon Sep 17 00:00:00 2001 From: Hasindu Gamaarachchi Date: Wed, 24 Apr 2024 14:26:47 +1000 Subject: [PATCH 3/3] update the output format --- src/eventalign.c | 63 +++++++++++++++++++++++++++++++----------------- src/f5cmisc.h | 1 + src/meth_main.c | 2 +- 3 files changed, 43 insertions(+), 23 deletions(-) diff --git a/src/eventalign.c b/src/eventalign.c index dd2273d1..83f5929c 100644 --- a/src/eventalign.c +++ b/src/eventalign.c @@ -1659,6 +1659,20 @@ void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t w fprintf(fp, "\n"); } +// Note: reference_kmer will be same as the model_kmer in this mode +// Also, event_level_mean and event_stdv are always scaled to the model (--scale-events is implicit) +void emit_event_alignment_tsv_m6anet_header(FILE* fp, int8_t print_read_names, int8_t write_signal_index) +{ + fprintf(fp, "%s\t%s\t%s\t%s\t", "contig", "position", "reference_kmer", + (print_read_names? "read_name" : "read_index")); + fprintf(fp, "%s\t%s\t%s\t", "event_level_mean", "event_stdv", "event_length"); + + if(write_signal_index) { + fprintf(fp, "\t%s\t%s", "start_idx", "end_idx"); + } + + fprintf(fp, "\n"); +} typedef struct { uint64_t start_raw; @@ -2161,6 +2175,13 @@ char *emit_event_alignment_tsv(uint32_t strand_idx, return sp->s; } +// algo: +// 1. Remove rows in eventalign.tsv where reference kmer does not match model kmer +// 2. For each row in eventalign.tsv, length of each event = end_idx - start_idx +// 3. Group by each read and each reference position +// - collapsed_event_level_mean = sum(event_level_mean * length) / sum(length) +// - collapsed_event_stdv = sum(event_stdv * length) / sum(length) +// - collapsed_event_duration = sum(event_duration * length) / sum(length) //cleanup unused function param shit char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings, @@ -2181,21 +2202,20 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, // basic information if (!print_read_names) { - sprintf_append(sp, "%s\t%d\t%s\t%ld\t%c\t", + sprintf_append(sp, "%s\t%d\t%s\t%ld\t", ref_name, //ea.ref_name.c_str(), ea.ref_position, ea.ref_kmer, - (long)read_index, - 't'); //"tc"[ea.strand_idx]); + (long)read_index); } else { - sprintf_append(sp, "%s\t%d\t%s\t%s\t%c\t", + sprintf_append(sp, "%s\t%d\t%s\t%s\t", ref_name, //ea.ref_name.c_str(), ea.ref_position, ea.ref_kmer, - read_name, //sr.read_name.c_str(), - 't'); //"tc"[ea.strand_idx]); + read_name //sr.read_name.c_str(), + ); } // event information @@ -2212,15 +2232,15 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, // event_duration += get_duration_seconds(et, ea.event_idx, sample_rate) * len_curr; // } - uint32_t rank = get_kmer_rank(ea.model_kmer, kmer_size); - float model_mean = 0.0; - float model_stdv = 0.0; - // unscaled model parameters - if(ea.hmm_state != 'B') { - model_t model1 = model[rank]; - model_mean = model1.level_mean; - model_stdv = model1.level_stdv; - } + // uint32_t rank = get_kmer_rank(ea.model_kmer, kmer_size); + // float model_mean = 0.0; + // float model_stdv = 0.0; + // // unscaled model parameters + // if(ea.hmm_state != 'B') { + // model_t model1 = model[rank]; + // model_mean = model1.level_mean; + // model_stdv = model1.level_stdv; + // } uint64_t start_idx = (et->event)[ea.event_idx].start; //inclusive uint64_t end_idx = (et->event)[ea.event_idx].start + (uint64_t)((et->event)[ea.event_idx].length); //non-inclusive @@ -2249,13 +2269,12 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, event_duration /= length; - - float standard_level = (event_mean - model_mean) / (sqrt(scalings.var) * model_stdv); - sprintf_append(sp, "%d\t%.2f\t%.3f\t%.5f\t", ea.event_idx, event_mean, event_stdv, event_duration); - sprintf_append(sp, "%s\t%.2f\t%.2f\t%.2f", ea.model_kmer, - model_mean, - model_stdv, - standard_level); + // float standard_level = (event_mean - model_mean) / (sqrt(scalings.var) * model_stdv); + sprintf_append(sp, "%.2f\t%.3f\t%.5f\t", event_mean, event_stdv, event_duration); + // sprintf_append(sp, "%s\t%.2f\t%.2f\t%.2f", ea.model_kmer, + // model_mean, + // model_stdv, + // standard_level); if(write_signal_index) { if(n_collapse > 1){ diff --git a/src/f5cmisc.h b/src/f5cmisc.h index 54387868..694867cb 100644 --- a/src/f5cmisc.h +++ b/src/f5cmisc.h @@ -86,6 +86,7 @@ char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx, int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse, int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples); void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t write_samples, int8_t write_signal_index); +void emit_event_alignment_tsv_m6anet_header(FILE* fp, int8_t print_read_names, int8_t write_signal_index); void emit_sam_header(samFile* fp, const bam_hdr_t* hdr); char *emit_event_alignment_sam(char* read_name, bam_hdr_t* base_hdr, bam1_t* base_record, const std::vector& alignments, int8_t sam_out_version, diff --git a/src/meth_main.c b/src/meth_main.c index 1cc964dc..7d6401a8 100644 --- a/src/meth_main.c +++ b/src/meth_main.c @@ -601,7 +601,7 @@ int meth_main(int argc, char* argv[], int8_t mode) { } else if (paf_output){ //none } else if (m6anet_output){ - emit_event_alignment_tsv_header(stdout, print_read_names, 0, write_signal_index); + emit_event_alignment_tsv_m6anet_header(stdout, print_read_names, write_signal_index); } else{ emit_event_alignment_tsv_header(stdout, print_read_names, write_samples, write_signal_index); }