Skip to content

Commit

Permalink
Merge pull request #171 from hasindu2008/collapse2
Browse files Browse the repository at this point in the history
Collapse2
  • Loading branch information
hasindu2008 authored Aug 4, 2024
2 parents 96dec53 + e16132a commit 69a4741
Show file tree
Hide file tree
Showing 5 changed files with 170 additions and 0 deletions.
140 changes: 140 additions & 0 deletions src/eventalign.c
Original file line number Diff line number Diff line change
Expand Up @@ -1659,6 +1659,20 @@ void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t w
fprintf(fp, "\n");
}

// Note: reference_kmer will be same as the model_kmer in this mode
// Also, event_level_mean and event_stdv are always scaled to the model (--scale-events is implicit)
void emit_event_alignment_tsv_m6anet_header(FILE* fp, int8_t print_read_names, int8_t write_signal_index)
{
fprintf(fp, "%s\t%s\t%s\t%s\t", "contig", "position", "reference_kmer",
(print_read_names? "read_name" : "read_index"));
fprintf(fp, "%s\t%s\t%s\t", "event_level_mean", "event_stdv", "event_length");

if(write_signal_index) {
fprintf(fp, "\t%s\t%s", "start_idx", "end_idx");
}

fprintf(fp, "\n");
}

typedef struct {
uint64_t start_raw;
Expand Down Expand Up @@ -2161,6 +2175,132 @@ char *emit_event_alignment_tsv(uint32_t strand_idx,
return sp->s;
}

// algo:
// 1. Remove rows in eventalign.tsv where reference kmer does not match model kmer
// 2. For each row in eventalign.tsv, length of each event = end_idx - start_idx
// 3. Group by each read and each reference position
// - collapsed_event_level_mean = sum(event_level_mean * length) / sum(length)
// - collapsed_event_stdv = sum(event_stdv * length) / sum(length)
// - collapsed_event_duration = sum(event_duration * length) / sum(length)
//cleanup unused function param shit
char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx,
const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings,
const std::vector<event_alignment_t>& alignments,
int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse,
int64_t read_index, char* read_name, char *ref_name,float sample_rate, float *rawptr)
{

kstring_t str;
kstring_t *sp = &str;
str_init(sp, sizeof(char)*alignments.size()*120);

size_t n_collapse = 1;
for(size_t i = 0; i < alignments.size(); i+=n_collapse) {

const event_alignment_t& ea = alignments[i];

// basic information
if (!print_read_names)
{
sprintf_append(sp, "%s\t%d\t%s\t%ld\t",
ref_name, //ea.ref_name.c_str(),
ea.ref_position,
ea.ref_kmer,
(long)read_index);
}
else
{
sprintf_append(sp, "%s\t%d\t%s\t%s\t",
ref_name, //ea.ref_name.c_str(),
ea.ref_position,
ea.ref_kmer,
read_name //sr.read_name.c_str(),
);
}

// event information
uint64_t length = 0;
float event_mean = 0;
float event_stdv = 0;
float event_duration = 0;

// if(strcmp(ea.ref_kmer,ea.model_kmer)==0){ //the ref and model kmers should match
// uint64_t len_curr = (uint64_t)((et->event)[ea.event_idx].length);
// length += len_curr;
// event_mean += get_fully_scaled_level((et->event)[ea.event_idx].mean,scalings) * len_curr;
// event_stdv += (et->event)[ea.event_idx].stdv * len_curr;
// event_duration += get_duration_seconds(et, ea.event_idx, sample_rate) * len_curr;
// }

// uint32_t rank = get_kmer_rank(ea.model_kmer, kmer_size);
// float model_mean = 0.0;
// float model_stdv = 0.0;
// // unscaled model parameters
// if(ea.hmm_state != 'B') {
// model_t model1 = model[rank];
// model_mean = model1.level_mean;
// model_stdv = model1.level_stdv;
// }

uint64_t start_idx = (et->event)[ea.event_idx].start; //inclusive
uint64_t end_idx = (et->event)[ea.event_idx].start + (uint64_t)((et->event)[ea.event_idx].length); //non-inclusive


n_collapse = 0;
while (i + n_collapse < alignments.size() && ea.ref_position == alignments[i+n_collapse].ref_position){
assert(strcmp(ea.ref_kmer,alignments[i+n_collapse].ref_kmer)==0);
// if(strcmp(ea.model_kmer,alignments[i+n_collapse].model_kmer)!=0){ //TODO: NNNN kmers must be handled
// fprintf(stderr, "model kmer does not match! %s vs %s\n",ea.model_kmer,alignments[i+n_collapse].model_kmer);
// }
const event_alignment_t& ea_curr = alignments[i+n_collapse];

if(strcmp(ea_curr.ref_kmer,ea_curr.model_kmer)==0){ //the ref and model kmers should match
uint64_t len_curr = (uint64_t)((et->event)[ea_curr.event_idx].length);
length += len_curr;
event_mean += get_fully_scaled_level((et->event)[ea_curr.event_idx].mean,scalings) * len_curr;
event_stdv += (et->event)[ea_curr.event_idx].stdv * len_curr;
event_duration += get_duration_seconds(et, ea_curr.event_idx, sample_rate) * len_curr;
}

n_collapse++;
}
event_mean /= length;
event_stdv /= length;
event_duration /= length;


// float standard_level = (event_mean - model_mean) / (sqrt(scalings.var) * model_stdv);
sprintf_append(sp, "%.2f\t%.3f\t%.5f\t", event_mean, event_stdv, event_duration);
// sprintf_append(sp, "%s\t%.2f\t%.2f\t%.2f", ea.model_kmer,
// model_mean,
// model_stdv,
// standard_level);

if(write_signal_index) {
if(n_collapse > 1){
uint64_t start_idx1 = start_idx;
uint64_t end_idx1 = end_idx;

const event_alignment_t& ea2 = alignments[i+n_collapse-1];
uint64_t start_idx2 = (et->event)[ea2.event_idx].start;
uint64_t end_idx2 = (et->event)[ea2.event_idx].start + (uint64_t)((et->event)[ea2.event_idx].length);

//min
start_idx = start_idx1 < start_idx2 ? start_idx1 : start_idx2;
//max
end_idx = end_idx1 > end_idx2 ? end_idx1 : end_idx2;
}
sprintf_append(sp, "\t%lu\t%lu", start_idx, end_idx);
}

sprintf_append(sp, "\n");
}


//str_free(sp); //freeing is later done in free_db_tmp()
return sp->s;
}


char *emit_event_alignment_paf(const event_table* et, int64_t len_raw_signal, int64_t ref_len, uint32_t kmer_size, scalings_t scalings,
const std::vector<event_alignment_t>& alignments, bam1_t* bam_record, char* read_name, char *ref_name, int8_t rna)
Expand Down
4 changes: 4 additions & 0 deletions src/f5c.c
Original file line number Diff line number Diff line change
Expand Up @@ -863,6 +863,7 @@ void eventalign_single(core_t* core, db_t* db, int32_t i){
int8_t write_signal_index = (core->opt.flag & F5C_PRINT_SIGNAL_INDEX) ? 1 : 0;
int8_t sam_output = (core->opt.flag & F5C_SAM) ? 1 : 0;
int8_t paf_output = (core->opt.flag & F5C_PAF) ? 1 : 0;
int8_t m6anet_output = (core->opt.flag & F5C_M6ANET) ? 1 : 0;
int8_t rna = (core->opt.flag & F5C_RNA) ? 1 : 0;

if(paf_output){
Expand All @@ -872,6 +873,9 @@ void eventalign_single(core_t* core, db_t* db, int32_t i){
int8_t sam_out_version = core->opt.sam_out_version;
int64_t ref_len = core->m_hdr->target_len[db->bam_rec[i]->core.tid];
db->event_alignment_result_str[i] = emit_event_alignment_sam(qname, core->m_hdr, db->bam_rec[i], *event_alignment_result, sam_out_version, &(db->et[i]), db->sig[i]->nsample, ref_len, rna, db->scalings[i]);
} else if (m6anet_output){
db->event_alignment_result_str[i] = emit_event_alignment_tsv_m6anet(0,&(db->et[i]),core->model,core->kmer_size, db->scalings[i],*event_alignment_result, print_read_names, scale_events, write_samples, write_signal_index, collapse_events,
db->read_idx[i], qname, contig, db->sig[i]->sample_rate, db->sig[i]->rawptr);
} else {
db->event_alignment_result_str[i] = emit_event_alignment_tsv(0,&(db->et[i]),core->model,core->kmer_size, db->scalings[i],*event_alignment_result, print_read_names, scale_events, write_samples, write_signal_index, collapse_events,
db->read_idx[i], qname, contig, db->sig[i]->sample_rate, db->sig[i]->rawptr);
Expand Down
1 change: 1 addition & 0 deletions src/f5c.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
#define F5C_COLLAPSE_EVENTS 0x20000 //collapse events
#define F5C_R10 0x40000 //r10
#define F5C_PAF 0x80000 //paf (eventalign only)
#define F5C_M6ANET 0x100000 //m6anet (eventalign only)

/*************************************************************
* flags for a read status (related to db_t->read_stat_flag) *
Expand Down
6 changes: 6 additions & 0 deletions src/f5cmisc.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,13 @@ char *emit_event_alignment_tsv(uint32_t strand_idx,
const std::vector<event_alignment_t>& alignments,
int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse,
int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples);
char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx,
const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings,
const std::vector<event_alignment_t>& alignments,
int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse,
int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples);
void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t write_samples, int8_t write_signal_index);
void emit_event_alignment_tsv_m6anet_header(FILE* fp, int8_t print_read_names, int8_t write_signal_index);
void emit_sam_header(samFile* fp, const bam_hdr_t* hdr);
char *emit_event_alignment_sam(char* read_name, bam_hdr_t* base_hdr, bam1_t* base_record,
const std::vector<event_alignment_t>& alignments, int8_t sam_out_version,
Expand Down
19 changes: 19 additions & 0 deletions src/meth_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ static struct option long_options[] = {
{"pore",required_argument,0,0}, //46 pore
{"paf",no_argument,0,'c'}, //47 if print in paf format (only for eventalign)
{"sam-out-version",required_argument,0,0}, //48 specify the version of the sam output for eventalign (eventalign only)
{"m6anet",no_argument,0,0}, //49 m6anet output (eventalign only)
{0, 0, 0, 0}};


Expand Down Expand Up @@ -447,6 +448,12 @@ int meth_main(int argc, char* argv[], int8_t mode) {
exit(EXIT_FAILURE);
}
is_sam_out_version_set = 1;
} else if (c == 0 && longindex == 49){ //m6anet output
if(mode!=1){
ERROR("%s","Option --m6anet is available only in eventalign");
exit(EXIT_FAILURE);
}
yes_or_no(&opt, F5C_M6ANET, longindex, "yes", 1);
}
}

Expand Down Expand Up @@ -513,6 +520,7 @@ int meth_main(int argc, char* argv[], int8_t mode) {
fprintf(fp_help," --paf write output in PAF format\n");
fprintf(fp_help," --sam write output in SAM format\n");
fprintf(fp_help," --sam-out-version INT sam output version (set 1 to revert to old nanopolish style format) [%d]\n",opt.meth_out_version);
fprintf(fp_help," --m6anet write output in m6anet format\n");

fprintf(fp_help," --print-read-names print read names instead of indexes\n");
fprintf(fp_help," --scale-events scale events to the model, rather than vice-versa\n");
Expand Down Expand Up @@ -573,16 +581,27 @@ int meth_main(int argc, char* argv[], int8_t mode) {
int8_t write_signal_index = (core->opt.flag & F5C_PRINT_SIGNAL_INDEX) ? 1 : 0;
int8_t sam_output = (core->opt.flag & F5C_SAM) ? 1 : 0 ;
int8_t paf_output = (core->opt.flag & F5C_PAF) ? 1 : 0 ;
int8_t m6anet_output = (core->opt.flag & F5C_M6ANET) ? 1 : 0 ;

if(sam_output && paf_output){
ERROR("%s","-c and --sam cannot be used together");
exit(EXIT_FAILURE);
}
if(sam_output && m6anet_output){
ERROR("%s","-c and --m6anet cannot be used together");
exit(EXIT_FAILURE);
}
if(paf_output && m6anet_output){
ERROR("%s","--paf and --m6anet cannot be used together");
exit(EXIT_FAILURE);
}

if(sam_output){
emit_sam_header(core->sam_output, core->m_hdr);
} else if (paf_output){
//none
} else if (m6anet_output){
emit_event_alignment_tsv_m6anet_header(stdout, print_read_names, write_signal_index);
} else{
emit_event_alignment_tsv_header(stdout, print_read_names, write_samples, write_signal_index);
}
Expand Down

0 comments on commit 69a4741

Please sign in to comment.