Skip to content

Commit

Permalink
use cigar instead
Browse files Browse the repository at this point in the history
  • Loading branch information
hasindu2008 committed Aug 18, 2024
1 parent 8a09a58 commit dcdb107
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 25 deletions.
229 changes: 206 additions & 23 deletions src/eventalign.c
Original file line number Diff line number Diff line change
Expand Up @@ -1263,6 +1263,32 @@ struct EventAlignmentParameters
};


static inline int32_t *get_ref_to_read_map(AlignedSegment& aligned_pairs, int32_t *ref_seg_len_ptr){

int32_t ref_start = aligned_pairs.front().ref_pos;
int32_t ref_end = aligned_pairs.back().ref_pos;
int32_t ref_seg_len = ref_end - ref_start + 1;
*ref_seg_len_ptr = ref_seg_len;
assert(ref_start < ref_end);
assert(ref_seg_len > 0);

int32_t *ref_to_read_map = (int32_t*)malloc(sizeof(int32_t)*ref_seg_len);
MALLOC_CHK(ref_to_read_map);
for(int32_t i=0; i<ref_seg_len; i++){
ref_to_read_map[i] = -1;
}
for(size_t i = 0; i < aligned_pairs.size(); i++){
int32_t ref_pos = aligned_pairs[i].ref_pos - ref_start;
int32_t read_pos = aligned_pairs[i].read_pos;
ref_to_read_map[ref_pos] = read_pos;
}

// for(int32_t i=0; i<ref_seg_len; i++){
// fprintf(stderr, "ref_to_read_map[%d] = %d\n",i+ref_start,ref_to_read_map[i]);
// }

return ref_to_read_map;
}

std::vector<event_alignment_t> align_read_to_ref(const EventAlignmentParameters& params, char *ref)
{
Expand Down Expand Up @@ -1320,6 +1346,9 @@ struct EventAlignmentParameters

AlignedSegment& aligned_pairs = aligned_segments[segment_idx];

int32_t ref_seg_len = 0;
int32_t *ref_to_read_map = get_ref_to_read_map(aligned_pairs, &ref_seg_len);

if(params.region_start != -1 && params.region_end != -1) {
//fprintf(stderr, "params.region_start = %d params.region_end = %d\n",params.region_start,params.region_end);
trim_aligned_pairs_to_ref_region(aligned_pairs, params.region_start, params.region_end);
Expand Down Expand Up @@ -1466,10 +1495,11 @@ struct EventAlignmentParameters

event_alignment_t ea;


ea.ref_position = curr_start_ref + as.kmer_idx;
std::string ref__kmer = ref_seq.substr(ea.ref_position - ref_offset, k);
kmer_cpy(ea.ref_kmer, ref__kmer.c_str(),k);
assert(as.kmer_idx < ref_seg_len - params.kmer_size + 1);
ea.read_position = ref_to_read_map[as.kmer_idx];

// event
ea.read_idx = params.read_idx;
Expand Down Expand Up @@ -1530,6 +1560,7 @@ struct EventAlignmentParameters
tester_i++;
} // for realignmentsegment
// fprintf(stderr, "tester_i = %d\n",tester_i);
free(ref_to_read_map);
} // for bam aligned segment
// fprintf(stderr, "alignment_output len = %d\n",alignment_output.size());
// assert(alignment_output.size() == 4451);
Expand Down Expand Up @@ -1660,6 +1691,20 @@ void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t w
fprintf(fp, "\n");
}

// Note: reference_kmer will be same as the model_kmer in this mode
// Also, event_level_mean and event_stdv are always scaled to the model (--scale-events is implicit)
void emit_event_alignment_tsv_m6anet_header(FILE* fp, int8_t print_read_names, int8_t write_signal_index)
{
fprintf(fp, "%s\t%s\t%s\t%s\t", "contig", "position", "reference_kmer",
(print_read_names? "read_name" : "read_index"));
fprintf(fp, "%s\t%s\t%s\t", "event_level_mean", "event_stdv", "event_length");

if(write_signal_index) {
fprintf(fp, "\t%s\t%s", "start_idx", "end_idx");
}

fprintf(fp, "\n");
}

typedef struct {
uint64_t start_raw;
Expand Down Expand Up @@ -2021,28 +2066,40 @@ std::vector<float> get_scaled_samples_for_event(const event_table* events,scalin
return out;
}

static inline int32_t *get_event_to_base_map(const event_table* et, index_pair_t *base_to_event_map, int32_t read_len, uint32_t kmer_size){
int32_t *event_to_base_map = (int32_t*)malloc(sizeof(int32_t)*et->n);
MALLOC_CHK(event_to_base_map);
for(size_t i=0; i<et->n; i++){
event_to_base_map[i] = -1;
}
int n_kmer = read_len - kmer_size + 1;
for(int i=0; i<n_kmer; i++){
if(base_to_event_map[i].start != -1){
for(int j=base_to_event_map[i].start; j<=base_to_event_map[i].stop; j++){
event_to_base_map[j] = i;
}
}
}
// static inline int32_t *get_event_to_base_map(const event_table* et, index_pair_t *base_to_event_map, int32_t read_len, uint32_t kmer_size){
// int32_t *event_to_base_map = (int32_t*)malloc(sizeof(int32_t)*et->n);
// MALLOC_CHK(event_to_base_map);
// for(size_t i=0; i<et->n; i++){
// event_to_base_map[i] = -1;
// }
// int n_kmer = read_len - kmer_size + 1;
// for(int i=0; i<n_kmer; i++){
// fprintf(stderr, "i: %d start: %d stop: %d\n", i, base_to_event_map[i].start, base_to_event_map[i].stop);
// if(base_to_event_map[i].start != -1){
// for(int j=base_to_event_map[i].start; j<=base_to_event_map[i].stop; j++){
// event_to_base_map[j] = i;
// }
// }
// }
// return event_to_base_map;
// }

return event_to_base_map;
// static inline void sprintf_read_kmer(kstring_t *sp, int32_t *event_to_base_map, uint64_t event_idx, char *read, int32_t read_len, uint32_t kmer_size){

// int32_t read_pos = event_to_base_map[event_idx];
// if(read_pos == -1){
// sprintf_append(sp, "\t.\t.");
// return;
// } else {
// char kmer[kmer_size+1];
// kmer_cpy(kmer, read+read_pos, kmer_size);
// sprintf_append(sp, "\t%d\t%s", read_pos, kmer);
// }
// }

}

static inline void sprintf_read_kmer(kstring_t *sp, int32_t *event_to_base_map, uint64_t event_idx, char *read, int32_t read_len, uint32_t kmer_size){
static inline void sprintf_read_kmer(kstring_t *sp, int32_t read_pos, char *read, int32_t read_len, uint32_t kmer_size){

int32_t read_pos = event_to_base_map[event_idx];
if(read_pos == -1){
sprintf_append(sp, "\t.\t.");
return;
Expand All @@ -2058,14 +2115,14 @@ char *emit_event_alignment_tsv(uint32_t strand_idx,
const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings,
const std::vector<event_alignment_t>& alignments,
int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse,
int64_t read_index, char* read_name, char *ref_name,float sample_rate, float *rawptr, int32_t read_len, char *read, index_pair_t *base_to_event_map)
int64_t read_index, char* read_name, char *ref_name,float sample_rate, float *rawptr, int32_t read_len, char *read)
{

kstring_t str;
kstring_t *sp = &str;
str_init(sp, sizeof(char)*alignments.size()*120);

int32_t *event_to_base_map = get_event_to_base_map(et, base_to_event_map, read_len, kmer_size);
//int32_t *event_to_base_map = get_event_to_base_map(et, base_to_event_map, read_len, kmer_size);

size_t n_collapse = 1;
for(size_t i = 0; i < alignments.size(); i+=n_collapse) {
Expand Down Expand Up @@ -2188,11 +2245,137 @@ char *emit_event_alignment_tsv(uint32_t strand_idx,
sample_str.resize(sample_str.size() - 1);
sprintf_append(sp, "\t%s", sample_str.c_str());
}
sprintf_read_kmer(sp, event_to_base_map, ea.event_idx, read, read_len, kmer_size);
sprintf_read_kmer(sp, ea.read_position, read, read_len, kmer_size);
sprintf_append(sp, "\n");
}

//free(event_to_base_map);

//str_free(sp); //freeing is later done in free_db_tmp()
return sp->s;
}

// algo:
// 1. Remove rows in eventalign.tsv where reference kmer does not match model kmer
// 2. For each row in eventalign.tsv, length of each event = end_idx - start_idx
// 3. Group by each read and each reference position
// - collapsed_event_level_mean = sum(event_level_mean * length) / sum(length)
// - collapsed_event_stdv = sum(event_stdv * length) / sum(length)
// - collapsed_event_duration = sum(event_duration * length) / sum(length)
//cleanup unused function param shit
char *emit_event_alignment_tsv_m6anet(uint32_t strand_idx,
const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings,
const std::vector<event_alignment_t>& alignments,
int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse,
int64_t read_index, char* read_name, char *ref_name,float sample_rate, float *rawptr)
{

kstring_t str;
kstring_t *sp = &str;
str_init(sp, sizeof(char)*alignments.size()*120);

size_t n_collapse = 1;
for(size_t i = 0; i < alignments.size(); i+=n_collapse) {

const event_alignment_t& ea = alignments[i];

// basic information
if (!print_read_names)
{
sprintf_append(sp, "%s\t%d\t%s\t%ld\t",
ref_name, //ea.ref_name.c_str(),
ea.ref_position,
ea.ref_kmer,
(long)read_index);
}
else
{
sprintf_append(sp, "%s\t%d\t%s\t%s\t",
ref_name, //ea.ref_name.c_str(),
ea.ref_position,
ea.ref_kmer,
read_name //sr.read_name.c_str(),
);
}

// event information
uint64_t length = 0;
float event_mean = 0;
float event_stdv = 0;
float event_duration = 0;

// if(strcmp(ea.ref_kmer,ea.model_kmer)==0){ //the ref and model kmers should match
// uint64_t len_curr = (uint64_t)((et->event)[ea.event_idx].length);
// length += len_curr;
// event_mean += get_fully_scaled_level((et->event)[ea.event_idx].mean,scalings) * len_curr;
// event_stdv += (et->event)[ea.event_idx].stdv * len_curr;
// event_duration += get_duration_seconds(et, ea.event_idx, sample_rate) * len_curr;
// }

// uint32_t rank = get_kmer_rank(ea.model_kmer, kmer_size);
// float model_mean = 0.0;
// float model_stdv = 0.0;
// // unscaled model parameters
// if(ea.hmm_state != 'B') {
// model_t model1 = model[rank];
// model_mean = model1.level_mean;
// model_stdv = model1.level_stdv;
// }

uint64_t start_idx = (et->event)[ea.event_idx].start; //inclusive
uint64_t end_idx = (et->event)[ea.event_idx].start + (uint64_t)((et->event)[ea.event_idx].length); //non-inclusive


n_collapse = 0;
while (i + n_collapse < alignments.size() && ea.ref_position == alignments[i+n_collapse].ref_position){
assert(strcmp(ea.ref_kmer,alignments[i+n_collapse].ref_kmer)==0);
// if(strcmp(ea.model_kmer,alignments[i+n_collapse].model_kmer)!=0){ //TODO: NNNN kmers must be handled
// fprintf(stderr, "model kmer does not match! %s vs %s\n",ea.model_kmer,alignments[i+n_collapse].model_kmer);
// }
const event_alignment_t& ea_curr = alignments[i+n_collapse];

if(strcmp(ea_curr.ref_kmer,ea_curr.model_kmer)==0){ //the ref and model kmers should match
uint64_t len_curr = (uint64_t)((et->event)[ea_curr.event_idx].length);
length += len_curr;
event_mean += get_fully_scaled_level((et->event)[ea_curr.event_idx].mean,scalings) * len_curr;
event_stdv += (et->event)[ea_curr.event_idx].stdv * len_curr;
event_duration += get_duration_seconds(et, ea_curr.event_idx, sample_rate) * len_curr;
}

n_collapse++;
}
event_mean /= length;
event_stdv /= length;
event_duration /= length;


// float standard_level = (event_mean - model_mean) / (sqrt(scalings.var) * model_stdv);
sprintf_append(sp, "%.2f\t%.3f\t%.5f\t", event_mean, event_stdv, event_duration);
// sprintf_append(sp, "%s\t%.2f\t%.2f\t%.2f", ea.model_kmer,
// model_mean,
// model_stdv,
// standard_level);

if(write_signal_index) {
if(n_collapse > 1){
uint64_t start_idx1 = start_idx;
uint64_t end_idx1 = end_idx;

const event_alignment_t& ea2 = alignments[i+n_collapse-1];
uint64_t start_idx2 = (et->event)[ea2.event_idx].start;
uint64_t end_idx2 = (et->event)[ea2.event_idx].start + (uint64_t)((et->event)[ea2.event_idx].length);

//min
start_idx = start_idx1 < start_idx2 ? start_idx1 : start_idx2;
//max
end_idx = end_idx1 > end_idx2 ? end_idx1 : end_idx2;
}
sprintf_append(sp, "\t%lu\t%lu", start_idx, end_idx);
}

sprintf_append(sp, "\n");
}

free(event_to_base_map);

//str_free(sp); //freeing is later done in free_db_tmp()
return sp->s;
Expand Down
2 changes: 1 addition & 1 deletion src/f5c.c
Original file line number Diff line number Diff line change
Expand Up @@ -874,7 +874,7 @@ void eventalign_single(core_t* core, db_t* db, int32_t i){
db->event_alignment_result_str[i] = emit_event_alignment_sam(qname, core->m_hdr, db->bam_rec[i], *event_alignment_result, sam_out_version, &(db->et[i]), db->sig[i]->nsample, ref_len, rna, db->scalings[i]);
} else {
db->event_alignment_result_str[i] = emit_event_alignment_tsv(0,&(db->et[i]),core->model,core->kmer_size, db->scalings[i],*event_alignment_result, print_read_names, scale_events, write_samples, write_signal_index, collapse_events,
db->read_idx[i], qname, contig, db->sig[i]->sample_rate, db->sig[i]->rawptr, db->read_len[i], db->read[i], db->base_to_event_map[i]);
db->read_idx[i], qname, contig, db->sig[i]->sample_rate, db->sig[i]->rawptr, db->read_len[i], db->read[i]);
}
}

Expand Down
1 change: 1 addition & 0 deletions src/f5c.h
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ typedef struct {
//int32_t strand_idx;
int32_t event_idx;
bool rc;
int32_t read_position;

// hmm data
char model_kmer[MAX_KMER_SIZE + 1];
Expand Down
2 changes: 1 addition & 1 deletion src/f5cmisc.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ char *emit_event_alignment_tsv(uint32_t strand_idx,
const event_table* et, model_t* model, uint32_t kmer_size, scalings_t scalings,
const std::vector<event_alignment_t>& alignments,
int8_t print_read_names, int8_t scale_events, int8_t write_samples, int8_t write_signal_index, int8_t collapse,
int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples, int32_t read_len, char *read, index_pair_t *base_to_event_map);
int64_t read_index, char* read_name, char *ref_name, float sample_rate, float *samples, int32_t read_len, char *read);
void emit_event_alignment_tsv_header(FILE* fp, int8_t print_read_names, int8_t write_samples, int8_t write_signal_index);
void emit_sam_header(samFile* fp, const bam_hdr_t* hdr);
char *emit_event_alignment_sam(char* read_name, bam_hdr_t* base_hdr, bam1_t* base_record,
Expand Down

0 comments on commit dcdb107

Please sign in to comment.