diff --git a/Additional_src/Modified_hifiasm/.github/workflows/ci.yaml b/Additional_src/Modified_hifiasm/.github/workflows/ci.yaml deleted file mode 100644 index 3bbd88d..0000000 --- a/Additional_src/Modified_hifiasm/.github/workflows/ci.yaml +++ /dev/null @@ -1,21 +0,0 @@ -name: CI - -on: - push: - branches: - - master - pull_request: - -jobs: - build: - runs-on: ubuntu-latest - strategy: - matrix: - compiler: [gcc] - - steps: - - name: Checkout minimap2 - uses: actions/checkout@v2 - - - name: Compile with ${{ matrix.compiler }} - run: make CC=${{ matrix.compiler }} diff --git a/Additional_src/Modified_hifiasm/CommandLines.h b/Additional_src/Modified_hifiasm/CommandLines.h index f8c91cc..2c56e16 100644 --- a/Additional_src/Modified_hifiasm/CommandLines.h +++ b/Additional_src/Modified_hifiasm/CommandLines.h @@ -5,7 +5,7 @@ #include #include -#define HA_VERSION "0.19.3-r572" +#define HA_VERSION "0.19.5-r587" #define VERBOSE 0 diff --git a/Additional_src/Modified_hifiasm/Overlaps.cpp b/Additional_src/Modified_hifiasm/Overlaps.cpp index 038b495..00ff448 100644 --- a/Additional_src/Modified_hifiasm/Overlaps.cpp +++ b/Additional_src/Modified_hifiasm/Overlaps.cpp @@ -59,12 +59,21 @@ KRADIX_SORT_INIT(u_trans_ts, u_trans_t, u_trans_ts_key, member_size(u_trans_t, t KRADIX_SORT_INIT(ha_mzl_t_srt1, ha_mzl_t, ha_mzl_t_key, member_size(ha_mzl_t, x)) #define UL_COV_THRES 2 +#define PHASE_SEP 64 +#define PHASE_SEF 2 +#define PHASE_SEP_RATE 0.04 +#define PHASE_MISS_LEN 1000000 +#define PHASE_MISS_N 8 KSORT_INIT_GENERIC(uint32_t) void reduce_hamming_error_adv(ma_ug_t *iug, asg_t *sg, ma_hit_t_alloc* sources, ma_sub_t *coverage_cut, int max_hang, int min_ovlp, long long gap_fuzz, R_to_U *ru, bubble_type* bub); void print_vw_edge(asg_t *sg, uint32_t vid, uint32_t wid, const char *cmd); +void output_trio_graph_joint(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, +ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, +long long 
stops_threshold, R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, +int min_ovlp, long long gap_fuzz, bub_label_t* b_mask_t, ma_ug_t **rhu0, ma_ug_t **rhu1); typedef struct { uint32_t d, tot, ma, p; @@ -99,12 +108,45 @@ typedef struct { asg64_v *rr; } rd_hamming_t; +typedef struct { + asg_t *ref; + asg_t *nsg; + ma_ug_t *nug; + uint32_t *o2n; + uint64_t *ugh; + asg64_v *srt; +} rd_hamming_fly_t; + +typedef struct { + // asg_t *ref; + // asg_t *ng; + ma_hit_t_alloc* src; + ma_sub_t *cov; + int32_t max_hang; + int32_t min_ovlp; + int32_t gap_fuzz; + asg32_v *srt; + uint8_t *vs; + // uint32_t *rs; + ma_ug_t *fg; + kvec_asg_arc_t_warp *ae; + uint32_t n_insert; +} rd_hamming_fly_simp_t; + +typedef struct { + ma_ug_t *ug; + asg_t *rg; + uint32_t *ridx; + uint64_t *ra; + uint64_t ridx_n, ra_n; +} dedup_idx_t; + ///this value has been updated at the first line of build_string_graph_without_clean long long min_thres; uint32_t print_untig_by_read(ma_ug_t *g, const char* name, uint32_t in, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, const char* info); -int asg_pop_bubble_primary_trio(ma_ug_t *ug, uint64_t* i_max_dist, uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, uint32_t is_update_chain); +int asg_pop_bubble_primary_trio(ma_ug_t *ug, uint64_t* i_max_dist, uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, uint32_t is_update_chain, rd_hamming_fly_simp_t *p); kv_u_trans_t *get_utg_ovlp(ma_ug_t **ug, asg_t* read_g, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, ma_sub_t* coverage_cut, R_to_U* ruIndex, int max_hang, int min_ovlp, kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t, uint8_t* r_het); void delete_useless_nodes(ma_ug_t **ug); @@ -8615,6 +8657,236 @@ ma_ug_t *ma_ug_gen(asg_t *g) return ug; } +ma_ug_t *ma_ug_gen_phase(asg_t *g, uint32_t min_occ, double cutoff) +{ + // fprintf(stderr, "\n-0-[M::%s] min_occ::%u, cutoff::%f\n", __func__, min_occ, cutoff); + 
asg_cleanup(g); + int32_t *mark; + uint32_t i, v, n_vtx = g->n_seq * 2, fn, mn, fn0, mn0, fn1, mn1, n1, cn, k, st, pt, ct, sz, ez; + uint64_t z; + ///is a queue + kdq_t(uint64_t) *q; + asg64_v uidx; kv_init(uidx); + + ma_ug_t *ug; + + ug = (ma_ug_t*)calloc(1, sizeof(ma_ug_t)); + ug->g = asg_init(); + ///each node has two directions + mark = (int32_t*)calloc(n_vtx, 4); + + q = kdq_init(uint64_t); + for (v = 0; v < n_vtx; ++v) { + uint32_t w, x, l, start, end, len; + ma_utg_t *p; + if (g->seq[v>>1].del || mark[v]) continue; + if (arc_cnt(g, v) == 0 && arc_cnt(g, (v^1)) != 0) continue; + mark[v] = 1; + q->count = 0, start = v, end = v^1, len = 0; fn = mn = 0; + // forward + w = v; + while (1) { + /** + * w----->x + * w<-----x + * that means the only suffix of w is x, and the only prefix of x is w + **/ + if (arc_cnt(g, w) != 1) break; + x = arc_first(g, w).v; // w->x + if (arc_cnt(g, x^1) != 1) break; + /** + * another direction of w would be marked as used (since w has been used) + **/ + mark[x] = mark[w^1] = 1; + ///l is the edge length, instead of overlap length + ///note: edge length is different with overlap length + l = asg_arc_len(arc_first(g, w)); + kdq_push(uint64_t, q, (uint64_t)w<<32 | l); + if(R_INF.trio_flag[w>>1] == FATHER) fn++; + if(R_INF.trio_flag[w>>1] == MOTHER) mn++; + end = x^1, len += l; + w = x; + if (x == v) break; + } + if (start != (end^1) || kdq_size(q) == 0) { // linear unitig + ///length of seq, instead of edge + l = g->seq[end>>1].len; + kdq_push(uint64_t, q, (uint64_t)(end^1)<<32 | l); + if(R_INF.trio_flag[end>>1] == FATHER) fn++; + if(R_INF.trio_flag[end>>1] == MOTHER) mn++; + len += l; + } else { // circular unitig + start = end = UINT32_MAX; + goto add_unitig; // then it is not necessary to do the backward + } + // backward + x = v; + while (1) { // similar to forward but not the same + if (arc_cnt(g, x^1) != 1) break; + w = arc_first(g, x^1).v ^ 1; // w->x + if (arc_cnt(g, w) != 1) break; + mark[x] = mark[w^1] = 1; + l = 
asg_arc_len(arc_first(g, w)); + ///w is the seq id + direction, l is the length of edge + ///push element to the front of a queue + kdq_unshift(uint64_t, q, (uint64_t)w<<32 | l); + if(R_INF.trio_flag[w>>1] == FATHER) fn++; + if(R_INF.trio_flag[w>>1] == MOTHER) mn++; + // fprintf(stderr, "uId: %u, >%.*s (%u)\n", + // ug->u.n, (int)Get_NAME_LENGTH((R_INF), w>>1), Get_NAME((R_INF), w>>1), w>>1); + + start = w, len += l; + x = w; + } +add_unitig: + if (start != UINT32_MAX) mark[start] = mark[end] = 1; + // fprintf(stderr, "\n-0-[M::%s] fn::%u, mn::%u\n", __func__, fn, mn); + cn = MIN(fn, mn); + // if((cn > min_occ) && (cn > ((fn+mn)*cutoff))) + if((cn <= ((fn+mn)*cutoff)) || (cn <= min_occ)) { + kv_pushp(ma_utg_t, ug->u, &p); + p->s = 0, p->start = start, p->end = end, p->len = len, p->n = kdq_size(q), p->circ = (start == UINT32_MAX); + p->m = p->n; + kv_roundup32(p->m); + p->a = (uint64_t*)malloc(8 * p->m); + //all elements are saved here + for (i = 0; i < kdq_size(q); ++i) p->a[i] = kdq_at(q, i); + } else if(kdq_size(q)) { + ct = R_INF.trio_flag[kdq_at(q, 0)>>33]; + if((ct != FATHER) && (ct != MOTHER)) ct = AMBIGU; + pt = ct; + fn0 = fn; mn0 = mn; uidx.n = fn = mn = 0; + if(ct == FATHER) fn++; if(ct == MOTHER) mn++; + for (k = 1, l = 0; k <= kdq_size(q); k++) { + st = 0; ct = AMBIGU; + if(k == kdq_size(q)) { + st = 1; + } else { + ct = R_INF.trio_flag[kdq_at(q, k)>>33]; + if((ct != FATHER) && (ct != MOTHER)) ct = AMBIGU; + if((ct != AMBIGU) && (pt != AMBIGU) && (ct != pt)) { + st = 1; + } + } + if(st) { + // fprintf(stderr, "-1-[M::%s] l::%u, k::%u, kdq_size(q)::%u, fn::%u, mn::%u, ct::%u, pt::%u\n", + // __func__, l, k, (uint32_t)kdq_size(q), fn, mn, ct, pt); + if(k < kdq_size(q)) { + assert(fn || mn); assert((!fn) || (!mn)); + } + z = l<<1; z |= (((uint64_t)MAX(fn, mn))<<32); + if(mn) z |= 1; + kv_push(uint64_t, uidx, z); + fn = mn = 0; l = k; + } + if(ct != AMBIGU) pt = ct; + if(ct == FATHER) fn++; if(ct == MOTHER) mn++; + } + + + fn = mn = 0; fn1 = mn1 = n1 = 0; 
+ if(uidx.a[0]&1) mn += uidx.a[0]>>32; + else fn += uidx.a[0]>>32; + for (k = 1, l = 0; k <= uidx.n; k++) { + st = 0; + if(k == uidx.n) { + st = 1; + } else { + if(uidx.a[k]&1) mn += uidx.a[k]>>32; + else fn += uidx.a[k]>>32; + cn = MIN(fn, mn); + if((cn > min_occ) && (cn > ((fn+mn)*cutoff))) st = 1; + // fprintf(stderr, "-2-[M::%s] fn::%u, mn::%u, cn::%u, ((fn+mn)*cutoff)::%u, st::%u\n", + // __func__, fn, mn, cn, (uint32_t)(((fn+mn)*cutoff)), st); + } + if(st) { + // fprintf(stderr, "-3-[M::%s] fn::%u, mn::%u\n", __func__, fn, mn); + sz = ((uint32_t)uidx.a[l])>>1; + ez = ((k>1):(kdq_size(q))); + assert(ez > sz); n1 += ez - sz; + kv_pushp(ma_utg_t, ug->u, &p); + if ((start == UINT32_MAX) && (sz == 0) && (ez == kdq_size(q))) {///circle + p->s = 0, p->start = start, p->end = end, p->len = len, p->n = kdq_size(q), p->circ = (start == UINT32_MAX); + p->m = p->n; + kv_roundup32(p->m); + p->a = (uint64_t*)malloc(8 * p->m); + //all elements are saved here + for (i = 0; i < kdq_size(q); ++i) { + p->a[i] = kdq_at(q, i); + ct = R_INF.trio_flag[p->a[i]>>33]; + if((ct != FATHER) && (ct != MOTHER)) ct = AMBIGU; + if(ct == FATHER) fn1++; if(ct == MOTHER) mn1++; + } + } else { + p->s = 0; p->len = 0; p->circ = 0; + p->start = kdq_at(q, sz)>>32; + p->end = (kdq_at(q, (ez-1))>>32)^1; + p->m = p->n = ez - sz; kv_roundup32(p->m); + p->a = (uint64_t*)malloc(8 * p->m); + + for (i = sz, z = 0; i+1 < ez; i++, z++) { + p->a[z] = kdq_at(q, i); p->len += (uint32_t)p->a[z]; + ct = R_INF.trio_flag[p->a[z]>>33]; + if((ct != FATHER) && (ct != MOTHER)) ct = AMBIGU; + if(ct == FATHER) fn1++; if(ct == MOTHER) mn1++; + } + p->a[z] = kdq_at(q, i); p->a[z] >>= 32; p->a[z] <<= 32; + p->a[z] |= g->seq[p->a[z]>>33].len; p->len += (uint32_t)p->a[z]; + ct = R_INF.trio_flag[p->a[z]>>33]; + if((ct != FATHER) && (ct != MOTHER)) ct = AMBIGU; + if(ct == FATHER) fn1++; if(ct == MOTHER) mn1++; + } + fn = mn = 0; l = k; + if(k < uidx.n) { + if(uidx.a[k]&1) mn += uidx.a[k]>>32; + else fn += uidx.a[k]>>32; + } + } 
+ } + assert(n1 == kdq_size(q)); + assert(fn1 == fn0); assert(mn1 == mn0); + } + } + kdq_destroy(uint64_t, q); kv_destroy(uidx); + + // add arcs between unitigs; reusing mark for a different purpose + //ug saves all unitigs + for (v = 0; v < n_vtx; ++v) mark[v] = -1; + + //mark all start nodes and end nodes of all unitigs + for (i = 0; i < ug->u.n; ++i) { + if (ug->u.a[i].circ) continue; + mark[ug->u.a[i].start] = i<<1 | 0; + mark[ug->u.a[i].end] = i<<1 | 1; + } + + //scan all edges + for (i = 0; i < g->n_arc; ++i) { + asg_arc_t *p = &g->arc[i]; + if (p->del) continue; + ///to connect two unitigs, we need to connect the end of unitig x to the start of unitig y + ///so we need to ^1 to get the reverse direction of (x's end)? + ///>=0 means this node is a start/end node of an unitig + ///means this node is a intersaction node + if (mark[p->ul>>32^1] >= 0 && mark[p->v] >= 0) { + asg_arc_t *q; + uint32_t u = mark[p->ul>>32^1]^1; + int l = ug->u.a[u>>1].len - p->ol; + if (l < 0) l = 1; + q = asg_arc_pushp(ug->g); + q->ol = p->ol, q->del = 0; + q->ul = (uint64_t)u<<32 | l; + q->v = mark[p->v]; q->ou = 0; + q->el = p->el; + } + } + for (i = 0; i < ug->u.n; ++i) + asg_seq_set(ug->g, i, ug->u.a[i].len, 0); + asg_cleanup(ug->g); + free(mark); + return ug; +} + ma_ug_t *ma_ug_gen_primary(asg_t *g, uint8_t flag) { asg_cleanup(g); @@ -13330,7 +13602,7 @@ ma_hit_t_alloc* sources, R_to_U* ruIndex, int max_hang, int min_ovlp, kvec_asg_a FILE* output_file = fopen(gfa_name, "w"); ma_ug_print(ug, sg, coverage_cut, sources, ruIndex, "ptg", output_file); fclose(output_file); - + if(asm_opt.make_only_primary_contigs == 1) { fprintf(stderr, "Modified_hifiasm finishes because the user has provided the option \"--only-primary\""); @@ -13811,7 +14083,7 @@ void debug_hapS(uint32_t *hapS, uint32_t rn) } void output_poly_trio(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long 
stops_threshold, -R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, +R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int gap_fuzz, int is_bench, bub_label_t* b_mask_t, uint32_t hapN) { uint32_t i; @@ -13822,7 +14094,7 @@ bub_label_t* b_mask_t, uint32_t hapN) update_poly_trio(1<n_seq); sprintf(fp, "hap%u", i+1); output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, tipsLen, tip_drop_ratio, - stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, is_bench, b_mask_t, fp, NULL, NULL); + stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, gap_fuzz, is_bench, b_mask_t, fp, NULL, NULL); } free(fp); free(hapS); } @@ -15214,11 +15486,10 @@ void purge_ovlp_cov_adv(uint32_t id, kv_u_trans_t *ta, asg64_v *b64, ug_rid_cov_ } } -void trans_sec_cut_filter_mmhap_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *sg, ma_hit_t_alloc* src) +void trans_sec_cut_filter_mmhap_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *sg, ma_hit_t_alloc* src, ug_rid_cov_t *in) { - uint64_t k; - asg64_v b64; kv_init(b64); - ug_rid_cov_t *cc = gen_ug_rid_cov_t(ug, sg, src); + uint64_t k; asg64_v b64; kv_init(b64); + ug_rid_cov_t *cc = ((in)?(in):(gen_ug_rid_cov_t(ug, sg, src))); fprintf(stderr, "+[M::%s]\thom_cov::%lu\thet_cov::%lu\thom_cut::%lu\n", __func__, cc->hom_cov, cc->het_cov, cc->hom_max); @@ -15227,7 +15498,7 @@ void trans_sec_cut_filter_mmhap_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *sg, ma purge_ovlp_cov_adv(k, ta, &b64, cc, cc->hom_max); } - destory_ug_rid_cov_t(cc); free(cc); kv_destroy(b64); + if(!in) {destory_ug_rid_cov_t(cc); free(cc);} kv_destroy(b64); } static void worker_for_trans_sec_simple_cut(void *data, long i, int tid) // callback for kt_for() @@ -15257,8 +15528,18 @@ static void worker_for_trans_sec_simple_cut(void *data, long i, int tid) // call } } +void gen_ug_rid_cov_t_by_ovlp(kv_u_trans_t *ta, ug_rid_cov_t *cc) +{ + uint64_t 
z, k, id, n; u_trans_t *a; + for (z = 0; z < cc->ug->g->n_seq; z++) { + id = z; a = u_trans_a(*ta, id); n = u_trans_n(*ta, id); + for (k = 0; k < n; k++) { + append_cov_line_ug_rid_cov_t(id, cc->cov.a+cc->idx[id], &(a[k]), cc, ((uint64_t)-1), -1); + } + } +} -void clean_u_trans_t_idx_filter_mmhap_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g, ma_hit_t_alloc* src) +void clean_u_trans_t_idx_filter_mmhap_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g, ma_hit_t_alloc* src, ug_rid_cov_t *in) { u_trans_clean_t sl; uint64_t k, i, l, st, occ; ha_mzl_t *tz; ha_mzl_v srt_a; kv_u_trans_t *bl; u_trans_t *z; @@ -15324,7 +15605,7 @@ void clean_u_trans_t_idx_filter_mmhap_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t * free(sl.res); kv_destroy(srt_a); kt_u_trans_t_idx(ta, ug->g->n_seq); // dbg_prt_utg_trans(ta, ug, "after"); - trans_sec_cut_filter_mmhap_adv(ta, ug, read_g, src); + trans_sec_cut_filter_mmhap_adv(ta, ug, read_g, src, in); kt_for(sl.n_thread, worker_for_trans_sec_simple_cut, &sl, sl.ta->idx.n); // CALLOC(sl.srt, sl.n_thread); sl.sec_rate = 0.5; @@ -15344,6 +15625,7 @@ void clean_u_trans_t_idx_filter_mmhap_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t * st = i; } } + // dbg_prt_utg_extra_trans(ta, ug, asm_opt.output_file_name); } @@ -15471,13 +15753,16 @@ long long gap_fuzz, bub_label_t* b_mask_t, ug_opt_t *opt) kv_destroy(d_edges.a); asg_cleanup(sg); - // reduce_hamming_error(sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz); - reduce_hamming_error_adv(NULL, sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz, opt->ruIndex, NULL); + // reduce_hamming_error_adv(NULL, sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz, opt->ruIndex, NULL); - ug_fa = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL, NULL, NULL); - ug_mo = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, 
reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, rhits?1:0, b_mask_t, NULL, NULL, NULL); + // ug_fa = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, + // 0.05, 0.9, max_hang, min_ovlp, gap_fuzz, rhits?1:0, b_mask_t, NULL, NULL, NULL); + // ug_mo = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, + // 0.05, 0.9, max_hang, min_ovlp, gap_fuzz, rhits?1:0, b_mask_t, NULL, NULL, NULL); + + + output_trio_graph_joint(sg, coverage_cut, output_file_name, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, + 0.05, 0.9, max_hang, min_ovlp, gap_fuzz, b_mask_t, rhits?(&ug_fa):NULL, rhits?(&ug_mo):NULL); if(rhits) { ha_aware_order(rhits, sg, ug_fa, ug_mo, cov?&(cov->t_ch->k_trans):&(t_ch->k_trans), opt, 3); @@ -15612,7 +15897,7 @@ ma_ug_t *mm_ug, mmhap_t *rh, uint32_t n_hap) sprintf(fp, "hap%u", i+1); update_trio_mmhap(i, mm_ug, rh, sg, n_hap); output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, tipsLen, tip_drop_ratio, - stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, 0, b_mask_t, fp, NULL, NULL); + stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, gap_fuzz, 0, b_mask_t, fp, NULL, NULL); } free(fp); } @@ -15666,9 +15951,9 @@ float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, long long gap // if((asm_opt.flag & HA_F_VERBOSE_GFA)) write_trans_chain(cov->t_ch, output_file_name); } - dbg_prt_utg_trans(&(cov?cov->t_ch->k_trans:t_ch->k_trans), ug, "pre"); - clean_u_trans_t_idx_filter_mmhap_adv(&(cov?cov->t_ch->k_trans:t_ch->k_trans), ug, sg, opt->sources); - dbg_prt_utg_trans(&(cov?cov->t_ch->k_trans:t_ch->k_trans), ug, "after"); + // dbg_prt_utg_trans(&(cov?cov->t_ch->k_trans:t_ch->k_trans), ug, "pre"); + 
clean_u_trans_t_idx_filter_mmhap_adv(&(cov?cov->t_ch->k_trans:t_ch->k_trans), ug, sg, opt->sources, NULL); + // dbg_prt_utg_trans(&(cov?cov->t_ch->k_trans:t_ch->k_trans), ug, "after"); // refine_hic_trans_mmhap(opt, &(cov?cov->t_ch->k_trans:t_ch->k_trans), sg, ug); ///for debug @@ -15729,7 +16014,7 @@ float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, long long gap kv_destroy(d_edges.a); asg_cleanup(sg); - dbg_prt_trio_mmhap_label(ug, rh, output_file_name); + // dbg_prt_trio_mmhap_label(ug, rh, output_file_name); output_trio_mmhap(sg, coverage_cut, output_file_name, sources, reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, min_ovlp, gap_fuzz, b_mask_t, opt, ug, rh, asm_opt.polyploidy); @@ -15851,9 +16136,9 @@ long long gap_fuzz, bub_label_t* b_mask_t) reduce_hamming_error_adv(NULL, sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz, opt.ruIndex, NULL); output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL, NULL, NULL); + 0.05, 0.9, max_hang, min_ovlp, gap_fuzz, 0, b_mask_t, NULL, NULL, NULL); output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL, NULL, NULL); + 0.05, 0.9, max_hang, min_ovlp, gap_fuzz, 0, b_mask_t, NULL, NULL, NULL); } void set_trio_flag_by_cov(ma_ug_t *ug, asg_t *read_g, hap_cov_t *cov) @@ -16616,7 +16901,7 @@ void output_bp_graph_adv(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_na ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, -bub_label_t* b_mask_t, ug_opt_t *opt) +int gap_fuzz, bub_label_t* b_mask_t, ug_opt_t *opt) { hic_clean(sg); 
@@ -16696,16 +16981,15 @@ bub_label_t* b_mask_t, ug_opt_t *opt) output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL, NULL, NULL); + 0.05, 0.9, max_hang, min_ovlp, gap_fuzz, 0, b_mask_t, NULL, NULL, NULL); output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL, NULL, NULL); + 0.05, 0.9, max_hang, min_ovlp, gap_fuzz, 0, b_mask_t, NULL, NULL, NULL); } void output_bp_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, -ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, -long long tipsLen, float tip_drop_ratio, long long stops_threshold, -R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, -bub_label_t* b_mask_t) +ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, +R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, bub_label_t* b_mask_t, +long long gap_fuzz, ug_opt_t *opt) { hic_clean(sg); kvec_asg_arc_t_warp new_rtg_edges; @@ -16743,10 +17027,15 @@ bub_label_t* b_mask_t) ma_ug_destroy(ug); kv_destroy(new_rtg_edges.a); - output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL, NULL, NULL); - output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang, min_ovlp, 0, b_mask_t, NULL, NULL, NULL); + // reduce_hamming_error_adv(NULL, sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz, opt->ruIndex, NULL); + + // output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, 
(asm_opt.max_short_tip*2), 0.15, 3, ruIndex, + // 0.05, 0.9, max_hang, min_ovlp, gap_fuzz, 0, b_mask_t, NULL, NULL, NULL); + // output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, + // 0.05, 0.9, max_hang, min_ovlp, gap_fuzz, 0, b_mask_t, NULL, NULL, NULL); + + output_trio_graph_joint(sg, coverage_cut, output_file_name, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, + 0.05, 0.9, max_hang, min_ovlp, gap_fuzz, b_mask_t, NULL, NULL); } ma_ug_t* merge_utg(ma_ug_t **dest, ma_ug_t **src) @@ -16809,15 +17098,15 @@ ma_ug_t* merge_utg(ma_ug_t **dest, ma_ug_t **src) void benchmark_hic_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, -float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, bub_label_t* b_mask_t) +float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int gap_fuzz, bub_label_t* b_mask_t) { ma_ug_t *ug_1 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, - chimeric_rate, drop_ratio, max_hang, min_ovlp, 1, b_mask_t, NULL, NULL, NULL); + chimeric_rate, drop_ratio, max_hang, min_ovlp, gap_fuzz, 1, b_mask_t, NULL, NULL, NULL); ma_ug_t *ug_2 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, - chimeric_rate, drop_ratio, max_hang, min_ovlp, 1, b_mask_t, NULL, NULL, NULL); + chimeric_rate, drop_ratio, max_hang, min_ovlp, gap_fuzz, 1, b_mask_t, NULL, NULL, NULL); fprintf(stderr, "ug_1->u.n: %u, ug_2->u.n: %u\n", (uint32_t)ug_1->u.n, (uint32_t)ug_2->u.n); ma_ug_t *ug = merge_utg(&ug_1, &ug_2); fprintf(stderr, "ug->u.n: %u\n", (uint32_t)ug->u.n); @@ -18593,55 +18882,287 @@ asg_t* 
copy_read_graph(asg_t *src) return dest; } +rd_hamming_fly_t* gen_rd_hamming_fly_t(ma_ug_t *ug, asg_t *sg) +{ + rd_hamming_fly_t *p; CALLOC(p, 1); + MALLOC(p->o2n, sg->n_seq); + memset(p->o2n, -1, sizeof((*(p->o2n)))*sg->n_seq); + MALLOC(p->ugh, ug->g->n_seq); + return p; +} + +void destroy_rd_hamming_fly_t(rd_hamming_fly_t *p) +{ + free(p->o2n); free(p->srt->a); free(p->srt); + ma_ug_destroy(p->nug); asg_destroy(p->nsg); + asg_destroy(p->ref); free(p->ugh); +} + +void recall_arcs(asg_t *des, asg_t *src) +{ + uint32_t v, w, n_vtx = src->n_seq*2; + asg_arc_t *av, *za, *p; uint32_t an, zn, ai, zi, k; + kvec_t(asg_arc_t) ka; kv_init(ka); + + for (v = 0; v < n_vtx; ++v) { + if(src->seq[v>>1].del) continue; + za = asg_arc_a(src, v); zn = asg_arc_n(src, v); + av = asg_arc_a(des, v); an = asg_arc_n(des, v); + for (zi = 0; zi < zn; zi++) { + if(za[zi].del) continue; + w = za[zi].v; + for (ai = 0; ai < an; ai++) { + if(av[ai].del) continue; + if(av[ai].v == w) break; + } + if(ai >= an) kv_push(asg_arc_t, ka, za[zi]); + } + } + + if(ka.n) { + for (k = 0; k < ka.n; k++) { + p = asg_arc_pushp(des); *p = (ka.a[k]); + } + free(des->idx); + des->idx = 0; + des->is_srt = 0; + asg_cleanup(des); + // asg_symm(des); + } + fprintf(stderr, "[M::%s] # transitive arcs::%u\n", __func__, (uint32_t)ka.n); + fprintf(stderr, "[M::%s] # new arcs::%u, # old arcs::%u\n", __func__, des->n_arc, src->n_arc); + + kv_destroy(ka); +} + +ma_ug_t* gen_fg(ma_ug_t *ug, asg_t *rg, ma_hit_t_alloc* src, ma_sub_t *cov, int32_t max_hang, int32_t min_ovlp, int32_t gap_fuzz) +{ + uint32_t *idx; MALLOC(idx, rg->n_seq); + memset(idx, -1, sizeof((*idx))*rg->n_seq); + ma_ug_t *fg = copy_untig_graph(ug); asg_cleanup(fg->g);///some edges might be deleted + kvec_t(uint64_t) srt; kv_init(srt); + uint64_t i, k, l, m, rv, rw, uv, uw, zn, z, nist = 0; ma_utg_t *u; + for (k = 0; k < fg->u.n; k++) { + u = &(ug->u.a[k]); fg->g->seq[k].c = PRIMARY_LABLE; + if(u->circ) continue; + m = k<<1; m |= (((uint64_t)u->start)<<32); 
kv_push(uint64_t, srt, m); + m = (k<<1)+1; m |= (((uint64_t)u->end)<<32); kv_push(uint64_t, srt, m); + } + + radix_sort_arch64(srt.a, srt.a+srt.n); + for (k = 1, l = 0; k <= srt.n; k++) { + if(k == srt.n || (srt.a[k]>>33) != (srt.a[l]>>33)) { + idx[srt.a[l]>>33] = l; + l = k; + } + } + + ma_hit_t_alloc* x; asg_arc_t *za; + ma_hit_t *h; ma_sub_t *sq, *st; + int32_t r; asg_arc_t t0, t1, *p; + for (k = 0; k < fg->u.n; k++) { + u = &(ug->u.a[k]); + if(u->circ) continue; + + uv = k<<1; rv = u->end^1; + x = &(src[rv>>1]); + za = asg_arc_a(ug->g, uv); + zn = asg_arc_n(ug->g, uv); + for (i = 0; i < x->length; i++) { + h = &(x->buffer[i]); + // if(!(h->el)) continue; + sq = &(cov[Get_qn(*h)]); st = &(cov[Get_tn(*h)]); + if(st->del || rg->seq[Get_tn(*h)].del) continue; + r = ma_hit2arc(h, sq->e - sq->s, st->e - st->s, max_hang, + asm_opt.max_hang_rate, min_ovlp, &t0); + + ///if it is a contained read, skip + if(r < 0) continue; + if((t0.ul>>32) != rv) continue; + rw = t0.v; + if(idx[rw>>1] == ((uint32_t)-1)) continue; + m = idx[rw>>1]; assert((srt.a[m]>>33) == (rw>>1)); + for (; m < srt.n && (srt.a[m]>>33) == (rw>>1); m++) { + if(rw == (srt.a[m]>>32)) { + uw = (uint32_t)srt.a[m]; + if(uv == uw) continue; + for (z = 0; z < zn; z++) { + if((!za[z].del) && (za[z].v==uw)) break; + } + if(z < zn) continue; + if(get_edge_from_source(src, cov, NULL, max_hang, min_ovlp, (t0.v^1), ((t0.ul>>32)^1), &t1)) { + p = asg_arc_pushp(fg->g); *p = t0; + p->ul<<=32; p->ul>>=32; p->ul |= (uv<<32); p->v = uw; + + p = asg_arc_pushp(fg->g); *p = t1; + p->ul<<=32; p->ul>>=32; p->ul |= ((uw^1)<<32); p->v = uv^1; + nist++; + } + } + } + } + + + uv = (k<<1)+1; rv = u->start^1; + x = &(src[rv>>1]); + za = asg_arc_a(ug->g, uv); + zn = asg_arc_n(ug->g, uv); + for (i = 0; i < x->length; i++) { + h = &(x->buffer[i]); + // if(!(h->el)) continue; + sq = &(cov[Get_qn(*h)]); st = &(cov[Get_tn(*h)]); + if(st->del || rg->seq[Get_tn(*h)].del) continue; + r = ma_hit2arc(h, sq->e - sq->s, st->e - st->s, max_hang, + 
asm_opt.max_hang_rate, min_ovlp, &t0); + + ///if it is a contained read, skip + if(r < 0) continue; + if((t0.ul>>32) != rv) continue; + rw = t0.v; + if(idx[rw>>1] == ((uint32_t)-1)) continue; + m = idx[rw>>1]; assert((srt.a[m]>>33) == (rw>>1)); + for (; m < srt.n && (srt.a[m]>>33) == (rw>>1); m++) { + if(rw == (srt.a[m]>>32)) { + uw = (uint32_t)srt.a[m]; + if(uv == uw) continue; + for (z = 0; z < zn; z++) { + if((!za[z].del) && (za[z].v==uw)) break; + } + if(z < zn) continue; + if(get_edge_from_source(src, cov, NULL, max_hang, min_ovlp, (t0.v^1), ((t0.ul>>32)^1), &t1)) { + p = asg_arc_pushp(fg->g); *p = t0; + p->ul<<=32; p->ul>>=32; p->ul |= (uv<<32); p->v = uw; + + p = asg_arc_pushp(fg->g); *p = t1; + p->ul<<=32; p->ul>>=32; p->ul |= ((uw^1)<<32); p->v = uv^1; + nist++; + } + } + } + } + } + + if(nist) { + free(fg->g->idx); + fg->g->idx = 0; + fg->g->is_srt = 0; + asg_cleanup(fg->g); + asg_symm(fg->g); + asg_arc_del_trans(fg->g, gap_fuzz); + ///some of old edges might be lost due the transitive reduction + recall_arcs(fg->g, ug->g); + } + + kv_destroy(srt); free(idx); + return fg; +} + +rd_hamming_fly_simp_t* gen_rd_hamming_fly_simp_t(ma_ug_t *ug, asg_t *rg, ma_hit_t_alloc* src, ma_sub_t *cov, int32_t max_hang, int32_t min_ovlp, int32_t gap_fuzz, kvec_asg_arc_t_warp *ae) +{ + rd_hamming_fly_simp_t *p; CALLOC(p, 1); + // p->ng = asg_init(); + // p->ng->n_seq = p->ng->m_seq = ug->g->n_seq; + // MALLOC(p->ng->seq, p->ng->n_seq); + // memcpy(p->ng->seq, ug->g->seq, (sizeof((*(p->ng->seq)))*p->ng->n_seq)); + p->src = src; p->cov = cov; p->max_hang = max_hang; p->min_ovlp = min_ovlp; p->gap_fuzz = gap_fuzz; + p->fg = gen_fg(ug, rg, src, cov, max_hang, min_ovlp, gap_fuzz); p->n_insert = 0; + CALLOC(p->vs, (ug->g->n_seq<<1)); CALLOC(p->srt, 1); p->ae = ae; + // p->fg = gen_fg(); + // p->rg = rg; MALLOC(p->rs, rg->n_seq<<1); + // memset(p->rs, -1, sizeof((*(p->rs)))*(rg->n_seq<<1)); + return p; +} + +void destroy_rd_hamming_fly_simp_t(rd_hamming_fly_simp_t *p) +{ + 
///asg_destroy(p->ng); ///free(p->rs); + ma_ug_destroy(p->fg); + free(p->vs); + free(p->srt->a); + free(p->srt); +} + void clean_trio_untig_graph(ma_ug_t *ug, asg_t *read_g, ma_sub_t* coverage_cut, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, buf_t* b_0, uint8_t* visit, float density, uint32_t miniHapLen, uint32_t miniBiGraph, float chimeric_rate, int is_final_clean, int just_bubble_pop, -float drop_ratio, uint32_t trio_flag, float trio_drop_rate, hap_cov_t *cov) +float drop_ratio, uint32_t trio_flag, float trio_drop_rate, int max_hang, int min_ovlp, +int gap_fuzz, hap_cov_t *cov, kvec_asg_arc_t_warp *ae) { - asg_t *g = ug->g; + asg_t *g = ug->g; rd_hamming_fly_simp_t *p = NULL; uint32_t is_first = 1; - + // if(trio_flag == MOTHER) { + // print_debug_gfa(read_g, ug, coverage_cut, "debug_dups", sources, ruIndex, asm_opt.max_hang_Len, asm_opt.min_overlap_Len, 0, 1, 0); + // exit(1); + // } redo: - ///print_untig((ug), 61955, "i-0:", 0); - + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-0:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-0:", 0); + ///debug + // if(!p) p = gen_rd_hamming_fly_simp_t(ug, read_g, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz); // fprintf(stderr, "[M::%s] 0\n", __func__); - asg_pop_bubble_primary_trio(ug, NULL, trio_flag, DROP, cov, NULL, 1); + asg_pop_bubble_primary_trio(ug, NULL, trio_flag, DROP, cov, NULL, 1, p); + ///do not need to refine bubbles during the first round of cleaning + if(!p) p = gen_rd_hamming_fly_simp_t(ug, read_g, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz, ae); + + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-1:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-1:", 0); // fprintf(stderr, "[M::%s] 1\n", __func__); magic_trio_phasing(g, ug, read_g, coverage_cut, sources, reverse_sources, 2, ruIndex, trio_flag, trio_drop_rate); // fprintf(stderr, "[M::%s] 2\n", __func__); + + 
// if(trio_flag == MOTHER) print_untig((ug), 9, "i-2:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-2:", 0); /**********debug**********/ if(just_bubble_pop == 0) { cut_trio_tip_primary(g, ug, tipsLen, trio_flag, 0, read_g, reverse_sources, ruIndex, cov->is_r_het, 2); } // fprintf(stderr, "[M::%s] 3\n", __func__); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-3:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-3:", 0); /**********debug**********/ long long pre_cons = get_graph_statistic(g); long long cur_cons = 0; while(pre_cons != cur_cons) { // fprintf(stderr, "[M::%s] 4\n", __func__); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-4:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-4:", 0); pre_cons = get_graph_statistic(g); // fprintf(stderr, "[M::%s] 5\n", __func__); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-5:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-5:", 0); ///need consider tangles - asg_pop_bubble_primary_trio(ug, NULL, trio_flag, DROP, cov, NULL, 1); + asg_pop_bubble_primary_trio(ug, NULL, trio_flag, DROP, cov, NULL, 1, p); // fprintf(stderr, "[M::%s] 6\n", __func__); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-6:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-6:", 0); /**********debug**********/ if(just_bubble_pop == 0) { ///need consider tangles asg_arc_cut_trio_long_tip_primary(g, ug, read_g, reverse_sources, ruIndex, 2, tip_drop_ratio, trio_flag, cov, NULL); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-7:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-7:", 0); // fprintf(stderr, "[M::%s] 7\n", __func__); // if(trio_flag == MOTHER) print_debug_gfa(read_g, ug, coverage_cut, "debug_dups", sources, ruIndex, asm_opt.max_hang_Len, asm_opt.min_overlap_Len); asg_arc_cut_trio_long_equal_tips_assembly(g, ug, read_g, reverse_sources, 2, ruIndex, trio_flag, cov, NULL); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-8:", 0); + // 
if(trio_flag == MOTHER) print_untig((ug), 10, "i-8:", 0); // fprintf(stderr, "[M::%s] 8\n", __func__); asg_arc_cut_trio_long_tip_primary_complex(g, ug, read_g, reverse_sources, ruIndex, 2, tip_drop_ratio, stops_threshold, cov, NULL, trio_flag); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-9:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-9:", 0); // fprintf(stderr, "[M::%s] 9\n", __func__); asg_arc_cut_trio_long_equal_tips_assembly_complex(g, ug, read_g, reverse_sources, 2, ruIndex, stops_threshold, cov, NULL, trio_flag); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-10:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-10:", 0); // fprintf(stderr, "[M::%s] 10\n", __func__); detect_chimeric_by_topo(g, ug, read_g, reverse_sources, 2, stops_threshold, chimeric_rate, ruIndex, NULL, cov->is_r_het); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-11:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-11:", 0); // fprintf(stderr, "[M::%s] 11\n", __func__); ///need consider tangles ///note we need both the read graph and the untig graph @@ -18649,40 +19170,60 @@ float drop_ratio, uint32_t trio_flag, float trio_drop_rate, hap_cov_t *cov) /**********debug**********/ cur_cons = get_graph_statistic(g); // fprintf(stderr, "[M::%s] 12\n", __func__); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-12:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-12:", 0); } if(just_bubble_pop == 0) { // fprintf(stderr, "[M::%s] 13\n", __func__); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-13:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-13:", 0); cut_trio_tip_primary(g, ug, tipsLen, trio_flag, 0, read_g, reverse_sources, ruIndex, cov->is_r_het, 2); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-14:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-14:", 0); // fprintf(stderr, "[M::%s] 14\n", __func__); } // print_debug_gfa(read_g, ug, coverage_cut, "debug_dups", sources, ruIndex, 
asm_opt.max_hang_Len, asm_opt.min_overlap_Len); // fprintf(stderr, "[M::%s] 15\n", __func__); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-15:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-15:", 0); magic_trio_phasing(g, ug, read_g, coverage_cut, sources, reverse_sources, 2, ruIndex, trio_flag, trio_drop_rate); // fprintf(stderr, "[M::%s] 16\n", __func__); - + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-16:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-16:", 0); // print_debug_gfa(read_g, ug, coverage_cut, "resolve_tangles", sources, ruIndex, asm_opt.max_hang_Len, asm_opt.min_overlap_Len, 0, 0, 0); // exit(1); ///bug here resolve_tangles(ug, read_g, reverse_sources, 20, 100, 0.05, 0.2, ruIndex, cov->is_r_het, trio_flag, drop_ratio); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-17:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-17:", 0); // fprintf(stderr, "[M::%s] 17\n", __func__); drop_semi_circle(ug, g, read_g, reverse_sources, ruIndex, cov->is_r_het); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-18:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-18:", 0); // fprintf(stderr, "[M::%s] 18\n", __func__); all_to_all_deduplicate(ug, read_g, coverage_cut, sources, trio_flag, trio_drop_rate, reverse_sources, ruIndex, cov->is_r_het, DOUBLE_CHECK_THRES, asm_opt.trio_flag_occ_thres); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-19:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-19:", 0); // fprintf(stderr, "[M::%s] 19\n", __func__); // if(trio_flag == MOTHER) print_untig_by_read(ug, "m54329U_190827_173812/30214441/ccs", (uint32_t)-1, NULL, NULL, "bf-16"); if(is_first) { is_first = 0; unitig_arc_del_short_diploid_by_length(ug->g, drop_ratio); + // if(trio_flag == MOTHER) print_untig((ug), 9, "i-20:", 0); + // if(trio_flag == MOTHER) print_untig((ug), 10, "i-20:", 0); // fprintf(stderr, "[M::%s] 20\n", __func__); goto redo; - } + } + if(p) { + fprintf(stderr, "[M::%s] # 
adjusted arcs::%u\n", __func__, p->n_insert); + destroy_rd_hamming_fly_simp_t(p); free(p); + } } - void print_graph_statistic(asg_t *g, const char* cmd) { uint64_t n_arc = 0, n_node = 0, size = 0; @@ -18714,7 +19255,7 @@ int just_bubble_pop, float drop_ratio, hap_cov_t *cov) redo: ///print_graph_statistic(g, "beg"); ///print_debug_gfa(read_g, ug, coverage_cut, "debug_trans_ovlp_hg002", sources, ruIndex, asm_opt.max_hang_Len, asm_opt.min_overlap_Len); - asg_pop_bubble_primary_trio(ug, NULL, (uint32_t)-1, DROP, cov, NULL, 1); + asg_pop_bubble_primary_trio(ug, NULL, (uint32_t)-1, DROP, cov, NULL, 1, NULL); if(just_bubble_pop == 0) { cut_trio_tip_primary(g, ug, tipsLen, (uint32_t)-1, 0, read_g, reverse_sources, ruIndex, cov->is_r_het, 2); @@ -18725,7 +19266,7 @@ int just_bubble_pop, float drop_ratio, hap_cov_t *cov) while(pre_cons != cur_cons) { pre_cons = get_graph_statistic(g); - asg_pop_bubble_primary_trio(ug, NULL, (uint32_t)-1, DROP, cov, NULL, 1); + asg_pop_bubble_primary_trio(ug, NULL, (uint32_t)-1, DROP, cov, NULL, 1, NULL); if(just_bubble_pop == 0) { ///need consider tangles @@ -18777,7 +19318,7 @@ int min_ovlp, hap_cov_t *cov) // print_debug_gfa(read_g, ug, coverage_cut, "debug_init", sources, ruIndex, asm_opt.max_hang_Len, asm_opt.min_overlap_Len); redo: - asg_pop_bubble_primary_trio(ug, NULL, (uint32_t)-1, DROP, cov, o, 1); + asg_pop_bubble_primary_trio(ug, NULL, (uint32_t)-1, DROP, cov, o, 1, NULL); cut_trio_tip_primary(g, ug, tipsLen, (uint32_t)-1, 0, read_g, reverse_sources, ruIndex, cov->is_r_het, 2); long long pre_cons = get_graph_statistic(g); @@ -18789,7 +19330,7 @@ int min_ovlp, hap_cov_t *cov) while(pre_cons != cur_cons) { pre_cons = get_graph_statistic(g); - asg_pop_bubble_primary_trio(ug, NULL, (uint32_t)-1, DROP, cov, o, 1); + asg_pop_bubble_primary_trio(ug, NULL, (uint32_t)-1, DROP, cov, o, 1, NULL); ///need consider tangles asg_arc_cut_trio_long_tip_primary(g, ug, read_g, reverse_sources, ruIndex, 2, tip_drop_ratio, (uint32_t)-1, cov, o); @@ 
-19338,16 +19879,105 @@ void delete_useless_nodes(ma_ug_t **ug) asg_cleanup(nsg); } +inline uint32_t is_useful_node(uint32_t flag_occ, uint32_t non_flag_occ, uint32_t drop_occ, uint32_t tot_occ, +float flag_rate, float used_rate, uint32_t min_occ) +{ + if((flag_occ > 0) && (flag_occ >= min_occ) && (flag_occ >= ((non_flag_occ+flag_occ+drop_occ)*flag_rate)) + && (drop_occ <= (tot_occ*used_rate))) { + return 1; + } + return 0; +} +void recover_chain_nodes(buf_t *in, ma_ug_t *ug, uint32_t flag, float flag_rate, float used_rate, uint32_t min_occ) +{ + ma_utg_t *u = NULL; uint32_t flag_occ, non_flag_occ, drop_occ, rid; + uint32_t tot_flag_occ, tot_non_flag_occ, tot_drop_occ, tot_occ, i, k, z; + tot_flag_occ = tot_non_flag_occ = tot_drop_occ = tot_occ = 0; + + for (i = 0; i < in->b.n; i++) { + u = &(ug->u.a[in->b.a[i]>>1]); + flag_occ = non_flag_occ = drop_occ = 0; + for (k = 0; k < u->n; k++) { + rid = u->a[k]>>33; + if(R_INF.trio_flag[rid] == AMBIGU) continue; + else if(R_INF.trio_flag[rid] == DROP) drop_occ++; + else if(R_INF.trio_flag[rid] == flag) flag_occ++; + else if(R_INF.trio_flag[rid] != flag) non_flag_occ++; + } + + if(is_useful_node(flag_occ, non_flag_occ, drop_occ, u->n, flag_rate, used_rate, min_occ)) { + ug->g->seq[in->b.a[i]>>1].c = PRIMARY_LABLE; + } + tot_flag_occ += flag_occ; + tot_non_flag_occ += non_flag_occ; + tot_drop_occ += drop_occ; + tot_occ += u->n; + if(is_useful_node(tot_flag_occ, tot_non_flag_occ, tot_drop_occ, tot_occ, flag_rate, used_rate, min_occ)) { + for (z = 0; z <= i; z++) { + if(ug->g->seq[in->b.a[z]>>1].c == ALTER_LABLE) { + u = &(ug->u.a[in->b.a[z]>>1]); + flag_occ = non_flag_occ = drop_occ = 0; + for (k = 0; k < u->n; k++) { + rid = u->a[k]>>33; + if(R_INF.trio_flag[rid] == AMBIGU) continue; + else if(R_INF.trio_flag[rid] == DROP) drop_occ++; + else if(R_INF.trio_flag[rid] == flag) flag_occ++; + else if(R_INF.trio_flag[rid] != flag) non_flag_occ++; + } + if(is_useful_node(flag_occ, non_flag_occ, drop_occ, u->n, flag_rate, 
used_rate, 0/**min_occ**/)) { + ug->g->seq[in->b.a[z]>>1].c = PRIMARY_LABLE; + } + } + } + } + } +} + +void rescue_useless_trio_nodes(ma_ug_t *ug, uint32_t flag, float flag_rate, float used_rate, uint32_t min_occ) +{ + asg_t* nsg = ug->g; + uint32_t v, w, n_vtx = nsg->n_seq<<1, i, k, z; + long long nodeLen, baseLen, max_stop_nodeLen, max_stop_baseLen; + buf_t b; memset(&b, 0, sizeof(buf_t)); + + for (v = 0; v < n_vtx; v++) { + if(nsg->seq[v>>1].del) continue; + if(nsg->seq[v>>1].c != ALTER_LABLE) continue; + ///check if beg is the tig end + if(get_real_length(nsg, v^1, NULL) == 1) { + get_real_length(nsg, v^1, &w); + if(get_real_length(nsg, w^1, NULL) == 1) continue; + } + + b.b.n = 0; + get_unitig(nsg, ug, v, &w, &nodeLen, &baseLen, &max_stop_nodeLen, &max_stop_baseLen, 1, &b); + recover_chain_nodes(&b, ug, flag, flag_rate, used_rate, min_occ); + if(b.b.n > 1) { + k = b.b.n>>1; + for (i = 0; i < k; i++) { + z = b.b.a[i]; b.b.a[i] = b.b.a[b.b.n-i-1]; b.b.a[b.b.n-i-1] = z; + } + recover_chain_nodes(&b, ug, flag, flag_rate, used_rate, min_occ); + } + } + + free(b.b.a); + asg_cleanup(nsg); +} void delete_useless_trio_nodes(ma_ug_t **ug, asg_t* read_g, ma_sub_t* coverage_cut, -ma_hit_t_alloc* sources, R_to_U* ruIndex) +ma_hit_t_alloc* sources, R_to_U* ruIndex, uint32_t flag, float flag_rate, float used_rate, uint32_t min_occ) { asg_t* nsg = (*ug)->g; uint32_t v, n_vtx = nsg->n_seq; uint8_t* primary_flag = (uint8_t*)calloc(read_g->n_seq, sizeof(uint8_t)); + if(flag_rate > 0 && used_rate > 0 && min_occ > 0) { + rescue_useless_trio_nodes(*ug, flag, flag_rate, used_rate, min_occ); + } + for (v = 0; v < n_vtx; ++v) { if(nsg->seq[v].del) continue; @@ -19588,7 +20218,7 @@ void purge_dump(ma_ug_t* ug) void adjust_utg_by_trio(ma_ug_t **ug, asg_t* read_g, uint8_t flag, float drop_rate, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, ma_sub_t* coverage_cut, long long tipsLen, float tip_drop_ratio, long long stops_threshold, -R_to_U* ruIndex, float chimeric_rate, float 
drop_ratio, int max_hang, int min_ovlp, +R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int gap_fuzz, kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t) { asg_t* nsg = (*ug)->g; @@ -19632,11 +20262,11 @@ kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t) } // fprintf(stderr, "[M::%s] 3\n", __func__); clean_trio_untig_graph(*ug, read_g, coverage_cut, sources, reverse_sources, tipsLen, - tip_drop_ratio, stops_threshold, ruIndex, NULL, NULL, 0, 0, 0, chimeric_rate, 0, 0, drop_ratio, flag, drop_rate, cov); + tip_drop_ratio, stops_threshold, ruIndex, NULL, NULL, 0, 0, 0, chimeric_rate, 0, 0, drop_ratio, flag, drop_rate, max_hang, min_ovlp, gap_fuzz, cov, new_rtg_edges); // fprintf(stderr, "[M::%s] 4\n", __func__); ///delete_useless_nodes(ug); - delete_useless_trio_nodes(ug, read_g, coverage_cut, sources, ruIndex); + delete_useless_trio_nodes(ug, read_g, coverage_cut, sources, ruIndex, flag, 0.8, 0.15, 16); // fprintf(stderr, "[M::%s] 5\n", __func__); update_hap_label(*ug, read_g); @@ -19684,7 +20314,7 @@ kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t) // fprintf(stderr, "[M::%s] 16\n", __func__); ///delete_useless_nodes(ug); - delete_useless_trio_nodes(ug, read_g, coverage_cut, sources, ruIndex); + delete_useless_trio_nodes(ug, read_g, coverage_cut, sources, ruIndex, flag, 0.8, 0.15, 16); // fprintf(stderr, "[M::%s] 17\n", __func__); @@ -19694,7 +20324,7 @@ kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t) asm_opt.purge_simi_thres, asm_opt.purge_overlap_len, max_hang, min_ovlp, drop_ratio, 1, 0, cov, 0, 0); ///delete_useless_nodes(ug); - delete_useless_trio_nodes(ug, read_g, coverage_cut, sources, ruIndex); + delete_useless_trio_nodes(ug, read_g, coverage_cut, sources, ruIndex, flag, 0.8, 0.15, 16); } // fprintf(stderr, "[M::%s] 18\n", __func__); @@ -19724,27 +20354,49 @@ int debug_untig_length(ma_ug_t *g, uint32_t tipsLen, const char* name) return 0; } +void prt_phase_dbg_graph(char *in, asg_t 
*sg, ma_sub_t *cov, ma_hit_t_alloc *src, R_to_U* ri, int max_hang, int min_ovlp) +{ + char* gfa_name = (char*)malloc(strlen(in)+100); + sprintf(gfa_name, "%s.phase", in); + uint64_t pscut = 0; + pscut = (asm_opt.hom_global_coverage_set?(asm_opt.hom_global_coverage):(((double)asm_opt.hom_global_coverage)/((double)HOM_PEAK_RATE))); + pscut *= PHASE_SEF; if(pscut < PHASE_SEP) pscut = PHASE_SEP; + ma_ug_t *ug = ma_ug_gen_phase(sg, pscut, PHASE_SEP_RATE); + print_debug_gfa(sg, ug, cov, gfa_name, src, ri, max_hang, min_ovlp, 0, 0, 0); + ma_ug_destroy(ug); + sprintf(gfa_name, "%s.raw", in); + ug = ma_ug_gen(sg); + print_debug_gfa(sg, ug, cov, gfa_name, src, ri, max_hang, min_ovlp, 0, 0, 0); + ma_ug_destroy(ug); + + free(gfa_name); +} ma_ug_t* output_trio_unitig_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, uint8_t flag, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, -float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, bub_label_t* b_mask_t, -char *f_prefix, uint8_t *kpt_buf, kvec_asg_arc_t_warp *r_edges) +float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int gap_fuzz, +int is_bench, bub_label_t* b_mask_t, char *f_prefix, uint8_t *kpt_buf, kvec_asg_arc_t_warp *r_edges) { char* gfa_name = (char*)malloc(strlen(output_file_name)+100); sprintf(gfa_name, "%s.%s.p_ctg.gfa", output_file_name, f_prefix?f_prefix:(flag==FATHER?"hap1":"hap2")); FILE* output_file = NULL; if(is_bench == 0) output_file = fopen(gfa_name, "w"); - ma_ug_t *ug = NULL; - ug = ma_ug_gen(sg); + // prt_phase_dbg_graph(gfa_name, sg, coverage_cut, sources, ruIndex, max_hang, min_ovlp); + + ma_ug_t *ug = NULL; uint64_t pscut = 0; + // ug = ma_ug_gen(sg); + pscut = (asm_opt.hom_global_coverage_set?(asm_opt.hom_global_coverage):(((double)asm_opt.hom_global_coverage)/((double)HOM_PEAK_RATE))); + pscut *= PHASE_SEF; if(pscut < PHASE_SEP) pscut = PHASE_SEP; + 
ug = ma_ug_gen_phase(sg, pscut, PHASE_SEP_RATE); kvec_asg_arc_t_warp new_rtg_edges; kv_init(new_rtg_edges.a); adjust_utg_by_trio(&ug, sg, flag, TRIO_THRES, sources, reverse_sources, coverage_cut, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, drop_ratio, max_hang, - min_ovlp, &new_rtg_edges, b_mask_t); + min_ovlp, gap_fuzz, &new_rtg_edges, b_mask_t); if(asm_opt.b_low_cov > 0) { break_ug_contig(&ug, sg, &R_INF, coverage_cut, sources, ruIndex, &new_rtg_edges, max_hang, min_ovlp, @@ -19806,6 +20458,39 @@ char *f_prefix, uint8_t *kpt_buf, kvec_asg_arc_t_warp *r_edges) } +void output_hap_graph(ma_ug_t *ug, asg_t *sg, kvec_asg_arc_t_warp *arcs, +ma_sub_t* coverage_cut, char* output_file_name, uint8_t flag, ma_hit_t_alloc* sources, +R_to_U* ruIndex, int max_hang, int min_ovlp, char *f_prefix) +{ + char* gfa_name = (char*)malloc(strlen(output_file_name)+100); + sprintf(gfa_name, "%s.%s.p_ctg.gfa", output_file_name, f_prefix?f_prefix:(flag==FATHER?"hap1":"hap2")); + FILE* output_file = fopen(gfa_name, "w"); + + fprintf(stderr, "Writing %s to disk... 
\n", gfa_name); + ///debug_utg_graph(ug, sg, 0, 0); + ///debug_untig_length(ug, tipsLen, gfa_name); + ///print_untig_by_read(ug, "m64011_190901_095311/125831121/ccs", 2310925, "end"); + ma_ug_seq(ug, sg, coverage_cut, sources, arcs, max_hang, min_ovlp, 0, 1); + ma_ug_print(ug, sg, coverage_cut, sources, ruIndex, (flag==FATHER?"h1tg":"h2tg"), output_file); + fclose(output_file); + + sprintf(gfa_name, "%s.%s.p_ctg.noseq.gfa", output_file_name, f_prefix?f_prefix:(flag==FATHER?"hap1":"hap2")); + output_file = fopen(gfa_name, "w"); + ma_ug_print_simple(ug, sg, coverage_cut, sources, ruIndex, (flag==FATHER?"h1tg":"h2tg"), output_file); + fclose(output_file); + if(asm_opt.bed_inconsist_rate != 0) + { + sprintf(gfa_name, "%s.%s.p_ctg.lowQ.bed", output_file_name, f_prefix?f_prefix:(flag==FATHER?"hap1":"hap2")); + output_file = fopen(gfa_name, "w"); + ma_ug_print_bed(ug, sg, &R_INF, coverage_cut, sources, arcs, + max_hang, min_ovlp, asm_opt.bed_inconsist_rate, (flag==FATHER?"h1tg":"h2tg"), output_file, NULL); + fclose(output_file); + } + + free(gfa_name); +} + + void filter_set_kug(uint8_t* trio_flag, asg_t *rg, uint8_t *rf, kvec_asg_arc_t_warp *r_edges, float f_rate, ma_ug_t **ug) { asg_t* nsg = (*ug)->g; ma_utg_t *u = NULL; @@ -19857,11 +20542,11 @@ int min_ovlp, int is_bench, long long gap_fuzz, ug_opt_t *opt, bub_label_t* b_ma output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, - drop_ratio, max_hang, min_ovlp, is_bench, b_mask_t, NULL, rf, NULL); + drop_ratio, max_hang, min_ovlp, gap_fuzz, is_bench, b_mask_t, NULL, rf, NULL); output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, - drop_ratio, max_hang, min_ovlp, is_bench, b_mask_t, NULL, rf, NULL); + drop_ratio, max_hang, min_ovlp, gap_fuzz, is_bench, b_mask_t, NULL, rf, NULL); if(rf) { kvec_asg_arc_t_warp 
r_edges; kv_init(r_edges.a); @@ -19872,7 +20557,7 @@ int min_ovlp, int is_bench, long long gap_fuzz, ug_opt_t *opt, bub_label_t* b_ma update_dump_trio(R_INF.trio_flag, sg->n_seq, rf, NULL); kug = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, - drop_ratio, max_hang, min_ovlp, 1, b_mask_t, NULL, NULL, &r_edges); + drop_ratio, max_hang, min_ovlp, gap_fuzz, 1, b_mask_t, NULL, NULL, &r_edges); filter_set_kug(R_INF.trio_flag, sg, rf, &r_edges, asm_opt.kpt_rate, &kug); print_utg(kug, sg, coverage_cut, kug_n, sources, ruIndex, max_hang, min_ovlp, &r_edges); @@ -19881,6 +20566,280 @@ int min_ovlp, int is_bench, long long gap_fuzz, ug_opt_t *opt, bub_label_t* b_ma } } +dedup_idx_t *gen_dedup_idx_t(ma_ug_t *ug, asg_t *rg) +{ + // fprintf(stderr, "[M::%s] Start\n", __func__); + dedup_idx_t *p = NULL; CALLOC(p, 1); + p->rg = rg; p->ug = ug; p->ridx_n = rg->n_seq + 1; + uint64_t k, m, l, z, *a, a_n; ma_utg_t *u; + + CALLOC(p->ridx, p->ridx_n); + for (k = p->ra_n = 0; k < ug->u.n; k++) { + u = &(ug->u.a[k]); p->ra_n += u->n; + if(!(u->n)) continue; + for (z = 0; z < u->n; z++) p->ridx[u->a[z]>>33]++; + } + for (k = l = 0; k < p->ridx_n; k++) { + m = p->ridx[k]; p->ridx[k] = l; l += m; + } + + + MALLOC(p->ra, p->ra_n); memset(p->ra, -1, sizeof((*(p->ra)))*p->ra_n); + for (k = 1; k < p->ridx_n; k++) { + a = p->ra + p->ridx[k-1]; + a_n = p->ridx[k] - p->ridx[k-1]; + if(a_n) a[a_n-1] = 0; + } + + for (k = 0; k < ug->u.n; k++) { + u = &(ug->u.a[k]); + if(!(u->n)) continue; + for (z = 0; z < u->n; z++) { + a = p->ra + p->ridx[u->a[z]>>33]; + a_n = p->ridx[(u->a[z]>>33)+1] - p->ridx[u->a[z]>>33]; + if(a_n) { + if(a[a_n-1] == a_n-1) a[a_n-1] = (k<<32)|z; + else a[a[a_n-1]++] = (k<<32)|z; + } + } + } + // fprintf(stderr, "[M::%s] End\n", __func__); + return p; +} + +void destroy_dedup_idx_t(dedup_idx_t *p) +{ + // fprintf(stderr, "[M::%s] Start\n", __func__); + if(p == NULL) 
return; + free(p->ridx); free(p->ra); free(p); + // fprintf(stderr, "[M::%s] End\n", __func__); +} + +void update_recover_atg_cov() +{ + if(asm_opt.recover_atg_cov_min == -1024) { + asm_opt.recover_atg_cov_max = (asm_opt.hom_global_coverage_set? + (asm_opt.hom_global_coverage):(((double)asm_opt.hom_global_coverage)/((double)HOM_PEAK_RATE))); + asm_opt.recover_atg_cov_min = asm_opt.recover_atg_cov_max * 0.85; + asm_opt.recover_atg_cov_max = INT32_MAX; + } +} + +int64_t cal_exact_ug_o(dedup_idx_t *idx, ma_utg_t *u, uint64_t f) +{ + if(u->n <= 0) return INT32_MIN; + uint64_t sv, *sa, sn, ev, *ea, en, si, ei, zn, *za, rev, k, nf = (uint64_t)-1, m, fn, nfn; ma_utg_t *z; + if(f == FATHER) nf = MOTHER; if(f == MOTHER) nf = FATHER; + sv = u->a[0]>>32; sa = idx->ra + idx->ridx[sv>>1]; sn = idx->ridx[(sv>>1)+1]-idx->ridx[sv>>1]; + ev = u->a[u->n-1]>>32; ea = idx->ra + idx->ridx[ev>>1]; en = idx->ridx[(ev>>1)+1] - idx->ridx[ev>>1]; + for (si = 0; si < sn; si++) { + z = &(idx->ug->u.a[sa[si]>>32]); + if((z->n == 0) || (z->m == 0) || (idx->ug->g->seq[sa[si]>>32].del)) continue; + assert(((z->a[(uint32_t)sa[si]]>>32)>>1) == (sv>>1)); + if((z->a[(uint32_t)sa[si]]>>32) == sv) rev = 0; + else rev = 1; + for (ei = 0; ei < en; ei++) { + if((idx->ug->u.a[ea[ei]>>32].n == 0) || (idx->ug->u.a[ea[ei]>>32].m == 0) || (idx->ug->g->seq[ea[ei]>>32].del)) continue; + assert(((idx->ug->u.a[ea[ei]>>32].a[(uint32_t)ea[ei]]>>32)>>1) == (ev>>1)); + if((sa[si]>>32) != (ea[ei]>>32)) continue;///not the same uid + if(((uint32_t)sa[si]) >= ((uint32_t)ea[ei])) { + zn = ((uint32_t)sa[si]) - ((uint32_t)ea[ei]) + 1; za = z->a + ((uint32_t)ea[ei]); + } else { + zn = ((uint32_t)ea[ei]) - ((uint32_t)sa[si]) + 1; za = z->a + ((uint32_t)sa[si]); + } + if(u->n != zn) continue; + if(!rev) { + for (k = 0; (k < zn) && ((u->a[k]>>32) == ((za[k]>>32))); k++); + if(k < zn) continue; + } else { + for (k = 0; (k < zn) && ((u->a[k]>>32) == (((uint64_t)(za[zn-k-1]>>32))^1)); k++); + if(k < zn) continue; + } + + 
assert(u->n <= z->n); + if(u->n < z->n) { + return -1;///delete u + } else if(u->n == z->n) { + for (m = fn = nfn = 0; m < u->n; m++) { + if(R_INF.trio_flag[u->a[m]>>33] == f) fn++; + if(R_INF.trio_flag[u->a[m]>>33] == nf) nfn++; + } + if(fn < nfn) return -1;///delete u + else return sa[si]>>32;///delete z + } + } + } + return INT32_MIN; +} + +void delete_ug_node(ma_ug_t *ug, uint64_t nid) +{ + asg_seq_del(ug->g, nid); + if(ug->u.a[nid].m!=0) { + ug->u.a[nid].m = ug->u.a[nid].n = 0; + free(ug->u.a[nid].a); ug->u.a[nid].a = NULL; + } +} + +uint64_t dedup_exact_ug(dedup_idx_t *ref, dedup_idx_t *qry, ma_sub_t *cov, ma_hit_t_alloc *src, R_to_U *rui, uint8_t *ff, uint8_t trio_f) +{ + // fprintf(stderr, "[M::%s] Start\n", __func__); + uint64_t k, n_base = 0; int64_t f; + for (k = 0; k < qry->ug->u.n; k++) { + if((qry->ug->u.a[k].n == 0) || (qry->ug->u.a[k].m == 0) || (qry->ug->g->seq[k].del)) continue; + if(if_primary_unitig(&(qry->ug->u.a[k]), qry->rg, cov, src, rui, ff)) continue; + f = cal_exact_ug_o(ref, &(qry->ug->u.a[k]), trio_f); + if(f == INT32_MIN) continue; + if(f == -1) {///delete qry + delete_ug_node(qry->ug, k); n_base += qry->ug->u.a[k].len; + } else if(f >= 0) {///delete ref + delete_ug_node(ref->ug, f); n_base += ref->ug->u.a[f].len; + } + } + // fprintf(stderr, "[M::%s] End\n", __func__); + return n_base; +} + +void push_ma_utg_t(ma_ug_t *ug, ma_utg_t *u) +{ + ma_utg_t *p; + ///graph + asg_seq_set(ug->g, ug->u.n, u->len, 0); ug->g->seq[ug->u.n].c = 0; + + //unitig + kv_pushp(ma_utg_t, ug->u, &p); memset(p, 0, sizeof((*p))); + *p = *u; p->a = NULL; p->s = NULL; + MALLOC(p->a, p->m); memcpy(p->a, u->a, sizeof((*(p->a)))*p->m); + if(u->s) { + MALLOC(p->s, p->len); memcpy(p->s, u->s, sizeof((*(p->s)))*p->len); + } + assert(ug->g->n_seq == ug->u.n); + if(p->n) { + p->circ = 0; p->start = (p->a[0]>>32); p->end = (p->a[p->n-1]>>32)^1; + } +} + +uint64_t append_miss_nid(asg_t *sg, ma_ug_t *hap0, ma_ug_t *hap1, uint8_t *ff, uint64_t len_cut, uint64_t occ_cut) +{ 
+ ma_ug_t *ug = NULL; ma_utg_t *u; uint64_t k, z, pscut, n_set, fn, nfn, hap0n, hap1n, n_base = 0; + kvec_asg_arc_t_warp fe; memset(&fe, 0, sizeof(fe)); + memset(ff, 0, sizeof((*ff))*sg->n_seq); + ug = hap0; + for (k = 0; k < ug->u.n; k++) { + u = &(ug->u.a[k]); + if((u->n == 0) || (u->m == 0) || (ug->g->seq[k].del)) continue; + for (z = 0; z < u->n; z++) ff[u->a[z]>>33] = 1; + } + ug = hap1; + for (k = 0; k < ug->u.n; k++) { + u = &(ug->u.a[k]); + if((u->n == 0) || (u->m == 0) || (ug->g->seq[k].del)) continue; + for (z = 0; z < u->n; z++) ff[u->a[z]>>33] = 1; + } + ug = NULL; pscut = 0; + pscut = (asm_opt.hom_global_coverage_set?(asm_opt.hom_global_coverage):(((double)asm_opt.hom_global_coverage)/((double)HOM_PEAK_RATE))); + pscut *= PHASE_SEF; if(pscut < PHASE_SEP) pscut = PHASE_SEP; + ug = ma_ug_gen_phase(sg, pscut, PHASE_SEP_RATE); + for (k = 0; k < ug->u.n; k++) { + u = &(ug->u.a[k]); + if((u->n == 0) || (u->m == 0) || (ug->g->seq[k].del)) continue; + for (z = n_set = 0; z < u->n; z++) { + if(ff[u->a[z]>>33]) n_set++; + } + if((n_set > 0) && (n_set >= (u->n*0.2))) delete_ug_node(ug, k); + } + renew_utg((&ug), sg, &fe); + + for (k = hap0n = hap1n = 0; k < ug->u.n; k++) { + u = &(ug->u.a[k]); + if((u->n == 0) || (u->m == 0) || (ug->g->seq[k].del)) continue; + if((u->len < len_cut) || (u->n < occ_cut)) continue; + for (z = n_set = 0; z < u->n; z++) { + if(ff[u->a[z]>>33]) n_set++; + } + if((n_set > 0) && (n_set >= (u->n*0.2))) continue; + for (z = fn = nfn = 0; z < u->n; z++) { + if(R_INF.trio_flag[u->a[z]>>33] == FATHER) fn++; + if(R_INF.trio_flag[u->a[z]>>33] == MOTHER) nfn++; + } + if(fn > nfn) { + push_ma_utg_t(hap0, u); hap0n++; n_base += u->len; + } else { + push_ma_utg_t(hap1, u); hap1n++; n_base += u->len; + } + } + ma_ug_destroy(ug); ug = NULL; + if(hap0n) { + ug = hap0; + free(ug->g->idx); ug->g->idx = 0; ug->g->is_srt = 0; + asg_cleanup(ug->g); asg_symm(ug->g); + if(ug->g->seq_vis) { + REALLOC(ug->g->seq_vis, (ug->g->n_seq*2)); + memset(ug->g->seq_vis, 
0, sizeof((*(ug->g->seq_vis)))*(ug->g->n_seq*2)); + } + } + if(hap1n) { + ug = hap1; + free(ug->g->idx); ug->g->idx = 0; ug->g->is_srt = 0; + asg_cleanup(ug->g); asg_symm(ug->g); + if(ug->g->seq_vis) { + REALLOC(ug->g->seq_vis, (ug->g->n_seq*2)); + memset(ug->g->seq_vis, 0, sizeof((*(ug->g->seq_vis)))*(ug->g->n_seq*2)); + } + } + return n_base; +} + +void output_trio_graph_joint(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, +ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, +long long stops_threshold, R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, +int min_ovlp, long long gap_fuzz, bub_label_t* b_mask_t, ma_ug_t **rhu0, ma_ug_t **rhu1) +{ + ma_ug_t *hu0 = NULL, *hu1 = NULL; kvec_asg_arc_t_warp arcs0, arcs1; + memset(&arcs0, 0, sizeof(arcs0)); memset(&arcs1, 0, sizeof(arcs1)); + + reduce_hamming_error_adv(NULL, sg, sources, coverage_cut, max_hang, min_ovlp, gap_fuzz, ruIndex, NULL); + + hu0 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, FATHER, sources, + reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, + drop_ratio, max_hang, min_ovlp, gap_fuzz, 1, b_mask_t, NULL, NULL, &arcs0); + + hu1 = output_trio_unitig_graph(sg, coverage_cut, output_file_name, MOTHER, sources, + reverse_sources, tipsLen, tip_drop_ratio, stops_threshold, ruIndex, chimeric_rate, + drop_ratio, max_hang, min_ovlp, gap_fuzz, 1, b_mask_t, NULL, NULL, &arcs1); + + dedup_idx_t *hidx0 = NULL, *hidx1 = NULL; uint8_t *ff; CALLOC(ff, sg->n_seq); + hidx0 = gen_dedup_idx_t(hu0, sg); hidx1 = gen_dedup_idx_t(hu1, sg); + update_recover_atg_cov(); + + uint64_t dedup_base = 0, miss_base = 0, s; + s = dedup_exact_ug(hidx1, hidx0, coverage_cut, sources, ruIndex, ff, FATHER); dedup_base += s; + s = dedup_exact_ug(hidx0, hidx1, coverage_cut, sources, ruIndex, ff, MOTHER); dedup_base += s; + destroy_dedup_idx_t(hidx0); destroy_dedup_idx_t(hidx1); + + miss_base = append_miss_nid(sg, 
hu0, hu1, ff, PHASE_MISS_LEN, PHASE_MISS_N); free(ff); + + renew_utg((&hu0), sg, &arcs0); renew_utg((&hu1), sg, &arcs1); + fprintf(stderr, "[M::%s] dedup_base::%lu, miss_base::%lu\n", __func__, dedup_base, miss_base); + + if(!rhu0) { + output_hap_graph(hu0, sg, &arcs0, coverage_cut, output_file_name, FATHER, sources, ruIndex, max_hang, min_ovlp, NULL); + ma_ug_destroy(hu0); + } else { + (*rhu0) = hu0; + } + kv_destroy(arcs0.a); + + + if(!rhu1) { + output_hap_graph(hu1, sg, &arcs1, coverage_cut, output_file_name, MOTHER, sources, ruIndex, max_hang, min_ovlp, NULL); + ma_ug_destroy(hu1); + } else { + ma_ug_destroy(hu1); + } + kv_destroy(arcs1.a); +} + void output_read_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, long long n_read) { fprintf(stderr, "Writing read GFA to disk... \n"); @@ -21130,7 +22089,7 @@ int max_hang, int min_ovlp, uint8_t* trio_flag, uint8_t* vis_flag, kv_asg_arc_t* ma_sub_t *sq = NULL; ma_sub_t *st = NULL; int32_t r; - asg_arc_t t; + asg_arc_t t0, t1; for (k_i = 0; k_i < b->b.n; k_i++) @@ -21153,18 +22112,16 @@ int max_hang, int min_ovlp, uint8_t* trio_flag, uint8_t* vis_flag, kv_asg_arc_t* st = &(coverage_cut[Get_tn(*h)]); if(st->del || sg->seq[Get_tn(*h)].del) continue; r = ma_hit2arc(h, sq->e - sq->s, st->e - st->s, max_hang, - asm_opt.max_hang_rate, min_ovlp, &t); + asm_opt.max_hang_rate, min_ovlp, &t0); ///if it is a contained read, skip if(r < 0) continue; - if((t.ul>>32) != v) continue; - if(vis_flag[t.ul>>32] == 0 || vis_flag[t.v] == 0) continue; - kv_push(asg_arc_t, *e, t); - get_edge_from_source(sources, coverage_cut, NULL, max_hang, min_ovlp, - (t.v^1), ((t.ul>>32)^1), &t); - kv_push(asg_arc_t, *e, t); + if((t0.ul>>32) != v) continue; + if(vis_flag[t0.ul>>32] == 0 || vis_flag[t0.v] == 0) continue; + if(get_edge_from_source(sources, coverage_cut, NULL, max_hang, min_ovlp, (t0.v^1), ((t0.ul>>32)^1), &t1)) { + kv_push(asg_arc_t, *e, t0); kv_push(asg_arc_t, *e, t1); + } } - } } } @@ -21210,7 +22167,7 @@ buf_t *b, 
uint64_t tLen, uint64_t vis_f, asg_t *res, asg64_v *sv) ma_hit_t_alloc* x = NULL; ma_hit_t *h; ma_sub_t *sq, *st; - int32_t r; asg_arc_t t, *p; + int32_t r; asg_arc_t t0, t1, *p; for (k_i = 0; k_i < b->b.n; k_i++) { if((b->b.a[k_i]>>1)==(bi>>1) || (b->b.a[k_i]>>1)==(b->S.a[0]>>1)) continue; @@ -21224,22 +22181,22 @@ buf_t *b, uint64_t tLen, uint64_t vis_f, asg_t *res, asg64_v *sv) zn = asg_arc_n(sg, v); for (i = 0; i < x->length; i++) { h = &(x->buffer[i]); - if(!(h->el)) continue; + // if(!(h->el)) continue; sq = &(cov[Get_qn(*h)]); st = &(cov[Get_tn(*h)]); if(st->del || sg->seq[Get_tn(*h)].del) continue; r = ma_hit2arc(h, sq->e - sq->s, st->e - st->s, max_hang, - asm_opt.max_hang_rate, min_ovlp, &t); + asm_opt.max_hang_rate, min_ovlp, &t0); ///if it is a contained read, skip if(r < 0) continue; - if((t.ul>>32) != v) continue; - if((vis_r_flag[t.ul>>32] != vis_f) || (vis_r_flag[t.v] != vis_f)) continue; - for (z = 0; (z < zn) && (za[z].v != t.v); z++); + if((t0.ul>>32) != v) continue; + if((vis_r_flag[t0.ul>>32] != vis_f) || (vis_r_flag[t0.v] != vis_f)) continue; + for (z = 0; (z < zn) && (za[z].v != t0.v); z++); if(z < zn) continue; - - p = asg_arc_pushp(res); *p = t; - get_edge_from_source(src, cov, NULL, max_hang, min_ovlp, (t.v^1), ((t.ul>>32)^1), &t); - p = asg_arc_pushp(res); *p = t; + if(get_edge_from_source(src, cov, NULL, max_hang, min_ovlp, (t0.v^1), ((t0.ul>>32)^1), &t1)) { + p = asg_arc_pushp(res); *p = t0; + p = asg_arc_pushp(res); *p = t1; + } } } @@ -21833,12 +22790,19 @@ uint64_t rd_hm_bub(asg_t *g, asg_t *ref, uint32_t v0, uint64_t max_dist, buf_t * return n_pop; } -uint64_t rd_hm_drop0(asg_t *g, asg_t *ref, uint32_t v, double cutoff) +uint64_t rd_hm_drop0(asg_t *g, asg_t *ref, uint32_t v, double cutoff, uint32_t drop_inexact) { uint32_t nv0, nv1, mol = 0, i0, i1, ncut = 0; asg_arc_t *av0, *av1; nv0 = asg_arc_n(g, v); av0 = asg_arc_a(g, v); nv1 = asg_arc_n(ref, v); av1 = asg_arc_a(ref, v); - if(cutoff < 1) { + if(drop_inexact) { + for (i0 = 0; i0 
< nv0; ++i0) { // loop through v's neighbors + if (av0[i0].del) continue; + if (av0[i0].el) continue; + av0[i0].del = 1; asg_arc_del(g, av0[i0].v^1, (av0[i0].ul>>32)^1, 1); + ncut++; + } + } else if(cutoff < 1) { for (i0 = 0; i0 < nv0; ++i0) { // loop through v's neighbors if (av0[i0].del) continue; if(mol < av0[i0].ol) mol = av0[i0].ol; @@ -21866,7 +22830,7 @@ uint64_t rd_hm_drop0(asg_t *g, asg_t *ref, uint32_t v, double cutoff) return ncut; } -uint64_t rd_hm_drop(asg_t *g, asg_t *ref, uint32_t v0, uint32_t v1, double cutoff, buf_t *b) +uint64_t rd_hm_drop(asg_t *g, asg_t *ref, uint32_t v0, uint32_t v1, double cutoff, uint32_t drop_inexact, buf_t *b) { uint32_t i1, ncut = 0; uint32_t v, w, nv1, i; asg_arc_t *av1; @@ -21890,11 +22854,11 @@ uint64_t rd_hm_drop(asg_t *g, asg_t *ref, uint32_t v0, uint32_t v1, double cutof for (i = 0; i < b->b.n; ++i) { // clear the states of visited vertices v = b->b.a[i]; b->a[b->b.a[i]].s = 0; if(v == v0 || v == v1) continue; - ncut += rd_hm_drop0(g, ref, v, cutoff); - ncut += rd_hm_drop0(g, ref, v^1, cutoff); + ncut += rd_hm_drop0(g, ref, v, cutoff, drop_inexact); + ncut += rd_hm_drop0(g, ref, v^1, cutoff, drop_inexact); } - ncut += rd_hm_drop0(g, ref, v0, cutoff); - ncut += rd_hm_drop0(g, ref, v1^1, cutoff); + ncut += rd_hm_drop0(g, ref, v0, cutoff, drop_inexact); + ncut += rd_hm_drop0(g, ref, v1^1, cutoff, drop_inexact); return ncut; } @@ -21927,8 +22891,17 @@ void rd_hamming_symm(void *data, long i, int tid) // callback for kt_for() return; } + ///drop inexact edges first + cuttoff = -1; + ncut = rd_hm_drop(s->g, s->ref, st, ed^1, cuttoff, 1, b); + p = rd_hm_bub(s->g, s->ref, st, max_dist, b); + if(p) { + assert(b->S.a[0] == (ed^1)); + return; + } + for (cuttoff = step; cuttoff < 1.0; cuttoff += step) { - ncut = rd_hm_drop(s->g, s->ref, st, ed^1, cuttoff, b); + ncut = rd_hm_drop(s->g, s->ref, st, ed^1, cuttoff, 0, b); p = rd_hm_bub(s->g, s->ref, st, max_dist, b); if(p) { assert(b->S.a[0] == (ed^1)); @@ -21936,7 +22909,7 @@ void 
rd_hamming_symm(void *data, long i, int tid) // callback for kt_for() } if(!ncut) break; } - rd_hm_drop(s->g, s->ref, st, ed^1, 1024, b); + rd_hm_drop(s->g, s->ref, st, ed^1, 1024, 0, b); p = rd_hm_bub(s->g, s->ref, st, max_dist, b); if(p) { assert(b->S.a[0] == (ed^1)); @@ -21974,7 +22947,7 @@ void rd_hamming_symm_simple(rd_hamming_t *s, uint32_t st, uint32_t ed) // callba } for (cuttoff = step; cuttoff < 1.0; cuttoff += step) { - ncut = rd_hm_drop(s->g, s->ref, st, ed^1, cuttoff, b); + ncut = rd_hm_drop(s->g, s->ref, st, ed^1, cuttoff, 0, b); p = rd_hm_bub(s->g, s->ref, st, max_dist, b); if(p) { assert(b->S.a[0] == (ed^1)); @@ -21982,7 +22955,7 @@ void rd_hamming_symm_simple(rd_hamming_t *s, uint32_t st, uint32_t ed) // callba } if(!ncut) break; } - rd_hm_drop(s->g, s->ref, st, ed^1, 1024, b); + rd_hm_drop(s->g, s->ref, st, ed^1, 1024, 0, b); p = rd_hm_bub(s->g, s->ref, st, max_dist, b); if(p) { assert(b->S.a[0] == (ed^1)); @@ -21990,12 +22963,70 @@ void rd_hamming_symm_simple(rd_hamming_t *s, uint32_t st, uint32_t ed) // callba } } + +uint32_t rd_hamming_symm_simple0(buf_t *b, asg_t *ref, asg_t *g, uint32_t st, uint32_t ed, uint64_t max_dist, uint64_t *r_max_dist) // callback for kt_for() +{ + double step = 0.2, cuttoff; + uint32_t p, k, ncut; + p = rd_hm_bub(g, ref, st, max_dist, b); + if(p) { + assert(b->S.a[0] == (ed^1)); + if(r_max_dist) (*r_max_dist) = max_dist; + return 1; + } + ///recalculate max_dist + p = rd_hm_bub(ref, NULL, st, max_dist, b); + // if(!p) { + // fprintf(stderr, "[M::%s] st>>1::%u(st&1::%u), ed>>1::%u(ed&1::%u), max_dist::%lu\n", + // __func__, st>>1, st&1, ed>>1, ed&1, max_dist); + // } + assert(p); assert(b->S.a[0] == (ed^1)); + for (k = max_dist = 0; k < b->b.n; ++k) { + if(b->b.a[k]==st || b->b.a[k]==b->S.a[0]) continue; + max_dist += ref->seq[b->b.a[k]>>1].len; + } + max_dist += ref->seq[st>>1].len; + max_dist += ref->seq[b->S.a[0]>>1].len; + p = rd_hm_bub(g, ref, st, max_dist, b); + if(p) { + assert(b->S.a[0] == (ed^1)); + 
if(r_max_dist) (*r_max_dist) = max_dist; + return 1; + } + + ///drop inexact edges first + cuttoff = -1; + ncut = rd_hm_drop(g, ref, st, ed^1, cuttoff, 1, b); + p = rd_hm_bub(g, ref, st, max_dist, b); + if(p) { + assert(b->S.a[0] == (ed^1)); + if(r_max_dist) (*r_max_dist) = max_dist; + return 1; + } + + for (cuttoff = step; cuttoff < 1.0; cuttoff += step) { + ncut = rd_hm_drop(g, ref, st, ed^1, cuttoff, 0, b); + p = rd_hm_bub(g, ref, st, max_dist, b); + if(p) { + assert(b->S.a[0] == (ed^1)); + if(r_max_dist) (*r_max_dist) = max_dist; + return 1; + } + if(!ncut) break; + } + + rd_hm_drop(g, ref, st, ed^1, 1024, 0, b); + p = rd_hm_bub(g, ref, st, max_dist, b); + assert(p); assert(b->S.a[0] == (ed^1)); + if(r_max_dist) (*r_max_dist) = max_dist; + return 0; +} + void reduce_hamming_error_adv(ma_ug_t *iug, asg_t *sg, ma_hit_t_alloc* sources, ma_sub_t *coverage_cut, int max_hang, int min_ovlp, long long gap_fuzz, R_to_U *ru, bubble_type* bub) { double index_time = yak_realtime(); - ma_ug_t *ug = NULL; rd_hamming_t aux_t; memset((&aux_t), 0, sizeof(aux_t)); - ug = (iug)?(iug):(ma_ug_gen_primary(sg, PRIMARY_LABLE)); + ma_ug_t *ug = NULL; ug = (iug)?(iug):(ma_ug_gen_primary(sg, PRIMARY_LABLE)); uint8_t* vis_flag = NULL; CALLOC(vis_flag, sg->n_seq*2); uint32_t fix_bub = 0; asg_t *g = ug->g; uint32_t v, n_vtx = g->n_seq * 2, n_arc, n_arc_0 = sg->n_arc, nv, i; @@ -22059,8 +23090,13 @@ int max_hang, int min_ovlp, long long gap_fuzz, R_to_U *ru, bubble_type* bub) MALLOC(ig->seq, ig->n_seq); memcpy(ig->seq, sg->seq, (sizeof((*(ig->seq)))*ig->n_seq)); asg_cleanup(ig); asg_arc_del_trans_aux(ig, sg, vis_flag, gap_fuzz); - aux_t.n_thread = 1/**asm_opt.thread_num**/; CALLOC(aux_t.a, aux_t.n_thread); REALLOC(b.a, (ig->n_seq<<1)); memset(b.a, 0, sizeof((*(b.a)))*(ig->n_seq<<1)); + + for (i = 0; i < sv.n; i++) rd_hamming_symm_simple0(&b, sg, ig, sv.a[i]>>32, (uint32_t)(sv.a[i]), max_dist, NULL); + /** + rd_hamming_t aux_t; memset((&aux_t), 0, sizeof(aux_t)); + aux_t.n_thread = 1; // 
aux_t.n_thread = asm_opt.thread_num; + CALLOC(aux_t.a, aux_t.n_thread); for (i = 0; i < aux_t.n_thread; i++) aux_t.a[i].a = b.a; aux_t.g = ig; aux_t.ref = sg; aux_t.rr = &sv; aux_t.max_dist = max_dist; // print_debug_gfa(ug, sg, coverage_cut, "debug_hamming", sources, ru); @@ -22073,6 +23109,7 @@ int max_hang, int min_ovlp, long long gap_fuzz, R_to_U *ru, bubble_type* bub) free(aux_t.a[i].b.a); free(aux_t.a[i].e.a); } free(aux_t.a); + **/ } free(sv.a); free(vis_flag); @@ -22970,13 +24007,323 @@ uint64_t get_s_bub_pop_max_dist_advance(asg_t *g, buf_s_t *b) return mLen; } +void append_node_arcs(asg_t *des, asg_t *src, uint8_t *s, uint8_t se, uint32_t v) +{ + asg_arc_t *av, *za; uint32_t an, zn, k, n0, n1; + n0 = n1 = 0; + za = asg_arc_a(src, v); zn = asg_arc_n(src, v); + av = asg_arc_a(des, v); an = asg_arc_n(des, v); + ///set + for (k = 0; k < zn; k++) { + if(za[k].del) continue; + s[za[k].v] |= se; n0++; + } + + for (k = 0; k < an; k++) { + ///s[av[k].v]&se:: in the existing graph + if(s[av[k].v]&se) { + av[k].del = 0; n1++; + } + } + + ///reset + for (k = 0; k < zn; k++) { + if(za[k].del) continue; + if(s[za[k].v]&se) s[za[k].v] -= se; + } + if(!(n0 == n1)) { + fprintf(stderr, "[M::%s] n0::%u, n1::%u\n", __func__, n0, n1); + } + assert(n0 == n1); +} + + +static inline void asg_arc_rest(asg_t *des, asg_t *src, uint32_t v0, uint32_t w0, ma_ug_t *ug, kvec_asg_arc_t_warp *ae, ma_hit_t_alloc* src_e, ma_sub_t *cov, int32_t max_hang, int32_t min_ovlp, int32_t gap_fuzz, uint32_t *n_insert) +{ + uint32_t v, w, i, nv, rv, rw; asg_arc_t *av, *arc, t; + + v = v0; w = w0; + av = asg_arc_a(des, v); nv = asg_arc_n(des, v); + for (i = 0; i < nv; ++i) { + if (av[i].v == w) { + av[i].del = 0; break; + } + } + if(i < nv) { + v = w0^1; w = v0^1; + av = asg_arc_a(des, v); nv = asg_arc_n(des, v); + for (i = 0; i < nv; ++i) { + if (av[i].v == w) { + av[i].del = 0; break; + } + } + assert(i < nv); + return; + } + + ///replace a deleted arc + // fprintf(stderr, "[M::%s] replace\n", 
__func__); + v = v0; w = w0; + av = asg_arc_a(src, v); nv = asg_arc_n(src, v); + for (i = 0, arc = NULL; i < nv; ++i) { + if (av[i].v == w) { + av[i].del = 0; arc = &(av[i]); break; + } + } + assert(arc); + av = asg_arc_a(des, v); nv = asg_arc_n(des, v); + assert(nv); + for (i = 0; i < nv; ++i) assert(av[i].del); + for (i = 0; i < nv && av[i].ul < arc->ul; ++i); + if(i >= nv) i = nv - 1; av[i] = *arc; + + v = w0^1; w = v0^1; + av = asg_arc_a(src, v); nv = asg_arc_n(src, v); + for (i = 0, arc = NULL; i < nv; ++i) { + if (av[i].v == w) { + av[i].del = 0; arc = &(av[i]); break; + } + } + assert(arc); + av = asg_arc_a(des, v); nv = asg_arc_n(des, v); + assert(nv); + for (i = 0; i < nv; ++i) assert(av[i].del); + for (i = 0; i < nv && av[i].ul < arc->ul; ++i); + if(i >= nv) i = nv - 1; av[i] = *arc; + + rv = ((v0&1)?(ug->u.a[v0>>1].start^1):(ug->u.a[v0>>1].end^1)); + rw = ((w0&1)?(ug->u.a[w0>>1].end):(ug->u.a[w0>>1].start)); + assert(get_edge_from_source(src_e, cov, NULL, max_hang, min_ovlp, rv, rw, &t)); + kv_push(asg_arc_t, ae->a, t); + + assert(get_edge_from_source(src_e, cov, NULL, max_hang, min_ovlp, rw^1, rv^1, &t)); + kv_push(asg_arc_t, ae->a, t); + + (*n_insert) += 2; +} + +uint32_t bub_pop_merge(ma_ug_t *raw_ug, ma_ug_t *new_ug, uint32_t v0, uint32_t v1, uint64_t max_dist, buf_t *b, +uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, uint32_t is_update_chain, utg_trans_t *o, kvec_asg_arc_t_warp *ae, +ma_hit_t_alloc* src, ma_sub_t *sub, int32_t max_hang, int32_t min_ovlp, int32_t gap_fuzz, uint32_t *n_insert) +{ + ///do not pop bubble within new_ug; + uint32_t is_pop = asg_bub_pop1_primary_trio(new_ug->g, new_ug, v0, max_dist, b, positive_flag, negative_flag, 0, NULL, NULL, cov, is_update_chain, 0, o); + assert(is_pop); assert(b->S.a[0] == v1); + + ///b->S.a[0] is the sink of this bubble + uint32_t i, v, qn, tn, tmp_c, u; asg_arc_t *a; + asg_t *g = raw_ug->g; tmp_c = g->seq[b->S.a[0]>>1].c; + + ///assert(b->S.n == 1); + ///first remove all nodes in 
this bubble + for (i = 0; i < b->b.n; ++i) g->seq[b->b.a[i]>>1].c = ALTER_LABLE; + + + ///v is the sink of this bubble + v = b->S.a[0]; + ///recover node + do { + u = b->a[v].p; // u->v + /****************************may have hap bugs********************************/ + ////g->seq[v>>1].c = PRIMARY_LABLE; + g->seq[v>>1].c = HAP_LABLE; + /****************************may have hap bugs********************************/ + v = u; + } while (v != v0); + ///especially for unitig graph, don't label beg and sink node of a bubble as HAP_LABLE + ///since in unitig graph, a node may consist of a lot of reads + g->seq[b->S.a[0]>>1].c = tmp_c; + + ///remove all edges (self/reverse for each edge) in this bubble + for (i = 0; i < b->e.n; ++i) { + a = &(new_ug->g->arc[b->e.a[i]]);///note:: new_ug->g here + qn = a->ul>>33; + tn = a->v>>1; + if(g->seq[qn].c == ALTER_LABLE && g->seq[tn].c == ALTER_LABLE) continue; + ///remove this edge self + asg_arc_del(g, a->ul>>32, a->v, 1); + ///remove the reverse direction + asg_arc_del(g, a->v^1, a->ul>>32^1, 1); + } + + ///v is the sink of this bubble + v = b->S.a[0]; + ///recover node + do { + u = b->a[v].p; // u->v + g->seq[v>>1].del = 0; + asg_arc_rest(g, new_ug->g, u, v, raw_ug, ae, src, sub, max_hang, min_ovlp, gap_fuzz, n_insert); + v = u; + } while (v != v0); + return is_pop; +} + +uint64_t renew_phase_bubble(rd_hamming_fly_simp_t *pf, uint64_t v0, buf_t *b, ma_ug_t *ug, uint64_t max_dist, +uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, uint32_t is_update_chain) +{ + uint64_t v, k, i, v1 = b->S.a[0], is_update, is_pop = 0; ma_ug_t *fg = pf->fg; + uint8_t *s = pf->vs; asg32_v *bc = pf->srt; uint8_t sn = 1, se = 2; + bc->n = 0; kv_resize(uint32_t, (*bc), b->b.n); + assert((fg->u.a[v0>>1].len == ug->u.a[v0>>1].len) && (fg->u.a[v0>>1].n == ug->u.a[v0>>1].n)); + assert((fg->u.a[v1>>1].len == ug->u.a[v1>>1].len) && (fg->u.a[v1>>1].n == ug->u.a[v1>>1].n)); + ///b->S.a[0] is the sink of this bubble + for (i = 0; i 
< b->b.n; i++) { + v = b->b.a[i]; + if((v == v0) || (v == v1)) continue; + s[v] = sn; + kv_push(uint32_t, *bc, v); + assert((fg->u.a[v>>1].len == ug->u.a[v>>1].len) && (fg->u.a[v>>1].n == ug->u.a[v>>1].n)); + } + + asg_arc_t *av, *za, *ra; uint32_t an, zn, rn, ri; + v = v0; + av = asg_arc_a(fg->g, v); + an = asg_arc_n(fg->g, v); + for (k = 0; k < an; k++) av[k].del = 1; + fg->g->seq[v>>1].c = ug->g->seq[v>>1].c; + + + v = v1^1; + av = asg_arc_a(fg->g, v); + an = asg_arc_n(fg->g, v); + for (k = 0; k < an; k++) av[k].del = 1; + fg->g->seq[v>>1].c = ug->g->seq[v>>1].c; + + + for (i = 0; i < bc->n; i++) { + v = bc->a[i]; + av = asg_arc_a(fg->g, v); an = asg_arc_n(fg->g, v); + for (k = 0; k < an; k++) av[k].del = 1; + + v ^= 1; + av = asg_arc_a(fg->g, v); an = asg_arc_n(fg->g, v); + for (k = 0; k < an; k++) av[k].del = 1; + + fg->g->seq[v>>1].c = ug->g->seq[v>>1].c; + } + + + + + for (i = 0; i < bc->n; i++) { + v = bc->a[i]; + za = asg_arc_a(ug->g, v); zn = asg_arc_n(ug->g, v); + av = asg_arc_a(fg->g, v); an = asg_arc_n(fg->g, v); + + ///set + for (k = 0; k < zn; k++) { + if((za[k].del) || (!s[za[k].v])) continue; + s[za[k].v] |= se; + } + + + for (k = 0; k < an; k++) { + ///s[av[k].v]&se:: in the existing graph + if((!s[av[k].v]) || (s[av[k].v]&se)) continue; ///in the existing graph + if((av[k].v) == (v>>1)) continue; + av[k].del = 0; + ra = asg_arc_a(fg->g, (av[k].v^1)); rn = asg_arc_n(fg->g, (av[k].v^1)); + for (ri = 0; ri < rn; ri++) { + if(ra[ri].v == ((av[k].ul>>32)^1)) { + ra[ri].del = 0; break; + } + } + assert(ri < rn); + } + + ///reset + for (k = 0; k < zn; k++) { + if((za[k].del) || (!s[za[k].v])) continue; + if(s[za[k].v]&se) s[za[k].v] -= se; + } + } + + + is_update = rd_hamming_symm_simple0(b, ug->g, fg->g, v0, v1^1, max_dist, &max_dist); + // fprintf(stderr, "[M::%s] is_update::%lu\n", __func__, is_update); + if(is_update) { + for (i = 0; i < bc->n; i++) { + append_node_arcs(fg->g, ug->g, s, se, bc->a[i]); + append_node_arcs(fg->g, ug->g, s, se, 
bc->a[i]^1); + } + append_node_arcs(fg->g, ug->g, s, se, v0); + append_node_arcs(fg->g, ug->g, s, se, v1^1); + is_pop = bub_pop_merge(ug, fg, v0, v1, max_dist, b, positive_flag, negative_flag, cov, is_update_chain, o, pf->ae, pf->src, pf->cov, pf->max_hang, pf->min_ovlp, pf->gap_fuzz, &(pf->n_insert)); + } else { + is_pop = asg_bub_pop1_primary_trio(ug->g, ug, v0, max_dist, b, positive_flag, negative_flag, 1, NULL, NULL, cov, is_update_chain, 0, o); + } + + + + for (i = 0; i < bc->n; i++) s[bc->a[i]] = 0; + ///reset + v = v0; + av = asg_arc_a(fg->g, v); + an = asg_arc_n(fg->g, v); + for (k = 0; k < an; k++) av[k].del = 0; + fg->g->seq[v>>1].c = PRIMARY_LABLE; + + + v = v1^1; + av = asg_arc_a(fg->g, v); + an = asg_arc_n(fg->g, v); + for (k = 0; k < an; k++) av[k].del = 0; + fg->g->seq[v>>1].c = PRIMARY_LABLE; + + + for (i = 0; i < bc->n; i++) { + v = bc->a[i]; + av = asg_arc_a(fg->g, v); + an = asg_arc_n(fg->g, v); + for (k = 0; k < an; k++) av[k].del = 0; + + v ^= 1; + av = asg_arc_a(fg->g, v); + an = asg_arc_n(fg->g, v); + for (k = 0; k < an; k++) av[k].del = 0; + + fg->g->seq[v>>1].c = PRIMARY_LABLE; + } + return is_pop; +} + +uint64_t refine_bubble_popping(ma_ug_t *ug, buf_t *b, uint32_t v0, uint64_t max_dist, uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, uint32_t is_update_chain, rd_hamming_fly_simp_t *pf) +{ + // fprintf(stderr, "[M::%s]\n", __func__); + if(!asg_bub_pop1_primary_trio(ug->g, ug, v0, max_dist, b, positive_flag, negative_flag, 0, NULL, NULL, NULL, 0, 0, NULL)) return 0; + uint32_t non_positive_flag = (uint32_t)-1, v, u, k, rId, pn, npn; + if(positive_flag == FATHER) non_positive_flag = MOTHER; + if(positive_flag == MOTHER) non_positive_flag = FATHER; + ma_utg_t* p = NULL; + ///b->S.a[0] is the sink of this bubble + ///v is the sink of this bubble + v = b->S.a[0]; pn = npn = 0; + ///scan node + do { + u = b->a[v].p; // u->v + if(v != b->S.a[0]) { + p = &(ug->u.a[v>>1]); + for (k = 0; k < p->n; k++) { + rId = 
p->a[k]>>33; + if(R_INF.trio_flag[rId] == positive_flag) pn++; + if(R_INF.trio_flag[rId] == non_positive_flag) npn++; + } + } + v = u; + } while (v != v0); + // fprintf(stderr, "[M::%s] pn::%u, npn::%u\n", __func__, pn, npn); + ///debug + if((npn <= 0) || ((npn <= ((npn+pn)*0.05)) && (npn <= 64))) {///phasing is ok + return asg_bub_pop1_primary_trio(ug->g, ug, v0, max_dist, b, positive_flag, negative_flag, 1, NULL, NULL, cov, is_update_chain, 0, o); + } + return renew_phase_bubble(pf, v0, b, ug, max_dist, positive_flag, negative_flag, cov, o, is_update_chain); +} // pop bubbles -int asg_pop_bubble_primary_trio(ma_ug_t *ug, uint64_t* i_max_dist, uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, uint32_t is_update_chain) +int asg_pop_bubble_primary_trio(ma_ug_t *ug, uint64_t* i_max_dist, uint32_t positive_flag, uint32_t negative_flag, hap_cov_t *cov, utg_trans_t *o, uint32_t is_update_chain, rd_hamming_fly_simp_t *p) { - asg_t *g = ug->g; + asg_t *g = ug->g; uint64_t n_pop = 0, max_dist; uint32_t v, n_vtx = g->n_seq * 2, n_arc, nv, i; - uint64_t n_pop = 0, max_dist; asg_arc_t *av = NULL; buf_t b; if (!g->is_symm) asg_symm(g); @@ -23023,8 +24370,13 @@ int asg_pop_bubble_primary_trio(ma_ug_t *ug, uint64_t* i_max_dist, uint32_t posi ///some edges could be deleted for (i = n_arc = 0; i < nv; ++i) // asg_bub_pop1() may delete some edges/arcs if (!av[i].del) ++n_arc; - if (n_arc > 1) - n_pop += asg_bub_pop1_primary_trio(ug->g, ug, v, max_dist, &b, positive_flag, negative_flag, 1, NULL, NULL, cov, is_update_chain, 0, o); + if (n_arc > 1) { + if(p){ + n_pop += refine_bubble_popping(ug, &b, v, max_dist, positive_flag, negative_flag, cov, o, is_update_chain, p); + } else { + n_pop += asg_bub_pop1_primary_trio(ug->g, ug, v, max_dist, &b, positive_flag, negative_flag, 1, NULL, NULL, cov, is_update_chain, 0, o); + } + } } if(VERBOSE >= 1) @@ -28714,7 +30066,7 @@ R_to_U* ruIndex, int max_hang, int min_ovlp, const ug_opt_t *uopt) if(bubble_dist > 0) { 
- asg_pop_bubble_primary_trio(ug, &bubble_dist, (uint32_t)-1, DROP, NULL, NULL, 0); + asg_pop_bubble_primary_trio(ug, &bubble_dist, (uint32_t)-1, DROP, NULL, NULL, 0, NULL); delete_useless_nodes(&ug); renew_utg(&ug, sg, &new_rtg_edges); } @@ -35043,19 +36395,21 @@ ma_sub_t **coverage_cut_ptr, int debug_g) { if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; output_poly_trio(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, ruIndex, - 0.05, 0.9, max_hang_length, mini_overlap_length, 0, &b_mask_t, asm_opt.polyploidy); + 0.05, 0.9, max_hang_length, mini_overlap_length, gap_fuzz, 0, &b_mask_t, asm_opt.polyploidy); } else if (ha_opt_triobin(&asm_opt) && ha_opt_hic(&asm_opt)) { if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; benchmark_hic_graph(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), 0.15, 3, - ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, &b_mask_t); + ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, gap_fuzz, &b_mask_t); } else if (ha_opt_triobin(&asm_opt)) { if(asm_opt.flag & HA_F_PARTITION) asm_opt.flag -= HA_F_PARTITION; - output_trio_graph(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), - 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, 0, gap_fuzz, &uopt, &b_mask_t); + // output_trio_graph(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), + // 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, 0, gap_fuzz, &uopt, &b_mask_t); + output_trio_graph_joint(sg, coverage_cut, o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), + 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, gap_fuzz, &b_mask_t, NULL, NULL); } else if(ha_opt_hic(&asm_opt)) { @@ -35074,7 +36428,7 @@ ma_sub_t **coverage_cut_ptr, int debug_g) else if((asm_opt.flag & HA_F_PARTITION) && (asm_opt.purge_level_primary > 0)) { output_bp_graph(sg, coverage_cut, 
o_file, sources, reverse_sources, (asm_opt.max_short_tip*2), - 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, &b_mask_t/**, &uopt**/); + 0.15, 3, ruIndex, 0.05, 0.9, max_hang_length, mini_overlap_length, &b_mask_t, gap_fuzz, &uopt); } else { diff --git a/Additional_src/Modified_hifiasm/Overlaps.h b/Additional_src/Modified_hifiasm/Overlaps.h index 65d116e..955e528 100644 --- a/Additional_src/Modified_hifiasm/Overlaps.h +++ b/Additional_src/Modified_hifiasm/Overlaps.h @@ -1066,7 +1066,7 @@ ma_ug_t* copy_untig_graph(ma_ug_t *src); ma_ug_t* output_trio_unitig_graph(asg_t *sg, ma_sub_t* coverage_cut, char* output_file_name, uint8_t flag, ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, -float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int is_bench, bub_label_t* b_mask_t, char *f_prefix, uint8_t *kpt_buf, kvec_asg_arc_t_warp *r_edges); +float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, int gap_fuzz, int is_bench, bub_label_t* b_mask_t, char *f_prefix, uint8_t *kpt_buf, kvec_asg_arc_t_warp *r_edges); asg_t* copy_read_graph(asg_t *src); ma_ug_t *ma_ug_gen(asg_t *g); void ma_ug_destroy(ma_ug_t *ug); @@ -1135,7 +1135,7 @@ void adjust_utg_by_trio(ma_ug_t **ug, asg_t* read_g, uint8_t flag, float drop_ra ma_hit_t_alloc* sources, ma_hit_t_alloc* reverse_sources, ma_sub_t* coverage_cut, long long tipsLen, float tip_drop_ratio, long long stops_threshold, R_to_U* ruIndex, float chimeric_rate, float drop_ratio, int max_hang, int min_ovlp, -kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t); +int gap_fuzz, kvec_asg_arc_t_warp* new_rtg_edges, bub_label_t* b_mask_t); uint32_t cmp_untig_graph(ma_ug_t *src, ma_ug_t *dest); void reduce_hamming_error(asg_t *sg, ma_hit_t_alloc* sources, ma_sub_t *coverage_cut, int max_hang, int min_ovlp, long long gap_fuzz); @@ -1233,5 +1233,7 @@ void destory_ug_rid_cov_t(ug_rid_cov_t *p); uint32_t 
append_cov_line_ug_rid_cov_t(uint64_t uid, uint64_t *qcc, u_trans_t *p, ug_rid_cov_t *idx, uint64_t hom_cut, double cut_rate); uint64_t infer_mmhap_copy(ma_ug_t *ug, asg_t *sg, ma_hit_t_alloc *src, uint8_t *ff, uint64_t uid, uint64_t het_cov, uint64_t n_hap); uint64_t trans_sec_cut0(kv_u_trans_t *ta, asg64_v *srt, uint32_t id, double sec_rate, uint64_t bd, ma_ug_t *ug); +void clean_u_trans_t_idx_filter_mmhap_adv(kv_u_trans_t *ta, ma_ug_t *ug, asg_t *read_g, ma_hit_t_alloc* src, ug_rid_cov_t *in); +void gen_ug_rid_cov_t_by_ovlp(kv_u_trans_t *ta, ug_rid_cov_t *cc); #endif diff --git a/Additional_src/Modified_hifiasm/README.md b/Additional_src/Modified_hifiasm/README.md index c081c5a..7b3337d 100644 --- a/Additional_src/Modified_hifiasm/README.md +++ b/Additional_src/Modified_hifiasm/README.md @@ -22,6 +22,9 @@ hifiasm -o HG002.asm --h1 read1.fq.gz --h2 read2.fq.gz HG002-HiFi.fq.gz yak count -b37 -t16 -o pat.yak <(cat pat_1.fq.gz pat_2.fq.gz) <(cat pat_1.fq.gz pat_2.fq.gz) yak count -b37 -t16 -o mat.yak <(cat mat_1.fq.gz mat_2.fq.gz) <(cat mat_1.fq.gz mat_2.fq.gz) hifiasm -o HG002.asm -t32 -1 pat.yak -2 mat.yak HG002-HiFi.fa.gz + +# Single-sample telomere-to-telomere assembly with HiFi, ultralong and Hi-C reads +hifiasm -o HG002.asm --h1 read1.fq.gz --h2 read2.fq.gz --ul ul.fq.gz HG002-HiFi.fq.gz ``` See [tutorial][tutorial] for more details. @@ -43,15 +46,12 @@ See [tutorial][tutorial] for more details. ## Introduction -Hifiasm is a fast haplotype-resolved de novo assembler for PacBio HiFi reads. -It can assemble a human genome in several hours and assemble a ~30Gb California -redwood genome in a few days. Hifiasm emits partially phased assemblies of -quality competitive with the best assemblers. Given parental short reads or -Hi-C data, it produces arguably the best haplotype-resolved assemblies so far. +Hifiasm is a fast haplotype-resolved de novo assembler initially designed for PacBio HiFi reads. 
+Its latest release could support the telomere-to-telomere assembly by utilizing ultralong Oxford Nanopore reads. Hifiasm produces arguably the best single-sample telomere-to-telomere assemblies combining HiFi, ultralong and Hi-C reads, and it is one of the best haplotype-resolved assemblers for the trio-binning assembly given parental short reads. For a human genome, hifiasm can produce the telomere-to-telomere assembly in one day. ## Why Hifiasm? -* Hifiasm delivers high-quality assemblies. It tends to generate longer contigs +* Hifiasm delivers high-quality telomere-to-telomere assemblies. It tends to generate longer contigs and resolve more segmental duplications than other assemblers. * Given Hi-C reads or short reads from the parents, hifiasm can produce overall the best @@ -146,11 +146,18 @@ The second command line will run much faster than the first. ### Ultra-long ONT integration -Hifiasm could integrate ultra-long ONT reads to improve the assembly quality: +Hifiasm could integrate ultra-long ONT reads to produce the telomere-to-telomere assembly: ```sh hifiasm -o NA12878.asm -t32 --ul ul.fq.gz HiFi-reads.fq.gz ``` -Please note that this mode is not stable right now. We have only tested with >=100kb UL reads. 
+For the single-sample telomere-to-telomere assembly with Hi-C reads: +```sh +hifiasm -o NA12878.asm -t32 --ul ul.fq.gz --h1 read1.fq.gz --h2 read2.fq.gz HiFi-reads.fq.gz +``` +For the trio-binning telomere-to-telomere assembly: +```sh +hifiasm -o NA12878.asm -t32 --ul ul.fq.gz -1 pat.yak -2 mat.yak HiFi-reads.fq.gz +``` ### Output files diff --git a/Additional_src/Modified_hifiasm/gfa_ut.cpp b/Additional_src/Modified_hifiasm/gfa_ut.cpp index 89e2860..dbc04a2 100644 --- a/Additional_src/Modified_hifiasm/gfa_ut.cpp +++ b/Additional_src/Modified_hifiasm/gfa_ut.cpp @@ -7365,7 +7365,7 @@ void rebuid_idx(ul_resolve_t *uidx) init_ul_str_idx_t(uidx); } -void shrink_1b(ma_ug_t *ug, uc_block_t *z, uint32_t is_forward) +void shrink_1b(ma_ug_t *ug, uc_block_t *z, uc_block_t *lim, uint32_t is_forward) { if(z->ts != 0 || z->te != ug->g->seq[z->hid].len) return; uc_block_t bc = *z; @@ -7377,12 +7377,14 @@ void shrink_1b(ma_ug_t *ug, uc_block_t *z, uint32_t is_forward) } else { z->te -= 1; z->qs += off; } + if((lim) && (!((z->qs <= lim->qs) && (z->qe <= lim->qe)))) *z = bc; } else { if((!z->rev)) { z->te -= 1; z->qe -= off; } else { z->ts += 1; z->qe -= off; } + if((lim) && (!((z->qs >= lim->qs) && (z->qe >= lim->qe)))) *z = bc; } if((ugl_cover_check(bc.ts, bc.te, &(ug->u.a[bc.hid]))) && (!ugl_cover_check(z->ts, z->te, &(ug->u.a[z->hid])))) { @@ -7436,7 +7438,7 @@ void renew_ul_vec_t(ul_vec_t *x, ma_ug_t *ug) void shrink_ul0(all_ul_t *uls, ul_str_t *str, uint64_t id, integer_t *buf, ma_ug_t *ug) { - uint32_t k, c_k, p_k, cv, pv, bl, i; uc_block_t *xi; buf->u.n = 0; nid_t *np = NULL; + uint32_t k, c_k, p_k, cv, pv, bl, i; uc_block_t *xi, *yi; buf->u.n = 0; nid_t *np = NULL; asg_arc_t *av; uint32_t nv, s, e, m, d, mm, is_conn; uint64_t *z; ul_vec_t *x; if(str->cn < 2) return; for (k = 0, bl = 0, c_k = p_k = pv = (uint32_t)-1; k < str->cn; k++) { @@ -7464,6 +7466,11 @@ void shrink_ul0(all_ul_t *uls, ul_str_t *str, uint64_t id, integer_t *buf, ma_ug is_conn = 1; } } + if(is_conn) { + 
is_conn = 0; assert(k); + yi = &(uls->a[id].bb.a[str->a[k-1]>>32]); + if((xi->qs >= yi->qs) && (xi->qe >= yi->qe)) is_conn = 1; + } if(is_conn) { bl++; } else { @@ -7514,10 +7521,9 @@ void shrink_ul0(all_ul_t *uls, ul_str_t *str, uint64_t id, integer_t *buf, ma_ug x->bb.n = m; assert(x->bb.n > 1); - - - shrink_1b(ug, &(x->bb.a[0]), 1); - shrink_1b(ug, &(x->bb.a[x->bb.n-1]), 0); + shrink_1b(ug, &(x->bb.a[0]), ((x->bb.n>=2)?&(x->bb.a[1]):(NULL)), 1); + shrink_1b(ug, &(x->bb.a[x->bb.n-1]), ((x->bb.n>=2)?&(x->bb.a[x->bb.n-2]):(NULL)), 0); + d = x->bb.a[0].qs; for (k = 0; k < x->bb.n; k++) { x->bb.a[k].qs -= d; x->bb.a[k].qe -= d; @@ -7546,8 +7552,8 @@ void shrink_ul0(all_ul_t *uls, ul_str_t *str, uint64_t id, integer_t *buf, ma_ug } x->bb.n = m; assert(x->bb.n > 1); - shrink_1b(ug, &(x->bb.a[0]), 1); - shrink_1b(ug, &(x->bb.a[x->bb.n-1]), 0); + shrink_1b(ug, &(x->bb.a[0]), ((x->bb.n>=2)?&(x->bb.a[1]):(NULL)), 1); + shrink_1b(ug, &(x->bb.a[x->bb.n-1]), ((x->bb.n>=2)?&(x->bb.a[x->bb.n-2]):(NULL)), 0); d = x->bb.a[0].qs; for (k = 0; k < x->bb.n; k++) { x->bb.a[k].qs -= d; x->bb.a[k].qe -= d; @@ -15686,6 +15692,9 @@ void u2g_hybrid_clean(ul_resolve_t *uidx, ulg_opt_t *ulopt, usg_t *ng, asg64_v * // prt_usg_t(uidx, ng, sb); // usg_arc_cut_length(ng, b, ub, mm_tip>>1, drop, ulopt->is_trio, 1, NULL); usg_bub_clean(ng, &bb, b, ub, mm_tip>>1, drop, 1, bs, f); + // fprintf(stderr, "-1bub-[M::%s::] i::%ld, drop::%f\n", __func__, i, drop); + // sprintf(sb, "ng_ss::%ld_i::%ld_drop::%f_b::bub", ss, i, drop); + // prt_usg_t(uidx, ng, sb); usg_arc_cut_srt_length(ng, b, ub, mm_tip>>1, drop, ulopt->is_trio, 1, NULL, bs); // fprintf(stderr, "-1-[M::%s::] i::%ld, drop::%f\n", __func__, i, drop); // sprintf(sb, "ng_ss::%ld_i::%ld_drop::%f_b", ss, i, drop); @@ -16580,7 +16589,7 @@ ma_ug_t* output_trio_unitig_graph_ul(ug_opt_t *uopt, ul_resolve_t *uidx, char* o adjust_utg_by_trio(&ug, uidx->sg, flag, TRIO_THRES, uopt->sources, uopt->reverse_sources, uopt->coverage_cut, uopt->tipsLen, 
uopt->tip_drop_ratio, uopt->stops_threshold, uopt->ruIndex, - uopt->chimeric_rate, uopt->drop_ratio, uopt->max_hang, uopt->min_ovlp, &ne, uopt->b_mask_t); + uopt->chimeric_rate, uopt->drop_ratio, uopt->max_hang, uopt->min_ovlp, uopt->gap_fuzz, &ne, uopt->b_mask_t); // if(asm_opt.b_low_cov > 0) { // break_ug_contig(&ug, uidx->sg, &R_INF, uopt->coverage_cut, uopt->sources, uopt->ruIndex, &ne, diff --git a/Additional_src/Modified_hifiasm/horder.cpp b/Additional_src/Modified_hifiasm/horder.cpp index f5faa44..7a6353f 100644 --- a/Additional_src/Modified_hifiasm/horder.cpp +++ b/Additional_src/Modified_hifiasm/horder.cpp @@ -781,7 +781,7 @@ ma_ug_t* get_trio_unitig_graph(asg_t *sg, uint8_t flag, ug_opt_t *opt) adjust_utg_by_trio(&ug, sg, flag, TRIO_THRES, opt->sources, opt->reverse_sources, opt->coverage_cut, opt->tipsLen, opt->tip_drop_ratio, opt->stops_threshold, opt->ruIndex, opt->chimeric_rate, opt->drop_ratio, opt->max_hang, opt->min_ovlp, - &new_rtg_edges, opt->b_mask_t); + opt->gap_fuzz, &new_rtg_edges, opt->b_mask_t); kv_destroy(new_rtg_edges.a); return ug; diff --git a/Additional_src/Modified_hifiasm/inter.cpp b/Additional_src/Modified_hifiasm/inter.cpp index 2cf8218..b093f6a 100644 --- a/Additional_src/Modified_hifiasm/inter.cpp +++ b/Additional_src/Modified_hifiasm/inter.cpp @@ -10434,6 +10434,202 @@ static void worker_for_trans_ovlp_mmhap_adv(void *data, long i, int tid) // call s->free_cnt[tid]++; } +uint32_t tranfor_ovlp(u_trans_t *qovlp, u_trans_t *tovlp, asg_t *g, ul_ov_t *res, uint32_t adjust_rev) +{ + int64_t os, oe, s_shift, e_shift, tt, qs, qe, ts, te; + os = MAX(qovlp->ts, tovlp->qs); + oe = MIN(qovlp->te, tovlp->qe); + if(oe <= os) return 0; + + ///[os, oe) -> qovlp->t* + s_shift = get_offset_adjust(os-qovlp->ts, qovlp->te-qovlp->ts, qovlp->qe-qovlp->qs); + e_shift = get_offset_adjust(qovlp->te-oe, qovlp->te-qovlp->ts, qovlp->qe-qovlp->qs); + if(qovlp->rev) { + tt = s_shift; s_shift = e_shift; e_shift = tt; + } + qs = qovlp->qs+s_shift; qe = 
((int64_t)qovlp->qe)-e_shift; + if(qs >= qe) return 0; + + ///[os, oe) -> tovlp->q* + s_shift = get_offset_adjust(os-tovlp->qs, tovlp->qe-tovlp->qs, tovlp->te-tovlp->ts); + e_shift = get_offset_adjust(tovlp->qe-oe, tovlp->qe-tovlp->qs, tovlp->te-tovlp->ts); + if(tovlp->rev) { + tt = s_shift; s_shift = e_shift; e_shift = tt; + } + ts = tovlp->ts+s_shift; te = ((int64_t)tovlp->te)-e_shift; + if(ts >= te) return 0; + + memset(res, 0, sizeof(*res)); + res->qn = qovlp->qn; res->qs = qs; res->qe = qe; + res->tn = tovlp->tn; res->ts = ts; res->te = te; + res->rev = ((qovlp->rev == tovlp->rev)?0:1); + if(adjust_rev && res->rev) {///for linear chaining + res->ts = g->seq[res->tn].len - te; + res->te = g->seq[res->tn].len - ts; + } + return 1; +} + +uint32_t rescue_adject_ovlp(asg_t *g, uint32_t id, kv_u_trans_t *ta, kv_ul_ov_t *out, st_mt_t *buf) +{ + u_trans_t *a, *b; ul_ov_t rr; uint64_t a_n, b_n, k, l, i, z, m; + a = u_trans_a((*ta), id); a_n = u_trans_n((*ta), id); + for (i = out->n = buf->n = 0; i < a_n; i++) { + b = u_trans_a((*ta), a[i].tn); b_n = u_trans_n((*ta), a[i].tn); + z = a[i].tn; z <<= 32; kv_push(uint64_t, *buf, z); + for (k = 0; k < b_n; k++) { + if(b[k].tn == id) continue; + if(!tranfor_ovlp(&(a[i]), &(b[k]), g, &rr, 1)) continue; + z = rr.tn; z <<= 32; z |= out->n; z |= ((uint64_t)0x80000000); + rr.tn <<= 1; rr.tn |= rr.rev; kv_push(ul_ov_t, *out, rr); + } + } + if(out->n == 0) return 1; + + radix_sort_gfa64(buf->a, buf->a + buf->n); + for (k = 1, l = m = 0; k <= buf->n; k++) { + if(k == buf->n || (buf->a[l]>>32)!=(buf->a[k]>>32)) { + if((k - l > 1) && (!(buf->a[l]&((uint64_t)0x80000000)))) {///overlap within bck + for (z = l; z < k; z++) { + if(buf->a[z]&((uint64_t)0x80000000)) { + out->a[(uint32_t)(buf->a[z]-((uint64_t)0x80000000))].tn = (uint32_t)-1; + m++; + } + } + } + l = k; + } + } + + if(m) { + for (k = m = 0; k < out->n; k++) { + if(out->a[k].tn == (uint32_t)-1) continue; + out->a[m++] = out->a[k]; + } + out->n = m; + } + if(out->n == 0) return 
1; + + radix_sort_ul_ov_srt_tn(out->a, out->a+out->n); + for (k = 0; k < out->n; k++) out->a[k].tn >>= 1; + + return 0; +} + +/** +uint64_t gen_trans_chain_mmhap(ug_trans_t *s, uint64_t rid, ha_ovec_buf_t *b, kv_ul_ov_t *bl, char *seq, uint64_t len, +double err, double bw) +{ + uint64_t cnt = ((s->idx_n.a[rid+1]-s->idx_n.a[rid])), ol_h = 0, pass_aln = 0; + uint32_t high_occ = asm_opt.polyploidy + 1; overlap_region *aux_o = NULL; + ///note: high_occ is different + ug_map_lchain(b->abl, rid, seq, len, s->w, s->k, &(s->udb), &b->olist, &b->clist, bw, bw, + s->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2, 3, + s->is_HPC, s->idx_a.a + s->idx_n.a[rid], cnt, s->srt_a.a, s->srt_a.n, s->mini_cut, s->chain_cut, NULL); + // if(rid == 57) { + // fprintf(stderr, "-1-[M::%s] utg%.6lu%c, rid::%ld, b->olist->length::%lu\n", + // __func__, rid+1, "lc"[s->ug->u.a[rid].circ], rid, b->olist.length); + // } + ///remove candidate chains that have been calculated + if(!fi) backward_dedup_ol(rid, bl, &(b->sp), &b->olist);///it is ok + // if(rid == 57) { + // fprintf(stderr, "-2-[M::%s] utg%.6lu%c, rid::%ld, b->olist->length::%lu\n", + // __func__, rid+1, "lc"[s->ug->u.a[rid].circ], rid, b->olist.length); + // } + filter_by_reliable_ovlp_mmhap_adv(rid, s->filter, &(b->sp), &b->olist, &(s->udb), s->sec_cutoff, 1, 1, s->ccov, &ol_h); + clear_Cigar_record(&b->cigar1); clear_Round2_alignment(&b->round2); + if(!fi) ol_h = 0; + + // if(rid == 57) { + // fprintf(stderr, "-3-[M::%s] utg%.6lu%c, rid::%ld, b->olist->length::%lu\n", + // __func__, rid+1, "lc"[s->ug->u.a[rid].circ], rid, b->olist.length); + // } + + ol_h = split_ug_lalign(ol_h, &b->olist, err_high, err_low, + &b->clist, &(s->udb), s->uopt, seq, len, &b->self_read, &b->ovlp_read, + &b->correct, &b->exz, aux_o, rid, s->k, s->chain_cut, NULL); + + // if(rid == 57) { + // fprintf(stderr, "-4-[M::%s] utg%.6lu%c, rid::%ld, b->olist->length::%lu\n", + // __func__, rid+1, "lc"[s->ug->u.a[rid].circ], rid, 
b->olist.length); + // } + + aux_o = gen_aux_ovlp(&b->olist);///must be here + + // if(rid == 57) { + // fprintf(stderr, "-5-[M::%s] utg%.6lu%c, rid::%ld, b->olist->length::%lu\n", + // __func__, rid+1, "lc"[s->ug->u.a[rid].circ], rid, b->olist.length); + // } + + ol_h = split_ug_lalign(ol_h, &b->olist, err_high, err_low, + &b->clist, &(s->udb), s->uopt, seq, len, &b->self_read, &b->ovlp_read, + &b->correct, &b->exz, aux_o, rid, s->k, s->chain_cut, NULL); + + // if(rid == 57) { + // fprintf(stderr, "-6-[M::%s] utg%.6lu%c, rid::%ld, b->olist->length::%lu\n", + // __func__, rid+1, "lc"[s->ug->u.a[rid].circ], rid, b->olist.length); + // } + + if(fi) {///first round + pass_aln = test_het_aln_mmhap(rid, s->ccov, u_trans_a((*(s->filter)), rid), u_trans_n((*(s->filter)), rid), &b->olist, &(b->sp)); + push_ul_ov_t(&(s->udb), u_trans_a((*(s->filter)), rid), u_trans_n((*(s->filter)), rid), rid, &(b->sp), &b->olist, len, pass_aln, err_high, bl); + // fprintf(stderr, "-1-[M::%s] utg%.6lu%c, rid::%lu, pass_aln::%lu\n", + // __func__, rid+1, "lc"[s->ug->u.a[rid].circ], rid, pass_aln); + } else {///second round + push_ul_ov_t(&(s->udb), NULL, 0, rid, &(b->sp), &b->olist, len, 0, err_high, bl); + remove_trans_ovlp_connect(s->udb.ug, rid, bl); + } + return pass_aln; +} +**/ + + + +static void worker_for_trans_chain_mmhap_adv(void *data, long i, int tid) // callback for kt_for() +{ + ug_trans_t *s = (ug_trans_t*)data; + ha_ovec_buf_t *b = s->hab[tid]; kv_ul_ov_t *bl = &(s->ll[tid].tk); + uint32_t high_occ = asm_opt.polyploidy + 1; uint64_t cnt; + char *seq = s->ug->u.a[i].s; int64_t len = s->ug->u.a[i].len; + if((!s->is_ovlp) && (s->is_cnt)) s->idx_n.a[i] = 0; + if(s->ug->g->seq[i].del) return; + if(is_mmhom_node(s->ccov->cov.a+s->ccov->idx[i], &(s->ug->u.a[i]), s->ccov->rg, s->ccov->hom_min, 0.9)) return; + // asprintf(&as, "\n[M::%s] rid::%ld, len::%lu, name::%.*s\n", __func__, s->id+i, s->len[i], (int32_t)UL_INF.nid.a[s->id+i].n, UL_INF.nid.a[s->id+i].a); + // 
push_vlog(&(overall_zdbg->a[s->id+i]), as); free(as); as = NULL; + // if(rescue_adject_ovlp(s->ug->g, i, s->filter, &(s->ll[tid].lo))) return; + + // gen_trans_chain_mmhap(s, i, b, bl, seq, len, 0.8, 0.8); + + if(!s->is_ovlp) { + if(s->is_cnt) { + s->idx_n.a[i] = ug_map_lchain(b->abl, i, seq, len, s->w, s->k, &(s->udb), NULL, NULL, s->bw_thres, s->bw_thres_double, + s->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2, 3, s->is_HPC, NULL, 0, NULL, 0, s->mini_cut, s->chain_cut, NULL); + } else { + cnt = ug_map_lchain(b->abl, i, seq, len, s->w, s->k, &(s->udb), NULL, NULL, s->bw_thres, s->bw_thres_double, + s->max_n_chain, 1, NULL, &(b->tmp_region), NULL, &(b->sp), &high_occ, NULL, 0, 1, 0.2, 3, s->is_HPC, s->idx_a.a + s->idx_n.a[i], 0, NULL, 0, s->mini_cut, s->chain_cut, NULL); + assert(cnt == ((s->idx_n.a[i+1]-s->idx_n.a[i]))); + } + if(s->free_cnt[tid] >= FREE_BATCH) { + clear_count_buf(s, tid, 1); s->free_cnt[tid] = 0; + } + s->free_cnt[tid]++; + return; + } + + // if(i == 58) { + // fprintf(stderr, "\n-1-[M::%s] utg%.6u%c, rid::%ld, is_ovlp::%d, is_cnt::%d, len::%ld, str::%u\n", + // __func__, (uint32_t)i+1, "lc"[s->ug->u.a[i].circ], i, s->is_ovlp, s->is_cnt, len, (uint32_t)(!!seq)); + // } + + if(!gen_trans_adaptive_mmhap_aln(s, i, b, bl, seq, len, s->filter, s->diff_ec_ul, s->diff_ec_ul_double, s->bw_thres, s->bw_thres_double)) { + gen_trans_adaptive_mmhap_aln(s, i, b, bl, seq, len, NULL, s->diff_ec_ul_double, s->diff_ec_ul_double, s->bw_thres_double, s->bw_thres_double); + } + if(s->free_cnt[tid] >= FREE_BATCH) { + clear_count_buf(s, tid, 0); s->free_cnt[tid] = 0; + } + s->free_cnt[tid]++; +} + int64_t retrieve_cigar_err_dir(bit_extz_t *ez, int64_t s, int64_t e, int64_t *xk, int64_t *ck, int64_t is_back) { ///[ez->ts, ez->te]/[ez->qs, ez->qe]/[s, e) @@ -20178,6 +20374,145 @@ void gen_trans_base_count_comp(ug_trans_t *p, kv_u_trans_t *res) fprintf(stderr, "[M::%s::%.3f] ==> Qualification\n", __func__, 
yak_realtime()-index_time); } +void clean_trans_base_count_mmhap_comp_rmap(ug_trans_t *p, kv_u_trans_t *res) +{ + uint64_t i, k, l, occ, idx_n; ha_mzl_t *tz; u_trans_t *z; + kv_ul_ov_t *bl; double ww; ha_mzl_t *idx; + ///make results consistent + kv_resize(ha_mzl_t, p->srt_a, p->srt_a.n+p->ug->u.n); + idx = p->srt_a.a + p->srt_a.n; idx_n = p->ug->u.n; + for (i = 0; i < idx_n; i++) { + tz = &(idx[i]); + tz->x = (uint64_t)-1; tz->rev = 0; + tz->pos = tz->rid = tz->span = 0; + } + + for (i = 0, occ = res->n; (int64_t)i < p->n_thread; i++) { + bl = &(p->ll[i].tk); + if(!(bl->n)) continue; + for (k = 1, l = 0; k <= bl->n; k++) { + if(k == bl->n || bl->a[k].qn != bl->a[l].qn) { + if(k > l) { + tz = &(idx[bl->a[l].qn]); + tz->x = bl->a[l].qn; tz->x <<= 32; tz->x |= i; + tz->rid = l>>32; tz->pos = (uint32_t)l; tz->rev = 1; + occ += (k - l); + } + l = k; + } + } + } + + kv_resize(u_trans_t, *res, occ); + for (i = 0; i < idx_n; i++) { + tz = &(idx[i]); + if(!(tz->rev)) continue; + bl = &(p->ll[(uint32_t)(tz->x)].tk); + k = tz->rid; k <<= 32; k += tz->pos; + assert(bl->a[k].qn == (tz->x>>32)); + for (; (k < bl->n) && (bl->a[k].qn == (tz->x>>32)); k++) { + if(bl->a[k].qn == bl->a[k].tn) continue; + ww = cal_trans_ov_w(&(bl->a[k])); + if(ww <= 0) continue; + + kv_pushp(u_trans_t, *res, &z); + z->f = RC_3; z->rev = bl->a[k].rev; z->del = 0; + z->qn = bl->a[k].qn; z->qs = bl->a[k].qs; z->qe = bl->a[k].qe; + z->tn = bl->a[k].tn; z->ts = bl->a[k].ts; z->te = bl->a[k].te; + z->nw = ww; + } + } + destory_ug_rid_cov_t(p->ccov); free(p->ccov); + p->ccov = gen_ug_rid_cov_t(p->ug, p->rg, p->uopt->sources); + + clean_u_trans_t_idx_filter_mmhap_adv(res, p->ug, p->rg, p->uopt->sources, p->ccov); + gen_ug_rid_cov_t_by_ovlp(res, p->ccov); +} + +void gen_trans_base_count_mmhap_comp_rmap(ug_trans_t *p, kv_u_trans_t *res) +{ + double index_time = yak_realtime(); + uint64_t i, k, l, occ, m, cc; + p->ccov = gen_ug_rid_cov_t(p->ug, p->rg, p->uopt->sources); + clean_u_trans_t_idx_adv(res, p->ug, 
p->rg); p->filter = res; + + p->is_cnt = 1; p->is_ovlp = 0; + memset(p->free_cnt, 0, sizeof((*(p->free_cnt)))*p->n_thread); + kt_for(p->n_thread, worker_for_trans_ovlp_mmhap_adv, p, p->ug->u.n); + for (i = l = 0; i < p->ug->u.n; i++) { + occ = p->idx_n.a[i]; p->idx_n.a[i] = l; l += occ; + } + + p->idx_n.a[i] = l; + p->idx_a.n = p->idx_a.m = l; MALLOC(p->idx_a.a, p->idx_a.n); + p->is_cnt = 0; p->is_ovlp = 0; + memset(p->free_cnt, 0, sizeof((*(p->free_cnt)))*p->n_thread); + kt_for(p->n_thread, worker_for_trans_ovlp_mmhap_adv, p, p->ug->u.n); + p->srt_a.n = p->srt_a.m = p->idx_a.n; MALLOC(p->srt_a.a, p->srt_a.n); + + for (i = 0; i < p->srt_a.n; i++) { + p->srt_a.a[i] = p->idx_a.a[i]; + p->srt_a.a[i].pos = (uint32_t)i; + p->srt_a.a[i].rid = i>>32; + } + radix_sort_ha_mzl_t_srt(p->srt_a.a, p->srt_a.a + p->srt_a.n); + kvec_t(uint64_t) cut; kv_init(cut); + for (k = 1, l = 0; k <= p->srt_a.n; k++) { + if(k == p->srt_a.n || p->srt_a.a[l].x != p->srt_a.a[k].x) { + for (i = l; i < k; i++) { + m = p->srt_a.a[i].rid; m <<= 32; m |= p->srt_a.a[i].pos; + assert(p->srt_a.a[i].x == p->idx_a.a[m].x); + p->srt_a.a[i] = p->idx_a.a[m]; p->idx_a.a[m].x = i; + } + kv_push(uint64_t, cut, (k - l)); + l = k; + } + } + + if(cut.n > 0) { + radix_sort_gfa64(cut.a, cut.a + cut.n); + m = cut.n * 0.0002; cc = cut.a[cut.n-1] + 1; + if(m > 0 && m <= cut.n) cc = cut.a[cut.n-m] + 1; + if(cc < (uint64_t)p->mini_cut) p->mini_cut = cc; + } + kv_destroy(cut); + + p->is_cnt = 0; p->is_ovlp = 1; + memset(p->free_cnt, 0, sizeof((*(p->free_cnt)))*p->n_thread); + kt_for(p->n_thread, worker_for_trans_ovlp_mmhap_adv, p, p->ug->u.n); + + clean_trans_base_count_mmhap_comp_rmap(p, res); + + for (i = 0; (int64_t)i < p->n_thread; i++) p->ll[i].tk.n = 0; + + p->is_cnt = 0; p->is_ovlp = 1; + memset(p->free_cnt, 0, sizeof((*(p->free_cnt)))*p->n_thread); + kt_for(p->n_thread, worker_for_trans_chain_mmhap_adv, p, p->ug->u.n); + + + + + + for (i = 0; (int64_t)i < p->n_thread; i++) { + ha_ovec_destroy(p->hab[i]); + 
free(p->ll[i].lo.a); free(p->ll[i].srt.a.a); free(p->ll[i].tc.a); + } + free(p->idx_a.a); free(p->idx_n.a); free(p->hab); free(p->free_cnt); + destory_ug_rid_cov_t(p->ccov); free(p->ccov); + + + + + + + + + + + for (i = 0; (int64_t)i < p->n_thread; i++) free(p->ll[i].tk.a); + free(p->srt_a.a); free(p->ll); + fprintf(stderr, "[M::%s::%.3f] ==> Qualification\n", __func__, yak_realtime()-index_time); +} void gen_trans_base_count_mmhap_comp(ug_trans_t *p, kv_u_trans_t *res) { diff --git a/Additional_src/calculate_N50.py b/Additional_src/calculate_N50.py new file mode 100644 index 0000000..6e98261 --- /dev/null +++ b/Additional_src/calculate_N50.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python3 +# coding=utf-8 + +""" +Скрипт считает N50 последовательностей в формате FASTA. Выдаёт он только одно число - N50. + +Пример: +python3 calculate_N50.py assembly.fasta +""" + +import sys +import os +import re + +s_path_to_the_input_file = sys.argv[1] + +l_lengths_of_sequences = [] #список длин последовательностей. +n_sum_of_lengths_of_all_sequences = 0 #сумма длин всех последовательностей. + + +f_infile = open(s_path_to_the_input_file, "r") + +s_current_sequence = "" #последовательность, которую скрипт в данный момент рассматривает. + +for s_line in f_infile: + #если это заголовок последовательности, то скрипт записывает длину прошлой последовательности в список l_lengths_of_sequences и обнуляет значение переменной s_current_sequence + if re.search("^>", s_line): + if s_current_sequence != "": #если прошлой последовательности не было, то, скорее всего, это потому что я сейчас смотрю на первую последовательность в файле. 
Тогда прошлую последовательность не надо добавлять в список l_lengths_of_sequences + n_sequence_length = len(s_current_sequence) + l_lengths_of_sequences.append(n_sequence_length) + n_sum_of_lengths_of_all_sequences += n_sequence_length + s_current_sequence = "" + else: + s_current_sequence += re.sub(r"\s", r"", s_line) #убираю пробельные символы, включая символы переноса строки. + +#Добавляю длину последней последовательности +n_sequence_length = len(s_current_sequence) +l_lengths_of_sequences.append(n_sequence_length) +n_sum_of_lengths_of_all_sequences += n_sequence_length + +#сортирую массив длин последовательностей в обратном порядке. +l_lengths_of_sequences_sorted_backwards = sorted(l_lengths_of_sequences, reverse=True) + +#считаю N50. +n_current_sum_of_lengths = 0 +for n_sequence_length in l_lengths_of_sequences_sorted_backwards: + n_current_sum_of_lengths += n_sequence_length + if n_current_sum_of_lengths >= n_sum_of_lengths_of_all_sequences/2: + print(str(n_sequence_length)) + sys.exit() + + + + + + + + + + diff --git a/calculate_AG.py b/calculate_AG.py index 003d86f..1c97d86 100755 --- a/calculate_AG.py +++ b/calculate_AG.py @@ -19,6 +19,7 @@ import sys import os import re +import time import datetime import urllib.request import statistics @@ -88,7 +89,7 @@ s_number_of_busco_orthogroups_to_use = "1000" #сколько ортогрупп BUSCO использовать. Это строка, содержащая или число, или слово "all", если нужно использовать все. Если пользователь укажет больше, чем есть в используемой базе данных BUSCO, то calculate_AG всё равно будет использовать все. s_maximum_allowed_intron_length = "from_BUSCO" #максимальная разрешённая длина интрона. По умолчанию, используется значение из файла dataset.cfg датасета BUSCO. Переменная начинается с "s_", потому что это строка. Ниже будет ещё переменная n_maximum_allowed_intron_length, которая число. - s_version_of_calculate_AG = "2.19" #версия этой программы. Всегда равна версии Mabs. 
Поскольку эта программа нужна, в первую очередь, для Mabs, то когда я увеличиваю номер версии Mabs, то увеличивается и номер версии calculate_AG, и наоборот. + s_version_of_calculate_AG = "2.24" #версия этой программы. Всегда равна версии Mabs. Поскольку эта программа нужна, в первую очередь, для Mabs, то когда я увеличиваю номер версии Mabs, то увеличивается и номер версии calculate_AG, и наоборот. l_errors_in_command_line = [] #список ошибок в командной строке. Если пользователь совершил много ошибок, то calculate_AG напишет про них все, а не только про первую встреченную. @@ -208,7 +209,7 @@ s_path_to_a_local_busco_dataset = s_path_to_the_output_folder + "/" + s_busco_dataset_name_online #путь к месту, где будет лежать скачанный архивированный gzip файл с датасетом BUSCO. - #проверяю, доступен ли адрес http://mikeshelk.site/Data/BUSCO_datasets/Latest/. Он может быть недоступен из-за каких-то проблем с сервером. Если не доступен, то рекомендую пользователю скачать базу с http://busco-data.ezlab.org/v5/data/lineages/ и использовать опцию --local_busco_dataset. Проверку делаю примерно как написано на https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python . А если доступен, то делаю ещё одну проверку — на то, есть ли нужный файл в папке http://mikeshelk.site/Data/BUSCO_datasets/Latest/ + #проверяю, доступен ли адрес http://mikeshelk.site/Data/BUSCO_datasets/Latest/. Он может быть недоступен из-за каких-то проблем с сервером. Если не доступен, то пробую ещё два раза с интервалом в 5 секунд. Если адрес так и не станет доступным, то рекомендую пользователю скачать базу с http://busco-data.ezlab.org/v5/data/lineages/ и использовать опцию --local_busco_dataset. Проверку делаю примерно как написано на https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python . 
А если доступен, то делаю ещё одну проверку — на то, есть ли нужный файл в папке http://mikeshelk.site/Data/BUSCO_datasets/Latest/ try: s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() @@ -219,7 +220,28 @@ l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") except: - l_errors_in_command_line.append("http://mikeshelk.site/Data/BUSCO_datasets/Latest/ is not accessible. Please, download a BUSCO dataset from http://busco-data.ezlab.org/v5/data/lineages/ and use \"--local_busco_dataset\" instead of \"--download_busco_dataset\".") + time.sleep(5) + try: + s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() + #проверяю, доступен ли нужный файл, и если доступен, то качаю его. + try: + urllib.request.urlretrieve("http://mikeshelk.site/Data/BUSCO_datasets/Latest/" + s_busco_dataset_name_online, s_path_to_a_local_busco_dataset) + except: + l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") + + except: + time.sleep(5) + try: + s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() + #проверяю, доступен ли нужный файл, и если доступен, то качаю его. + try: + urllib.request.urlretrieve("http://mikeshelk.site/Data/BUSCO_datasets/Latest/" + s_busco_dataset_name_online, s_path_to_a_local_busco_dataset) + + except: + l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") + + except: + l_errors_in_command_line.append("http://mikeshelk.site/Data/BUSCO_datasets/Latest/ is not accessible. 
Please, download a BUSCO dataset from http://busco-data.ezlab.org/v5/data/lineages/ and use \"--local_busco_dataset\" instead of \"--download_busco_dataset\".") #если пользователь использовал --local_busco_dataset o_regular_expression_results = re.search(r" --local_busco_dataset (\S+)", s_command_line_reduced) @@ -332,13 +354,13 @@ ################################ #Со входными параметрами разобрался. Теперь, собственно, делаю работу. - f_logs = open(s_path_to_the_output_folder + "/logs.txt","w",buffering=1) + f_log = open(s_path_to_the_output_folder + "/log.txt","w",buffering=1) o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Started calculate_AG\n\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("Started calculate_AG\n\n") - f_logs.write("You have run calculate_AG of version " + s_version_of_calculate_AG + " with the following command: " + s_command_line + "\n\n") + f_log.write("You have run calculate_AG of version " + s_version_of_calculate_AG + " with the following command: " + s_command_line + "\n\n") #сделаю специальный файл, в который в конце будет записана только строка вроде "AG is 1023". f_AG_calculation_results = open(s_path_to_the_output_folder + "/AG.txt", "w") @@ -353,7 +375,7 @@ s_path_to_a_local_busco_dataset = s_path_to_the_output_folder + "/" + s_busco_dataset_name #Оставляю из базы BUSCO только нужное количество (s_number_of_busco_orthogroups_to_use) ортогрупп — тех, которые имеют наиболее консервативные последовательности. Если пользователь указал использовать все ортогруппы, то calculate_AG использует все. Если пользователь указал больше ортогрупп, чем есть в этом наборе BUSCO, то calculate_AG использует все и пишет Warning в основной файл с логами. 
- mabs_function_preprocess_busco_dataset.function_preprocess_busco_dataset(s_path_to_a_local_busco_dataset, s_number_of_busco_orthogroups_to_use, s_path_to_the_output_folder, f_logs) + mabs_function_preprocess_busco_dataset.function_preprocess_busco_dataset(s_path_to_a_local_busco_dataset, s_number_of_busco_orthogroups_to_use, s_path_to_the_output_folder, f_log) s_path_to_a_BUSCO_folder = s_path_to_the_output_folder + "/BUSCO_dataset_to_use/" @@ -366,7 +388,7 @@ #если файл с контигами пустой, то сразу останавливаю выполнение calculate_AG, считая что AG=0. Иначе Metaeuk выдаст ошибку (если я правильно помню). n_size_of_the_file_with_contigs = os.stat(s_path_to_the_assembly).st_size if n_size_of_the_file_with_contigs == 0: - f_logs.write("AG is 0") + f_log.write("AG is 0") f_AG_calculation_results.write("AG is 0") sys.exit() @@ -425,7 +447,7 @@ #если MetaEuk вообще не выдал результатов, то считаю, что AG=0. if not os.path.exists(s_path_to_the_output_folder + "/MetaEuk_results.fas"): - f_logs.write("AG is 0. Number of genes in single-copy orthogroups is 0. Number of genes in true multicopy orthogroups is 0. Number of genes in false multicopy orthogroups is 0.\n") + f_log.write("AG is 0. Number of genes in single-copy orthogroups is 0. Number of genes in true multicopy orthogroups is 0. Number of genes in false multicopy orthogroups is 0.\n") f_AG_calculation_results.write("AG is 0") sys.exit() @@ -436,7 +458,7 @@ s_path_to_the_file_with_BUSCO_scores_cutoff = s_path_to_a_BUSCO_folder + "/scores_cutoff" s_path_to_the_file_with_BUSCO_lengths_cutoff = s_path_to_a_BUSCO_folder + "/lengths_cutoff" - #f_logs = open("logs.txt", "w", buffering = 1) + #f_log = open("log.txt", "w", buffering = 1) #делаю словарь, в котором ключ это название ортогруппы, вроде 54443at71240, а значение это bit score cutoff, вроде 302.75. d_orthogroup_title_to_bit_score_cutoff = {} @@ -583,7 +605,7 @@ n_first_exon_coordinate += 1 #прибавляю единицу, потому что Metaeuk выдаёт координаты zero-based. 
#удаляю упоминание об этом экзоне из строки, чтобы можно было начать рассматривать новый. - #f_logs.write("analyzed exon " + o_regular_expression_results_2.group(0) + "\n") + #f_log.write("analyzed exon " + o_regular_expression_results_2.group(0) + "\n") s_exon_information_with_masked_metacharacters = re.escape(o_regular_expression_results_2.group(0)) #s_exon_information_with_masked_metacharacters это как o_regular_expression_results_2.group(0) , но все метасимволы замаскированы. Нужно, чтобы правильно прошло удаление этой подстроки из s_string_with_exons с помощью re.sub s_string_with_exons = re.sub(s_exon_information_with_masked_metacharacters, "", s_string_with_exons) @@ -714,77 +736,81 @@ s_orthogroup_title = l_line_split[3] n_bit_score = float(l_line_split[7]) - if s_orthogroup_title not in dl_orthogroup_title_to_the_list_of_targets_I_have_already_seen_in_this_file: - dl_orthogroup_title_to_the_list_of_targets_I_have_already_seen_in_this_file[s_orthogroup_title] = [] + #Иногда бывает такое, что ген, найденный MetaEuk для одной ортогруппы BUSCO, имеет также матчи к марковским профилями других ортогрупп. Благодаря следующей строке я учитываю только матчи к профилю той же ортогруппы, по белку которой этот ген был найден. 
+ + if re.search(r"^" + s_orthogroup_title, s_target_name): - if s_target_name not in dl_orthogroup_title_to_the_list_of_targets_I_have_already_seen_in_this_file[s_orthogroup_title]: - dl_orthogroup_title_to_the_list_of_targets_I_have_already_seen_in_this_file[s_orthogroup_title].append(s_target_name) + if s_orthogroup_title not in dl_orthogroup_title_to_the_list_of_targets_I_have_already_seen_in_this_file: + dl_orthogroup_title_to_the_list_of_targets_I_have_already_seen_in_this_file[s_orthogroup_title] = [] - if n_bit_score >= d_orthogroup_title_to_bit_score_cutoff[s_orthogroup_title]: - n_z_value = (n_target_length - d_orthogroup_title_to_the_average_BUSCO_protein_length[s_orthogroup_title]) / d_orthogroup_title_to_the_standard_deviation_of_BUSCO_protein_lengths[s_orthogroup_title] + if s_target_name not in dl_orthogroup_title_to_the_list_of_targets_I_have_already_seen_in_this_file[s_orthogroup_title]: + dl_orthogroup_title_to_the_list_of_targets_I_have_already_seen_in_this_file[s_orthogroup_title].append(s_target_name) - if n_z_value >= -2: - o_regular_expression_results = re.search(r"^(.*?)\|([^\|]+)\|(\+|\-)\|.*?\|.*?\|.*?\|(\d+)\|(\d+)\|(.+)", s_target_name) - - if o_regular_expression_results: - s_orthogroup_title = o_regular_expression_results.group(1) - s_contig_title = o_regular_expression_results.group(2) - s_chain = o_regular_expression_results.group(3) #цепь, на которой лежит ген. "+" или "-". - n_leftmost_coordinate_of_the_gene = int(o_regular_expression_results.group(4)) + 1 #прибавляю единицу, потому что Metaeuk выдаёт координаты zero-based. 
- n_rightmost_coordinate_of_the_gene = int(o_regular_expression_results.group(5)) - s_string_with_exons = o_regular_expression_results.group(6) - - s_gene_description = s_contig_title + ":" + str(n_leftmost_coordinate_of_the_gene) + "-" + str(n_rightmost_coordinate_of_the_gene) - - if s_orthogroup_title not in dl_orthogroup_title_to_the_list_of_its_genes: - dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title] = [] - dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title].append(s_gene_description) - - #f_logs.write("Started to analyze the coverage in exons of " + s_gene_description + "\n") - - if s_gene_description not in dl_gene_description_to_the_list_of_coverages_in_its_exons: - dl_gene_description_to_the_list_of_coverages_in_its_exons[s_gene_description] = [] - - #иду по всем координатам экзонов. Координаты экзона содержатся в подстроке вида 20872[20872]:20765[20765]:108[108] - while re.search(r"\d+\[(\d+)\]\:\d+\[(\d+)\]\:\d+\[\d+\]", s_string_with_exons): - o_regular_expression_results_2 = re.search(r"\d+\[(\d+)\]\:\d+\[(\d+)\]\:\d+\[\d+\]", s_string_with_exons) - n_first_exon_coordinate = int(o_regular_expression_results_2.group(1)) - n_second_exon_coordinate = int(o_regular_expression_results_2.group(2)) + if n_bit_score >= d_orthogroup_title_to_bit_score_cutoff[s_orthogroup_title]: + n_z_value = (n_target_length - d_orthogroup_title_to_the_average_BUSCO_protein_length[s_orthogroup_title]) / d_orthogroup_title_to_the_standard_deviation_of_BUSCO_protein_lengths[s_orthogroup_title] + + if n_z_value >= -2: + o_regular_expression_results = re.search(r"^(.*?)\|([^\|]+)\|(\+|\-)\|.*?\|.*?\|.*?\|(\d+)\|(\d+)\|(.+)", s_target_name) + + if o_regular_expression_results: + s_orthogroup_title = o_regular_expression_results.group(1) + s_contig_title = o_regular_expression_results.group(2) + s_chain = o_regular_expression_results.group(3) #цепь, на которой лежит ген. "+" или "-". 
+ n_leftmost_coordinate_of_the_gene = int(o_regular_expression_results.group(4)) + 1 #прибавляю единицу, потому что Metaeuk выдаёт координаты zero-based. + n_rightmost_coordinate_of_the_gene = int(o_regular_expression_results.group(5)) + s_string_with_exons = o_regular_expression_results.group(6) - #если ген обратно-комплементарный, то первой координатой была записана бОльшая. Делаю первой координатой меньшую. - if n_first_exon_coordinate > n_second_exon_coordinate: - n_temp = n_second_exon_coordinate - n_second_exon_coordinate = n_first_exon_coordinate - n_first_exon_coordinate = n_temp + s_gene_description = s_contig_title + ":" + str(n_leftmost_coordinate_of_the_gene) + "-" + str(n_rightmost_coordinate_of_the_gene) - n_first_exon_coordinate += 1 #прибавляю единицу, потому что Metaeuk выдаёт координаты zero-based. + if s_orthogroup_title not in dl_orthogroup_title_to_the_list_of_its_genes: + dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title] = [] + dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title].append(s_gene_description) - l_coverages_in_this_exon = [] #список покрытий в этом экзоне + #f_log.write("Started to analyze the coverage in exons of " + s_gene_description + "\n") - for n_position in range(n_first_exon_coordinate, n_second_exon_coordinate + 1): + if s_gene_description not in dl_gene_description_to_the_list_of_coverages_in_its_exons: + dl_gene_description_to_the_list_of_coverages_in_its_exons[s_gene_description] = [] + + #иду по всем координатам экзонов. 
Координаты экзона содержатся в подстроке вида 20872[20872]:20765[20765]:108[108] + while re.search(r"\d+\[(\d+)\]\:\d+\[(\d+)\]\:\d+\[\d+\]", s_string_with_exons): + o_regular_expression_results_2 = re.search(r"\d+\[(\d+)\]\:\d+\[(\d+)\]\:\d+\[\d+\]", s_string_with_exons) + n_first_exon_coordinate = int(o_regular_expression_results_2.group(1)) + n_second_exon_coordinate = int(o_regular_expression_results_2.group(2)) - #проверяю, нет ли такого, что в этом контиге не было вообще ни одной покрытой позиции. Тогда dd_contig_title_and_position_to_coverage будет неинициализирован для s_contig_title. В таком случае я считаю покрытие в этой позиции равным нулю. - if s_contig_title not in dd_contig_title_and_position_to_coverage: - n_coverage = 0 - else: - #поскольку в двойной словарь dd_contig_title_and_position_to_coverage я записывал покрытие только тех позиций, покрытие которых было ненулевым, то сейчас нужно проверить, есть ли эта позиция в этом двойном словаре. - if n_position in dd_contig_title_and_position_to_coverage[s_contig_title]: - n_coverage = dd_contig_title_and_position_to_coverage[s_contig_title][n_position] - else: + #если ген обратно-комплементарный, то первой координатой была записана бОльшая. Делаю первой координатой меньшую. + if n_first_exon_coordinate > n_second_exon_coordinate: + n_temp = n_second_exon_coordinate + n_second_exon_coordinate = n_first_exon_coordinate + n_first_exon_coordinate = n_temp + + n_first_exon_coordinate += 1 #прибавляю единицу, потому что Metaeuk выдаёт координаты zero-based. + + l_coverages_in_this_exon = [] #список покрытий в этом экзоне + + for n_position in range(n_first_exon_coordinate, n_second_exon_coordinate + 1): + + #проверяю, нет ли такого, что в этом контиге не было вообще ни одной покрытой позиции. Тогда dd_contig_title_and_position_to_coverage будет неинициализирован для s_contig_title. В таком случае я считаю покрытие в этой позиции равным нулю. 
+ if s_contig_title not in dd_contig_title_and_position_to_coverage: n_coverage = 0 + else: + #поскольку в двойной словарь dd_contig_title_and_position_to_coverage я записывал покрытие только тех позиций, покрытие которых было ненулевым, то сейчас нужно проверить, есть ли эта позиция в этом двойном словаре. + if n_position in dd_contig_title_and_position_to_coverage[s_contig_title]: + n_coverage = dd_contig_title_and_position_to_coverage[s_contig_title][n_position] + else: + n_coverage = 0 + + l_coverages_in_this_exon.append(n_coverage) - l_coverages_in_this_exon.append(n_coverage) - - #добавляю список покрытий этого экзона к словарю списков, который содержит списки покрытий для каждого гена - dl_gene_description_to_the_list_of_coverages_in_its_exons[s_gene_description] += l_coverages_in_this_exon - #удаляю упоминание об этом экзоне из строки, чтобы можно было начать рассматривать новый. - #f_logs.write("analyzed exon " + o_regular_expression_results_2.group(0) + "\n") - s_exon_information_with_masked_metacharacters = re.escape(o_regular_expression_results_2.group(0)) #s_exon_information_with_masked_metacharacters это как o_regular_expression_results_2.group(0) , но все метасимволы замаскированы. Нужно, чтобы правильно прошло удаление этой подстроки из s_string_with_exons с помощью re.sub - s_string_with_exons = re.sub(s_exon_information_with_masked_metacharacters, "", s_string_with_exons) - - #если ген присутствует, но фрагментирован, то я информацию о нём никак не использую. - else: - pass + #добавляю список покрытий этого экзона к словарю списков, который содержит списки покрытий для каждого гена + dl_gene_description_to_the_list_of_coverages_in_its_exons[s_gene_description] += l_coverages_in_this_exon + #удаляю упоминание об этом экзоне из строки, чтобы можно было начать рассматривать новый. 
+ #f_log.write("analyzed exon " + o_regular_expression_results_2.group(0) + "\n") + s_exon_information_with_masked_metacharacters = re.escape(o_regular_expression_results_2.group(0)) #s_exon_information_with_masked_metacharacters это как o_regular_expression_results_2.group(0) , но все метасимволы замаскированы. Нужно, чтобы правильно прошло удаление этой подстроки из s_string_with_exons с помощью re.sub + s_string_with_exons = re.sub(s_exon_information_with_masked_metacharacters, "", s_string_with_exons) + + #если ген присутствует, но фрагментирован, то я информацию о нём никак не использую. + else: + pass f_infile.close() @@ -814,7 +840,7 @@ l_coverages_in_exons_of_genes += dl_gene_description_to_the_list_of_coverages_in_its_exons[s_gene_description] - f_logs.write("For the gene " + s_gene_description + " from a single-copy orthogroup " + s_orthogroup_title + ", the median coverage is " + str(d_gene_description_to_the_median_coverage_in_its_exons[s_gene_description]) + ". It was calculated using " + str(len(dl_gene_description_to_the_list_of_coverages_in_its_exons[s_gene_description])) + " positions.\n") + f_log.write("For the gene " + s_gene_description + " from a single-copy orthogroup " + s_orthogroup_title + ", the median coverage is " + str(d_gene_description_to_the_median_coverage_in_its_exons[s_gene_description]) + ". It was calculated using " + str(len(dl_gene_description_to_the_list_of_coverages_in_its_exons[s_gene_description])) + " positions.\n") #если для ортогруппы найдено больше одного гена. if len(dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title]) > 1: @@ -823,16 +849,16 @@ l_coverages_in_exons_of_genes += dl_gene_description_to_the_list_of_coverages_in_its_exons[s_gene_description] - f_logs.write("For the gene " + s_gene_description + " from a multicopy orthogroup " + s_orthogroup_title + ", the median coverage is " + str(d_gene_description_to_the_median_coverage_in_its_exons[s_gene_description]) + ". 
It was calculated using " + str(len(dl_gene_description_to_the_list_of_coverages_in_its_exons[s_gene_description])) + " positions.\n") + f_log.write("For the gene " + s_gene_description + " from a multicopy orthogroup " + s_orthogroup_title + ", the median coverage is " + str(d_gene_description_to_the_median_coverage_in_its_exons[s_gene_description]) + ". It was calculated using " + str(len(dl_gene_description_to_the_list_of_coverages_in_its_exons[s_gene_description])) + " positions.\n") #если ни одного однокопийного гена найдено не было (крайне маловероятно, но, в принципе, такое может быть), но многокопийные гены были, то, для простоты, считаю медианным покрытием однокопийных генов медианное покрытие по всем многокопийным генам, делённое пополам. if len(l_coverages_in_exons_of_single_copy_genes) == 0: n_median_coverage_of_exons_of_single_copy_genes = statistics.median(l_coverages_in_exons_of_genes) / 2 - f_logs.write("\nWarning! No single-copy orthogroups were found. Hence, as the approximate coverage of genes in single-copy orthogroups I take half the median coverage by positions of genes from multicopy, which is " + str(n_median_coverage_of_exons_of_single_copy_genes) + "\n\n") + f_log.write("\nWarning! No single-copy orthogroups were found. Hence, as the approximate coverage of genes in single-copy orthogroups I take half the median coverage by positions of genes from multicopy, which is " + str(n_median_coverage_of_exons_of_single_copy_genes) + "\n\n") #если хотя бы один однокопийный ген был. else: n_median_coverage_of_exons_of_single_copy_genes = statistics.median(l_coverages_in_exons_of_single_copy_genes) - f_logs.write("\nThe median coverage in exons of genes from single-copy BUSCO orthogroups is " + str(n_median_coverage_of_exons_of_single_copy_genes) + ". 
It was calculated using " + str(len(l_coverages_in_exons_of_single_copy_genes)) + " positions.\n\n") + f_log.write("\nThe median coverage in exons of genes from single-copy BUSCO orthogroups is " + str(n_median_coverage_of_exons_of_single_copy_genes) + ". It was calculated using " + str(len(l_coverages_in_exons_of_single_copy_genes)) + " positions.\n\n") #теперь иду по всем ортогруппам и для каждой многокопийной ортогруппы считаю среднее покрытие в ней. Если оно < 0.75*(медианное покрытие в однокопийных генах), то считаю, что это ложно многокопийных ортогруппа. А если >=0.75*(медианное покрытие в однокопийных генах), то считаю, что истинно многокопийная. Параллельно, считаю количество генов в истинно многокопийных ортогруппах и ложно многокопийных ортогруппах. n_number_of_true_multicopy_genes = 0 @@ -846,33 +872,33 @@ n_mean_coverage_of_genes_in_this_orthogroup += d_gene_description_to_the_median_coverage_in_its_exons[s_gene_description] / len(dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title]) if n_mean_coverage_of_genes_in_this_orthogroup < 0.75 * n_median_coverage_of_exons_of_single_copy_genes: - f_logs.write("The mean coverage of genes from a multicopy BUSCO orthogroup " + s_orthogroup_title + " which contains " + str(len(dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title])) + " genes is " + str(round(n_mean_coverage_of_genes_in_this_orthogroup, 1)) + ". It is smaller than " + str(round(0.75 * n_median_coverage_of_exons_of_single_copy_genes, 1)) + ", hence this orthogroup is considered a false multicopy.\n") + f_log.write("The mean coverage of genes from a multicopy orthogroup " + s_orthogroup_title + " which contains " + str(len(dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title])) + " genes is " + str(round(n_mean_coverage_of_genes_in_this_orthogroup, 1)) + ". 
It is smaller than " + str(round(0.75 * n_median_coverage_of_exons_of_single_copy_genes, 1)) + ", hence this orthogroup is considered a false multicopy.\n") n_number_of_false_multicopy_genes += len(dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title]) #в принципе, это условие и следующее можно объединить в одно (">="). Но для удобства чтения логов я разделю случай ">" и случай "=". Впрочем, думаю, случай "=" будет встречаться крайне редко. elif n_mean_coverage_of_genes_in_this_orthogroup == 0.75 * n_median_coverage_of_exons_of_single_copy_genes: - f_logs.write("The mean coverage of genes from a multicopy BUSCO orthogroup " + s_orthogroup_title + " which contains " + str(len(dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title])) + " genes is " + str(round(n_mean_coverage_of_genes_in_this_orthogroup, 1)) + ". It is equal to 0.75 * (median_coverage_in_exons_of_single_copy_genes), hence this orthogroup is considered a true multicopy.\n") + f_log.write("The mean coverage of genes from a multicopy orthogroup " + s_orthogroup_title + " which contains " + str(len(dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title])) + " genes is " + str(round(n_mean_coverage_of_genes_in_this_orthogroup, 1)) + ". It is equal to 0.75 * (median_coverage_in_exons_of_single_copy_genes), hence this orthogroup is considered a true multicopy.\n") n_number_of_true_multicopy_genes += len(dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title]) elif n_mean_coverage_of_genes_in_this_orthogroup > 0.75 * n_median_coverage_of_exons_of_single_copy_genes: - f_logs.write("The mean coverage of genes from a multicopy BUSCO orthogroup " + s_orthogroup_title + " which contains " + str(len(dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title])) + " genes is " + str(round(n_mean_coverage_of_genes_in_this_orthogroup, 1)) + ". 
It is larger than " + str(round(0.75 * n_median_coverage_of_exons_of_single_copy_genes, 1)) + ", hence this orthogroup is considered a true multicopy.\n") + f_log.write("The mean coverage of genes from a multicopy orthogroup " + s_orthogroup_title + " which contains " + str(len(dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title])) + " genes is " + str(round(n_mean_coverage_of_genes_in_this_orthogroup, 1)) + ". It is larger than " + str(round(0.75 * n_median_coverage_of_exons_of_single_copy_genes, 1)) + ", hence this orthogroup is considered a true multicopy.\n") n_number_of_true_multicopy_genes += len(dl_orthogroup_title_to_the_list_of_its_genes[s_orthogroup_title]) n_AG = n_number_of_single_copy_genes_found_in_the_assembly + n_number_of_true_multicopy_genes else: - f_logs.write("AG is 0. Number of genes in single-copy orthogroups is 0. Number of genes in true multicopy orthogroups is 0. Number of genes in false multicopy orthogroups is 0.\n") + f_log.write("AG is 0. Number of genes in single-copy orthogroups is 0. Number of genes in true multicopy orthogroups is 0. Number of genes in false multicopy orthogroups is 0.\n") f_AG_calculation_results.write("AG is 0") sys.exit() - f_logs.write("AG is " + str(n_AG) + ". Number of genes in single-copy orthogroups is " + str(n_number_of_single_copy_genes_found_in_the_assembly) + ". Number of genes in true multicopy orthogroups is " + str(n_number_of_true_multicopy_genes) + ". Number of genes in false multicopy orthogroups is " + str(n_number_of_false_multicopy_genes) +".\n") + f_log.write("AG is " + str(n_AG) + ". Number of genes in single-copy orthogroups is " + str(n_number_of_single_copy_genes_found_in_the_assembly) + ". Number of genes in true multicopy orthogroups is " + str(n_number_of_true_multicopy_genes) + ". 
Number of genes in false multicopy orthogroups is " + str(n_number_of_false_multicopy_genes) +".\n") f_AG_calculation_results.write("AG is " + str(n_AG)) - f_logs.close + f_log.close() #Строю синаплот с покрытием генов. - os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/Additional/plot_gene_coverage_distribution.py " + s_path_to_the_output_folder + "/logs.txt 2.5 auto " + s_path_to_the_output_folder + "/gene_coverage_distribution") + os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/Additional/plot_gene_coverage_distribution.py " + s_path_to_the_output_folder + "/log.txt 2.5 auto " + s_path_to_the_output_folder + "/gene_coverage_distribution") diff --git a/install.sh b/install.sh index 85cad71..52ceda6 100755 --- a/install.sh +++ b/install.sh @@ -69,7 +69,6 @@ cd ./Additional/HMMER chmod 755 ./configure ./configure make -make check cd ../.. #MetaEuk is pre-compiled, I just change permissions. The pre-compiled version is for SSE4.1. Actually, there are MetaEuk versions for newer CPUs, but since MetaEuk is not a time-limiting step of Mabs, I don't provide them. @@ -96,14 +95,5 @@ cd ../.. #Proovframe is written in Perl, it does not require to be compiled. I just change permissions. The source code of Proovframe was slightly modified by me — mostly for Proovframe to be able to find DIAMOND provided with Mabs. chmod 755 ./Additional/Proovframe/bin/* -#Upgrading pip to the latest version. This is necessary because the installation of Plotnine (see below) may fail with very old versions of pip. -python3 -m pip install pip --user --upgrade --no-warn-script-location - -#Installing the Python module Pandas. -python3 -m pip install --user --no-warn-script-location Pandas - -#Installing the Python module Plotnine. Mabs may not work with old (approximately prior to 2019) versions of Plotnine, thus Plotnine is upgraded to the latest version. 
-python3 -m pip install --upgrade --user --no-warn-script-location Plotnine - #Making mabs-hifiasm.py, mabs-flye.py and calculate_AG.py executable chmod 755 mabs-hifiasm.py mabs-flye.py calculate_AG.py \ No newline at end of file diff --git a/mabs-flye.py b/mabs-flye.py index 96fa354..be6a7d9 100755 --- a/mabs-flye.py +++ b/mabs-flye.py @@ -19,12 +19,17 @@ import sys import os import re +import time import datetime import urllib.request #import ssl import math import shutil import subprocess +import gzip +import statistics +import scipy +import scipy.optimize from Additional import mabs_function_preprocess_busco_dataset @@ -92,7 +97,6 @@ if not os.path.isdir(s_path_to_the_folder_where_Mabs_lies + "/Test_datasets"): l_unavailable_files_and_folders.append("The subfolder \"Test_datasets\" should be in the folder where Mabs lies.") - #делаю парсинг аргументов командной строки. Можно было бы использовать argparse, но когда я делаю это без библиотек, то больше возможностей для того, чтобы сделать интерфейс таким, какой мне нравится. s_command_line = " ".join(sys.argv) #команда, которой запущен Mabs-flye, в одну строку. @@ -109,10 +113,11 @@ s_genome_size_estimate = "auto" #оценка размера генома. s_number_of_busco_orthogroups_to_use = "1000" #сколько ортогрупп BUSCO использовать. Это строка, содержащая или число, или слово "all", если нужно использовать все. Если пользователь укажет больше, чем есть в используемой базе данных BUSCO, то Mabs-flye всё равно будет использовать все. + n_maximum_number_of_points_to_try = 10 #максимальное количество точек, которые Mabs-flye должен пробовать в процессе оптимизации методом Нелдера-Мида. s_maximum_allowed_intron_length = "from_BUSCO" #максимальная разрешённая длина интрона. По умолчанию, используется значение из файла dataset.cfg датасета BUSCO. s_additional_flye_parameters = "" #дополнительные параметры Flye. - s_Mabs_version = "2.19" + s_Mabs_version = "2.24" l_errors_in_command_line = [] #список ошибок в командной строке. 
Если пользователь совершил много ошибок, то Mabs-flye напишет про них все, а не только про первую встреченную. @@ -131,10 +136,11 @@ 5) --threads Number of CPU threads to be used by Mabs-flye. The default value is 10. 6) --output_folder Output folder for Mabs-flye results. The default is "Mabs_results". 7) --number_of_busco_orthogroups How many BUSCO orthogroups should Mabs-flye use. Should be either a positive integral value or "all" to use all orthogroups. The default value is 1000. -8) --genome_size Haploid genome size. Should be either "auto" for automatic estimation, or a number ending with "k", "m" or "g". For example, 1.5g means 1.5 gigabases. The default value is "auto". -9) --max_intron_length Maximum allowed length of an intron. Should be either "from_BUSCO" to use a value from a BUSCO dataset, or a number, possibly ending with "k", "m" or "g". For example, 20k means 20 kilobases. The default is "from_BUSCO". Change --max_intron_length if you assemble a genome with unusually long introns. -10) --local_busco_dataset Path to a local BUSCO dataset, manually pre-downloaded from http://mikeshelk.site/Data/BUSCO_datasets/Latest/ or http://busco-data.ezlab.org/v5/data/lineages/. Example: "--local_busco_dataset /home/test/Data/primates_odb10.2021-02-19.tar.gz". May be a .tar.gz file or a decompressed folder. This option is mutually exclusive with "--download_busco_dataset". -11) --additional_flye_parameters A string with additional parameters to be passed to Flye, enclosed in square brackets. Example: "--additional_flye_parameters [--scaffold --min-overlap 20000]". +8) --maximum_number_of_points_to_try The maximum number of combinations of Flye parameters to be tried by Mabs-flye using the Nelder-Mead algorithm. The default value is 10. Increasing the value of this parameter will increase the computation time but may increase the accuracy of the results. +9) --genome_size Haploid genome size. 
Should be either "auto" for automatic estimation, or a number ending with "k", "m" or "g". For example, 1.5g means 1.5 gigabases. The default value is "auto". +10) --max_intron_length Maximum allowed length of an intron. Should be either "from_BUSCO" to use a value from a BUSCO dataset, or a number, possibly ending with "k", "m" or "g". For example, 20k means 20 kilobases. The default is "from_BUSCO". Change --max_intron_length if you assemble a genome with unusually long introns. +11) --local_busco_dataset Path to a local BUSCO dataset, manually pre-downloaded from http://mikeshelk.site/Data/BUSCO_datasets/Latest/ or http://busco-data.ezlab.org/v5/data/lineages/. Example: "--local_busco_dataset /home/test/Data/primates_odb10.2021-02-19.tar.gz". May be a .tar.gz file or a decompressed folder. This option is mutually exclusive with "--download_busco_dataset". +12) --additional_flye_parameters A string with additional parameters to be passed to Flye, enclosed in square brackets. Example: "--additional_flye_parameters [--scaffold --min-overlap 20000]". Informational options: 12) --help Print this help. @@ -186,7 +192,7 @@ s_path_to_a_local_busco_dataset = s_path_to_the_output_folder + "/" + s_busco_dataset_name_online #путь к месту, где будет лежать скачанный архивированный gzip файл с датасетом BUSCO. - #проверяю, доступен ли адрес http://mikeshelk.site/Data/BUSCO_datasets/Latest/. Он может быть недоступен из-за каких-то проблем с сервером. Если не доступен, то рекомендую пользователю скачать базу с http://busco-data.ezlab.org/v5/data/lineages/ и использовать опцию --local_busco_dataset. Проверку делаю примерно как написано на https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python . А если доступен, то делаю ещё одну проверку — на то, есть ли нужный файл в папке http://mikeshelk.site/Data/BUSCO_datasets/Latest/ + #проверяю, доступен ли адрес http://mikeshelk.site/Data/BUSCO_datasets/Latest/. 
Он может быть недоступен из-за каких-то проблем с сервером. Если не доступен, то пробую ещё два раза с интервалом в 5 секунд. Если адрес так и не станет доступным, то рекомендую пользователю скачать базу с http://busco-data.ezlab.org/v5/data/lineages/ и использовать опцию --local_busco_dataset. Проверку делаю примерно как написано на https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python . А если доступен, то делаю ещё одну проверку — на то, есть ли нужный файл в папке http://mikeshelk.site/Data/BUSCO_datasets/Latest/ try: s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() @@ -197,7 +203,28 @@ l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") except: - l_errors_in_command_line.append("Unfortunately, http://mikeshelk.site/Data/BUSCO_datasets/Latest/ is currently not accessible. To test Mabs-flye, download the file http://busco-data.ezlab.org/v5/data/lineages/saccharomycetes_odb10.2020-08-05.tar.gz and run the following command:\nmabs-flye.py --nanopore_reads [PATH TO THE FOLDER WITH MABS]/Test_datasets/nanopore_test_reads.fastq.gz --pacbio_clr_reads [PATH TO THE FOLDER WITH MABS]/Test_datasets/pacbio_clr_test_reads.fastq.gz --local_busco_dataset saccharomycetes_odb10.2020-08-05.tar.gz") + time.sleep(5) + try: + s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() + #проверяю, доступен ли нужный файл, и если доступен, то качаю его. 
+ try: + urllib.request.urlretrieve("http://mikeshelk.site/Data/BUSCO_datasets/Latest/" + s_busco_dataset_name_online, s_path_to_a_local_busco_dataset) + except: + l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") + + except: + time.sleep(5) + try: + s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() + #проверяю, доступен ли нужный файл, и если доступен, то качаю его. + try: + urllib.request.urlretrieve("http://mikeshelk.site/Data/BUSCO_datasets/Latest/" + s_busco_dataset_name_online, s_path_to_a_local_busco_dataset) + + except: + l_errors_in_command_line.append("http://mikeshelk.site/Data/BUSCO_datasets/Latest/ is not accessible. Please, download a BUSCO dataset from http://busco-data.ezlab.org/v5/data/lineages/ and use \"--local_busco_dataset\" instead of \"--download_busco_dataset\".") + + except: + l_errors_in_command_line.append("Unfortunately, http://mikeshelk.site/Data/BUSCO_datasets/Latest/ is currently not accessible. To test Mabs-flye, download the file http://busco-data.ezlab.org/v5/data/lineages/saccharomycetes_odb10.2020-08-05.tar.gz and run the following command:\nmabs-flye.py --nanopore_reads [PATH TO THE FOLDER WITH MABS]/Test_datasets/nanopore_test_reads.fastq.gz --pacbio_clr_reads [PATH TO THE FOLDER WITH MABS]/Test_datasets/pacbio_clr_test_reads.fastq.gz --local_busco_dataset saccharomycetes_odb10.2020-08-05.tar.gz") if len(l_errors_in_command_line) != 0: #Если ошибка была всего одна. 
@@ -235,29 +262,39 @@ s_string_to_remove = re.escape(o_regular_expression_results.group(0)) s_command_line_reduced = re.sub(s_string_to_remove, "", s_command_line_reduced, 1) - #проверяю, что пользователь не дал опцией --additional_flye_parameters следующие опции: --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative . Это потому, что Mabs-flye их и так использует. + #проверяю, что пользователь не дал опцией --additional_flye_parameters следующие опции: --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative . Это потому, что Mabs-flye их и так использует. if re.search(r"\-\-nano\-raw ", s_additional_flye_parameters): - l_errors_in_command_line.append("You have given Mabs-flye the option \"--nano-raw\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + l_errors_in_command_line.append("You have given Mabs-flye the option \"--nano-raw\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + if re.search(r"\-\-nano\-corr ", s_additional_flye_parameters): + l_errors_in_command_line.append("You have given Mabs-flye the option \"--nano-corr\" via the option \"--additional_flye_parameters\". 
The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + if re.search(r"\-\-nano\-hq ", s_additional_flye_parameters): + l_errors_in_command_line.append("You have given Mabs-flye the option \"--nano-hq\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + if re.search(r"\-\-pacbio\-hifi ", s_additional_flye_parameters): + l_errors_in_command_line.append("You have given Mabs-flye the option \"--pacbio-hifi\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + if re.search(r"\-\-pacbio\-raw ", s_additional_flye_parameters): + l_errors_in_command_line.append("You have given Mabs-flye the option \"--pacbio-raw\" via the option \"--additional_flye_parameters\". 
The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + if re.search(r"\-\-pacbio\-corr ", s_additional_flye_parameters): + l_errors_in_command_line.append("You have given Mabs-flye the option \"--pacbio-corr\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") if re.search(r"\-\-out\-dir ", s_additional_flye_parameters): - l_errors_in_command_line.append("You have given Mabs-flye the option \"--out-dir\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + l_errors_in_command_line.append("You have given Mabs-flye the option \"--out-dir\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") if re.search(r"\-\-threads ", s_additional_flye_parameters): - l_errors_in_command_line.append("You have given Mabs-flye the option \"--threads\" via the option \"--additional_flye_parameters\". 
The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + l_errors_in_command_line.append("You have given Mabs-flye the option \"--threads\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") if re.search(r"\-\-no\-alt\-contigs ", s_additional_flye_parameters): - l_errors_in_command_line.append("You have given Mabs-flye the option \"--no-alt-contigs\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + l_errors_in_command_line.append("You have given Mabs-flye the option \"--no-alt-contigs\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") if re.search(r"\-\-nano\-raw ", s_additional_flye_parameters): - l_errors_in_command_line.append("You have given Mabs-flye the option \"--genome-size\" via the option \"--additional_flye_parameters\". 
The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + l_errors_in_command_line.append("You have given Mabs-flye the option \"--genome-size\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") if re.search(r"\-o ", s_additional_flye_parameters): - l_errors_in_command_line.append("You have given Mabs-flye the option \"-o\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + l_errors_in_command_line.append("You have given Mabs-flye the option \"-o\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") if re.search(r"\-t ", s_additional_flye_parameters): - l_errors_in_command_line.append("You have given Mabs-flye the option \"-t\" via the option \"--additional_flye_parameters\". 
The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + l_errors_in_command_line.append("You have given Mabs-flye the option \"-t\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") if re.search(r"\-g ", s_additional_flye_parameters): - l_errors_in_command_line.append("You have given Mabs-flye the option \"-g\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + l_errors_in_command_line.append("You have given Mabs-flye the option \"-g\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") if re.search(r"assemble_ovlp_divergence ", s_additional_flye_parameters): - l_errors_in_command_line.append("You have given Mabs-flye the option \"assemble_ovlp_divergence\" via the option \"--additional_flye_parameters\". 
The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + l_errors_in_command_line.append("You have given Mabs-flye the option \"assemble_ovlp_divergence\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") if re.search(r"repeat_graph_ovlp_divergence ", s_additional_flye_parameters): - l_errors_in_command_line.append("You have given Mabs-flye the option \"repeat_graph_ovlp_divergence\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + l_errors_in_command_line.append("You have given Mabs-flye the option \"repeat_graph_ovlp_divergence\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") if re.search(r"assemble_divergence_relative ", s_additional_flye_parameters): - l_errors_in_command_line.append("You have given Mabs-flye the option \"assemble_divergence_relative\" via the option \"--additional_flye_parameters\". 
The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") + l_errors_in_command_line.append("You have given Mabs-flye the option \"assemble_divergence_relative\" via the option \"--additional_flye_parameters\". The following options cannot be passed via \"--additional_flye_parameters\": --nano-raw, --nano-corr, --nano-hq, --pacbio-hifi, --pacbio-raw, --pacbio-corr, --out-dir, --threads, --no-alt-contigs, --genome-size, -o, -t, -g, assemble_ovlp_divergence, repeat_graph_ovlp_divergence, assemble_divergence_relative.") #смотрю, дал ли пользователь риды Нанопора o_regular_expression_results = re.search(r" --nanopore_reads (\S+)", s_command_line_reduced) @@ -330,7 +367,7 @@ s_path_to_a_local_busco_dataset = s_path_to_the_output_folder + "/" + s_busco_dataset_name_online #путь к месту, где будет лежать скачанный архивированный gzip файл с датасетом BUSCO. - #проверяю, доступен ли адрес http://mikeshelk.site/Data/BUSCO_datasets/Latest/. Он может быть недоступен из-за каких-то проблем с сервером. Если не доступен, то рекомендую пользователю скачать базу с http://busco-data.ezlab.org/v5/data/lineages/ и использовать опцию --local_busco_dataset. Проверку делаю примерно как написано на https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python . А если доступен, то делаю ещё одну проверку — на то, есть ли нужный файл в папке http://mikeshelk.site/Data/BUSCO_datasets/Latest/ + #проверяю, доступен ли адрес http://mikeshelk.site/Data/BUSCO_datasets/Latest/. Если не доступен, то пробую ещё два раза с интервалом в 5 секунд. Если адрес так и не станет доступным, то рекомендую пользователю скачать базу с http://busco-data.ezlab.org/v5/data/lineages/ и использовать опцию --local_busco_dataset. 
Проверку делаю примерно как написано на https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python . А если доступен, то делаю ещё одну проверку — на то, есть ли нужный файл в папке http://mikeshelk.site/Data/BUSCO_datasets/Latest/ try: s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() @@ -341,7 +378,27 @@ l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") except: - l_errors_in_command_line.append("http://mikeshelk.site/Data/BUSCO_datasets/Latest/ is not accessible. Please, download a BUSCO dataset from http://busco-data.ezlab.org/v5/data/lineages/ and use \"--local_busco_dataset\" instead of \"--download_busco_dataset\".") + time.sleep(5) + try: + s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() + #проверяю, доступен ли нужный файл, и если доступен, то качаю его. + try: + urllib.request.urlretrieve("http://mikeshelk.site/Data/BUSCO_datasets/Latest/" + s_busco_dataset_name_online, s_path_to_a_local_busco_dataset) + except: + l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") + + except: + time.sleep(5) + try: + s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() + #проверяю, доступен ли нужный файл, и если доступен, то качаю его. + try: + urllib.request.urlretrieve("http://mikeshelk.site/Data/BUSCO_datasets/Latest/" + s_busco_dataset_name_online, s_path_to_a_local_busco_dataset) + except: + l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") + + except: + l_errors_in_command_line.append("http://mikeshelk.site/Data/BUSCO_datasets/Latest/ is not accessible. 
Please, download a BUSCO dataset from http://busco-data.ezlab.org/v5/data/lineages/ and use \"--local_busco_dataset\" instead of \"--download_busco_dataset\".") #если пользователь использовал --local_busco_dataset o_regular_expression_results = re.search(r" --local_busco_dataset (\S+)", s_command_line_reduced) @@ -374,6 +431,14 @@ s_string_to_remove = re.escape(o_regular_expression_results.group(0)) s_command_line_reduced = re.sub(s_string_to_remove, "", s_command_line_reduced, 1) + #смотрю, указал ли пользователь в командной строке максимальное количество точек, которые нужно попробовать с помощью метода Нелдера-Мида. + o_regular_expression_results = re.search(r" --maximum_number_of_points_to_try ([\d\.\+]+)", s_command_line_reduced) + if o_regular_expression_results: + n_maximum_number_of_points_to_try = int(o_regular_expression_results.group(1)) + + s_string_to_remove = re.escape(o_regular_expression_results.group(0)) + s_command_line_reduced = re.sub(s_string_to_remove, "", s_command_line_reduced, 1) + #смотрю, указал ли пользователь в командной строке размер генома. Разрешается три варианта формата: число, число с [kmgKMG] на конце, "auto". o_regular_expression_results = re.search(r" --genome_size ([\d\.eE\-\+]+[kmgKMG]?|auto)", s_command_line_reduced) if o_regular_expression_results: @@ -454,19 +519,24 @@ sys.exit() - f_logs = open(s_path_to_the_output_folder + "/mabs_logs.txt","w",buffering=1) #f_logs это общий файл с логами Mabs-flye, в отличие от трёх дополнительных файлов с логами, которые ведут три отдельных экземпляра Mabs-flye. buffering=1 означает, что буферизация идёт только на уровне строк. + f_log = open(s_path_to_the_output_folder + "/mabs_log.txt","w",buffering=1) #f_log это общий файл с логами Mabs-flye, в отличие от трёх дополнительных файлов с логами, которые ведут три отдельных экземпляра Mabs-flye. buffering=1 означает, что буферизация идёт только на уровне строк. 
o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Started Mabs-flye\n\n") - - f_logs.write("You have run Mabs-flye of version " + s_Mabs_version + " with the following command: " + s_command_line + "\n\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("Started Mabs-flye\n\n") + f_log.write("You have run Mabs-flye of version " + s_Mabs_version + " with the following command: " + s_command_line + "\n\n") + + #Если пользователь дал больше 128 потоков, то ограничиваю количество потоков 128-ю, потому что Flye при попытке использовать больше 128 потоков вылетает с ошибкой, говоря, что не может использовать больше 128. + if n_number_of_cpu_threads_to_use > 128: + f_log.write("Warning: you indicated Mabs-flye to use " + str(n_number_of_cpu_threads_to_use) + " CPU threads. However, Flye cannot use more than 128 threads. Hence, the number of used threads will be limited to 128\n\n") + n_number_of_cpu_threads_to_use = 128 + #если пользователь делает сборку тестового набора ридов Mabs-flye, то нужно написать подробности этого тестового набора.
if (len(sys.argv) == 2) and re.search(r"\s\-\-run_test", s_command_line): - f_logs.write("As a test, Mabs-flye will assemble the first chromosome of Saccharomyces cerevisiae, which is approximately 200 kbp long, using 20x Nanopore reads and 10x PacBio CLR reads.\n\n") - f_logs.write("The command \"mabs-flye.py --run_test\" is equivalent to the command \"mabs-flye.py --nanopore_reads " + s_path_to_the_folder_where_Mabs_lies + "/Test_datasets/nanopore_test_reads.fastq.gz --pacbio_clr_reads " + s_path_to_the_folder_where_Mabs_lies + "/Test_datasets/pacbio_clr_test_reads.fastq.gz --download_busco_dataset saccharomycetes_odb10.2020-08-05.tar.gz\"\n") - f_logs.write("If after Mabs-flye finishes you see a file ./Mabs_results/The_best_assembly/assembly.fasta which has a size of approximately 200 kilobytes, then the test succeeded.\n\n") + f_log.write("As a test, Mabs-flye will assemble the first chromosome of Saccharomyces cerevisiae, which is approximately 200 kbp long, using 20x Nanopore reads and 10x PacBio CLR reads.\n\n") + f_log.write("The command \"mabs-flye.py --run_test\" is equivalent to the command \"mabs-flye.py --nanopore_reads " + s_path_to_the_folder_where_Mabs_lies + "/Test_datasets/nanopore_test_reads.fastq.gz --pacbio_clr_reads " + s_path_to_the_folder_where_Mabs_lies + "/Test_datasets/pacbio_clr_test_reads.fastq.gz --download_busco_dataset saccharomycetes_odb10.2020-08-05.tar.gz\"\n") + f_log.write("If after Mabs-flye finishes you see a file ./Mabs_results/The_best_assembly/assembly.fasta which has a size of approximately 200 kilobytes, then the test succeeded.\n\n") #если пользователь сказал скачать файл с базой BUSCO или сам дал файл (но не папку), то разархивирую файл и меняю значение переменной s_path_to_a_local_busco_dataset с пути к файлу на путь к папке. 
if os.path.isfile(s_path_to_a_local_busco_dataset): @@ -479,7 +549,7 @@ #Оставляю из базы BUSCO только нужное количество (s_number_of_busco_orthogroups_to_use) ортогрупп — тех, которые имеют наиболее консервативные последовательности. Если пользователь указал использовать все ортогруппы, то Mabs-flye использует все. Если пользователь указал больше ортогрупп, чем есть в этом наборе BUSCO, то Mabs-flye использует все и пишет Warning в основной файл с логами. - mabs_function_preprocess_busco_dataset.function_preprocess_busco_dataset(s_path_to_a_local_busco_dataset, s_number_of_busco_orthogroups_to_use, s_path_to_the_output_folder, f_logs) + mabs_function_preprocess_busco_dataset.function_preprocess_busco_dataset(s_path_to_a_local_busco_dataset, s_number_of_busco_orthogroups_to_use, s_path_to_the_output_folder, f_log) #делаю ссылку на файл "ancestral", давая ему расширение .fasta. Затем делаю базу данных DIAMOND. #с помощью os.path.abspath() я получают абсолютный путь. Если он относительный, то это может создать проблемы в работоспособности мягкой ссылки. @@ -508,6 +578,11 @@ else: s_output_extension = "fasta" + #Проверяю, что DIAMOND выдал файл. Файла может не быть, если у DIAMOND были какие-то проблемы при запуске (см. https://github.com/shelkmike/Mabs/issues/3) + if not os.path.exists(s_path_to_the_output_folder + "/diamond_results_for_alignment_of_nanopore_reads_to_busco_proteins.txt"): + print("Mabs-flye has stopped because there was an error during DIAMOND execution.") + sys.exit() + os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/Additional/get_single_end_reads_from_DIAMOND_results.py " + s_path_to_nanopore_reads + " " + s_path_to_the_output_folder + "/diamond_results_for_alignment_of_nanopore_reads_to_busco_proteins.txt " + s_path_to_the_output_folder + "/nanopore_reads_that_have_matches_to_busco_proteins." 
+ s_output_extension) s_path_to_nanopore_reads_that_correspond_to_busco_genes = s_path_to_the_output_folder + "/nanopore_reads_that_have_matches_to_busco_proteins." + s_output_extension @@ -523,6 +598,11 @@ else: s_output_extension = "fasta" + #Проверяю, что DIAMOND выдал файл. Файла может не быть, если у DIAMOND были какие-то проблемы при запуске (см. https://github.com/shelkmike/Mabs/issues/3) + if not os.path.exists(s_path_to_the_output_folder + "/diamond_results_for_alignment_of_pacbio_hifi_reads_to_busco_proteins.txt"): + print("Mabs-flye has stopped because there was an error during DIAMOND execution.") + sys.exit() + os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/Additional/get_single_end_reads_from_DIAMOND_results.py " + s_path_to_pacbio_hifi_reads + " " + s_path_to_the_output_folder + "/diamond_results_for_alignment_of_pacbio_hifi_reads_to_busco_proteins.txt " + s_path_to_the_output_folder + "/pacbio_hifi_reads_that_have_matches_to_busco_proteins." + s_output_extension) s_path_to_pacbio_hifi_reads_that_correspond_to_busco_genes = s_path_to_the_output_folder + "/pacbio_hifi_reads_that_have_matches_to_busco_proteins." + s_output_extension @@ -538,6 +618,11 @@ else: s_output_extension = "fasta" + #Проверяю, что DIAMOND выдал файл. Файла может не быть, если у DIAMOND были какие-то проблемы при запуске (см. https://github.com/shelkmike/Mabs/issues/3) + if not os.path.exists(s_path_to_the_output_folder + "/diamond_results_for_alignment_of_pacbio_clr_reads_to_busco_proteins.txt"): + print("Mabs-flye has stopped because there was an error during DIAMOND execution.") + sys.exit() + os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/Additional/get_single_end_reads_from_DIAMOND_results.py " + s_path_to_pacbio_clr_reads + " " + s_path_to_the_output_folder + "/diamond_results_for_alignment_of_pacbio_clr_reads_to_busco_proteins.txt " + s_path_to_the_output_folder + "/pacbio_clr_reads_that_have_matches_to_busco_proteins." 
+ s_output_extension) s_path_to_pacbio_clr_reads_that_correspond_to_busco_genes = s_path_to_the_output_folder + "/pacbio_clr_reads_that_have_matches_to_busco_proteins." + s_output_extension @@ -617,7 +702,7 @@ if (s_path_to_nanopore_reads != "") and (s_path_to_pacbio_hifi_reads != "") and (s_path_to_pacbio_clr_reads != ""): #если хотя бы один из наборов ридов был в формате FASTA. - if (not re.search(r"(\.fastq|\.fq|\.fastq\.gz|\.fq\.gz)$", s_path_to_pacbio_hifi_reads, flags = re.IGNORECASE)) or (not re.search(r"(\.fastq|\.fq|\.fastq\.gz|\.fq\.gz)$", s_path_to_pacbio_clr_reads, flags = re.IGNORECASE)): + if (not re.search(r"(\.fastq|\.fq|\.fastq\.gz|\.fq\.gz)$", s_path_to_nanopore_reads, flags = re.IGNORECASE)) or (not re.search(r"(\.fastq|\.fq|\.fastq\.gz|\.fq\.gz)$", s_path_to_pacbio_hifi_reads, flags = re.IGNORECASE)) or (not re.search(r"(\.fastq|\.fq|\.fastq\.gz|\.fq\.gz)$", s_path_to_pacbio_clr_reads, flags = re.IGNORECASE)): subprocess.call(["bash", "-c", "cat <(" + s_path_to_the_folder_where_Mabs_lies + "/Additional/SeqTk/seqtk seq -a " + s_path_to_nanopore_reads + ") <(" + s_path_to_the_folder_where_Mabs_lies + "/Additional/SeqTk/seqtk seq -a " + s_path_to_pacbio_hifi_reads + ") <(" + s_path_to_the_folder_where_Mabs_lies + "/Additional/SeqTk/seqtk seq -a " + s_path_to_pacbio_clr_reads + ") | gzip -1 > " + s_path_to_the_output_folder + "/all_long_reads.fasta.gz"]) s_path_to_the_file_with_all_long_reads = s_path_to_the_output_folder + "/all_long_reads.fasta.gz" #если все наборы ридов были в формате FASTQ. @@ -643,7 +728,7 @@ #если пользователь дал риды Нанопора и PacBio CLR. if (s_path_to_nanopore_reads != "") and (s_path_to_pacbio_hifi_reads == "") and (s_path_to_pacbio_clr_reads != ""): #если хотя бы один из наборов ридов был в формате FASTA. 
- if (not re.search(r"(\.fastq|\.fq|\.fastq\.gz|\.fq\.gz)$", s_path_to_pacbio_hifi_reads, flags = re.IGNORECASE)) or (not re.search(r"(\.fastq|\.fq|\.fastq\.gz|\.fq\.gz)$", s_path_to_pacbio_clr_reads, flags = re.IGNORECASE)): + if (not re.search(r"(\.fastq|\.fq|\.fastq\.gz|\.fq\.gz)$", s_path_to_nanopore_reads, flags = re.IGNORECASE)) or (not re.search(r"(\.fastq|\.fq|\.fastq\.gz|\.fq\.gz)$", s_path_to_pacbio_clr_reads, flags = re.IGNORECASE)): subprocess.call(["bash", "-c", "cat <(" + s_path_to_the_folder_where_Mabs_lies + "/Additional/SeqTk/seqtk seq -a " + s_path_to_nanopore_reads + ") <(" + s_path_to_the_folder_where_Mabs_lies + "/Additional/SeqTk/seqtk seq -a " + s_path_to_pacbio_clr_reads + ") | gzip -1 > " + s_path_to_the_output_folder + "/all_long_reads.fasta.gz"]) s_path_to_the_file_with_all_long_reads = s_path_to_the_output_folder + "/all_long_reads.fasta.gz" #если все наборы ридов были в формате FASTQ. @@ -678,241 +763,205 @@ if (s_path_to_nanopore_reads == "") and (s_path_to_pacbio_hifi_reads == "") and (s_path_to_pacbio_clr_reads != ""): s_path_to_the_file_with_all_long_reads = s_path_to_pacbio_clr_reads s_long_reads_option_for_calculate_AG = "--pacbio_clr_reads" - - - #Теперь, собственно, начинаю проверку 10 точек методом золотого сечения. n_point_1 это самая левая в данный момент точка (то есть, с наименьшим log10(max_divergence)), n_point_4 это самая правая (то есть, с наибольшим log10(max_divergence)), а n_point_2 и n_point_3 это две промежуточные, положение которых, собственно, и определяется золотым сечением. - #Любые риды я при сборке даю Flye как "nano-raw", то есть нескорректированные риды Нанопора, потому что если мне нужно коллапсировать области генома с очень высокой гетерозиготностью, то для Mabs-flye это примерно эквивалентно тому, что в ридах много ошибок секвенирования. Нужно будет подумать, насколько это правильно. Скрипту calculate_AG.py я тоже даю любые риды как риды Нанопора, то есть с опцией --nanopore_reads. 
- n_point_1 = 0.0001 #Нижняя граница пробуемых max_divergence. 0.0001 это 0.01%. - n_point_4 = 0.5 #Верхняя граница пробуемых max_divergence. 0.5 это 50%. - n_point_2 = round(10**(math.log10(n_point_1) + ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(math.log10(n_point_4) - math.log10(n_point_1))), 6) #округлю до шестого знака после запятой, иначе у Питона иногда вылезают числа вроде 0.0001442000001 - n_point_3 = round(10**(math.log10(n_point_4) - ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(math.log10(n_point_4) - math.log10(n_point_1))), 6) - - #Для 0.0001 и 0.5 я не делаю измерений, потому что метод золотого сечения этого не требует. - #Это список, в который для каждого проверенного max_divergence будет записан AG. Ключ это max_divergence, а значение это AG. - d_max_divergence_to_AG = {} #Например, [0.123] = 762. - - #Анализирую вторую точку. - n_number_of_the_point_under_analysis = 1 - n_max_divergence = n_point_2 - - o_current_time_and_date = datetime.datetime.now() - s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Mabs-flye started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. 
Max_divergence in this point is " + str(n_max_divergence) + "\n") - - #если пользователь не указывал размер генома - if s_genome_size_estimate == "auto": - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye --nano-raw " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + " --threads " + str(n_number_of_cpu_threads_to_use) + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_max_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_max_divergence) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) - #если пользователь указал размер генома + """ + Теперь нужно определить, какой опцией Flye давать риды (--nano-raw, --nano-corr или другие). Если пользователь хотя бы один файл дал в формате FASTA, то использую --nano-raw. Если все риды в формате FASTQ, то я делаю следующее: + I) Считаю точность для каждого рида, используя строку с качеством. "Точность" выражается в процентах. + II) Считаю медианное значение по значениям из "I)" + III) В завимисимости от значения из "II)" выбираю, какой опцией давать риды программе Flye. Задавая эти числа, я ориентировался на описания опций в https://github.com/fenderglass/Flye/blob/flye/docs/USAGE.md + Соответствие между медианной точностью ридов, и выбранным режимом Flye: + (0; 95] - --nano-raw + (95; 97] - --nano-hq + (97; 99] - --nano-corr + (99; 100] - --pacbio-hifi + Если среди файлов с ридами, которые пользователь дал программе, хотя бы один в формате FASTA, то Mabs пишет в логи "WARNING: you have provided reads in FASTA, while FASTQ is recommended. Using reads in FASTA may reduce the accuracy of the assembly." + + Для скорости, медианная точность считается только по ридам, относящимся к генам BUSCO. + """ + + s_flye_option_to_provide_reads_with = "" #может быть "--nano-raw", "--nano-hq", "--nano-corr", "--pacbio-hifi". 
+ + if not re.search(r"(\.fastq|\.fq|\.fastq\.gz|\.fq\.gz)$", s_path_to_all_long_reads_that_correspond_to_busco_genes, flags = re.IGNORECASE): + f_log.write("WARNING: you have provided reads in FASTA, while FASTQ is recommended. Using reads in FASTA may reduce the accuracy of the assembly.\n\n") + + s_flye_option_to_provide_reads_with = "--nano-raw" else: - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye --nano-raw " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + " --threads " + str(n_number_of_cpu_threads_to_use) + " --genome-size " + s_genome_size_estimate + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_max_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_max_divergence) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) + #Считаю медианный Phred score ридов. Считаю, что риды в формате Phred+33, потому что Phred+64 для длинных ридов, по-моему, никогда не использовался. - - #Смотрю, получился ли файл assembly.fasta. Его может не быть, если Flye не собрал ни одного дисджойнтига — в таком случае Flye прекращает работу преждевременно, не выдавая файла assembly.fasta. То, что Flye не выдал ни одного дисджойнтига, может быть связано с тем, что для ридов с большим количеством ошибок Mabs-flye попробовал очень маленький max_divergence. В случае, если файла assembly.fasta нет, я, даже не запуская скрипт calculate_AG.py, сразу считаю, что AG=0. - if not os.path.isfile(s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + "/assembly.fasta"): - n_AG_for_point_2 = 0 - else: - - #"--number_of_busco_orthogroups all" использую потому, что в папке BUSCO_dataset_to_use уже оставлены только те ортогруппы, которые нужно использовать. 
- os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + " --assembly " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + "/assembly.fasta " + s_long_reads_option_for_calculate_AG + " " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe true --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) - - #Беру AG, посчитанный скриптом calculate_AG.py - if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + "/AG.txt"): - f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + "/AG.txt", "r") - s_line_1 = f_infile.readline() - #AG is 487 - o_regular_expression_results = re.search(r"AG is (\d+)", s_line_1) - n_AG_for_point_2 = int(o_regular_expression_results.group(1)) + if re.search(r"(\.gz)$", s_path_to_all_long_reads_that_correspond_to_busco_genes, flags = re.IGNORECASE): + f_infile = gzip.open(s_path_to_all_long_reads_that_correspond_to_busco_genes, mode = "rt") + else: + f_infile = open(s_path_to_all_long_reads_that_correspond_to_busco_genes, "r") + + #Список, элементы которого это точности ридов, выраженные в процентах. 100%, значит рид идеально точный. В списке по одному элементу на каждый рид. + l_accuracies_of_reads = [] + + n_line_number = 0 #Номер строки. Считается от 1. + for s_line in f_infile: + n_line_number += 1 + + if (n_line_number - 4) % 4 == 0: + s_quality_line = re.sub(r"[\r\n]+$", "", s_line) + + n_accuracy_of_the_read = 0 #Точность рида, в процентах. Сначала посчитаю просто как сумму точностей отдельных нуклеотидов, а потом поделю на длину рида. 
+ + for s_character in s_quality_line: + n_Phred_score_corresponding_to_the_character = ord(s_character) - 33 + n_accuracy_of_the_read += 100 * (1 - (10 ** (- n_Phred_score_corresponding_to_the_character / 10))) + + n_accuracy_of_the_read = n_accuracy_of_the_read / len(s_quality_line) + + l_accuracies_of_reads.append(n_accuracy_of_the_read) + + f_infile.close() + + n_median_accuracy_of_reads = statistics.median(l_accuracies_of_reads) + + if n_median_accuracy_of_reads <= 95: + s_flye_option_to_provide_reads_with = "--nano-raw" + elif (n_median_accuracy_of_reads > 95) and (n_median_accuracy_of_reads <= 97): + s_flye_option_to_provide_reads_with = "--nano-hq" + elif (n_median_accuracy_of_reads > 97) and (n_median_accuracy_of_reads <= 99): + s_flye_option_to_provide_reads_with = "--nano-corr" + elif n_median_accuracy_of_reads > 99: + s_flye_option_to_provide_reads_with = "--pacbio-hifi" else: - f_logs.write("Error. Couldn't calculate AG. See stderr and stdout for the reason why.") + o_current_time_and_date = datetime.datetime.now() + s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") + f_log.write(s_current_time_and_date + "\n") + f_log.write("A very strange error in parsing of FASTQ happened. Please, report at https://github.com/shelkmike/Mabs/issues .") sys.exit() + + #Округляю медианную точность, чтобы показать её пользователю. Если точность <=90%, то округляю до целого. Если точность от 90% до 99%, то округляю до первого знака после запятой. Если точность >99%, то округляю до второго знака после запятой. 
+ if n_median_accuracy_of_reads <= 90: + n_median_accuracy_of_reads__rounded = int(round(n_median_accuracy_of_reads, 0)) + elif (n_median_accuracy_of_reads > 90) and (n_median_accuracy_of_reads <= 99): + n_median_accuracy_of_reads__rounded = round(n_median_accuracy_of_reads, 1) + else: + n_median_accuracy_of_reads__rounded = round(n_median_accuracy_of_reads, 2) + + f_log.write("The median accuracy of reads has been estimated as approximately " + str(n_median_accuracy_of_reads__rounded) + "%. The reads will be provided to Flye via the option \"" + s_flye_option_to_provide_reads_with + "\".\n\n") + + + #Теперь делаю сборку с дефолтными значениями параметров Flye, чтобы посмотреть, какой порог по сходству при перекрытии ридов в процессе образования дисджойнтигов он установит. Делаю это во временной папке "Test_gene_assembly_to_determine_Flye_default_parameters". Заодно, посмотрю, какой repeat_graph_ovlp_divergence он установит (хотя это, в общем, можно определить и без сборки, а просто по значению s_flye_option_to_provide_reads_with) + n_max_divergence_between_reads_during_disjointig_construction__when_Flye_is_run_with_default_parameters = -100 #Значение этой переменной эквивалентно значению параметра assemble_ovlp_divergence, когда он используется в сочетании с "assemble_divergence_relative=0". -100 это плейсхолдер. + n_repeat_graph_ovlp_divergence__when_Flye_is_run_with_default_parameters = -100 #Значение параметра repeat_graph_ovlp_divergence. -100 это плейсхолдер - d_max_divergence_to_AG[n_max_divergence] = n_AG_for_point_2 - - o_current_time_and_date = datetime.datetime.now() - s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("AG for max_divergence " + str(n_max_divergence) + " is " + str(n_AG_for_point_2) + "\n\n") - - #Анализирую третью точку. 
- n_number_of_the_point_under_analysis += 1 - n_max_divergence = n_point_3 - o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Mabs-flye started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. Max_divergence in this point is " + str(n_max_divergence) + "\n") - + f_log.write(s_current_time_and_date + "\n") + f_log.write("Mabs-flye started a test assembly to determine Flye default parameters\n\n") + #если пользователь не указывал размер генома if s_genome_size_estimate == "auto": - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye --nano-raw " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + " --threads " + str(n_number_of_cpu_threads_to_use) + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_max_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_max_divergence) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) + os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye " + s_flye_option_to_provide_reads_with + " " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Test_gene_assembly_to_determine_Flye_default_parameters --threads " + str(n_number_of_cpu_threads_to_use) + " --no-alt-contigs --stop-after assembly") #если пользователь указал размер генома else: - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye --nano-raw " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + " --threads " + str(n_number_of_cpu_threads_to_use) + " --genome-size " + s_genome_size_estimate + " --no-alt-contigs 
--extra-params assemble_ovlp_divergence=" + str(n_max_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_max_divergence) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) - - #Смотрю, получился ли файл assembly.fasta. Его может не быть, если Flye не собрал ни одного дисджойнтига — в таком случае Flye прекращает работу преждевременно, не выдавая файла assembly.fasta. То, что Flye не выдал ни одного дисджойнтига, может быть связано с тем, что для ридов с большим количеством ошибок Mabs-flye попробовал очень маленький max_divergence. В случае, если файла assembly.fasta нет, я, даже не запуская скрипт calculate_AG.py, сразу считаю, что AG=0. - if not os.path.isfile(s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + "/assembly.fasta"): - n_AG_for_point_3 = 0 - else: - #"--number_of_busco_orthogroups all" использую потому, что в папке BUSCO_dataset_to_use уже оставлены только те ортогруппы, которые нужно использовать. - os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + " --assembly " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + "/assembly.fasta " + s_long_reads_option_for_calculate_AG + " " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe true --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) - - #Беру AG, посчитанный скриптом calculate_AG.py - if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + "/AG.txt"): - f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + "/AG.txt", "r") - s_line_1 = 
f_infile.readline() - #AG is 487 - o_regular_expression_results = re.search(r"AG is (\d+)", s_line_1) - n_AG_for_point_3 = int(o_regular_expression_results.group(1)) - else: - f_logs.write("Error. Couldn't calculate AG. See stderr and stdout for the reason why.") - sys.exit() + os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye " + s_flye_option_to_provide_reads_with + " " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Test_gene_assembly_to_determine_Flye_default_parameters --threads " + str(n_number_of_cpu_threads_to_use) + " --genome-size " + s_genome_size_estimate + " --no-alt-contigs --stop-after assembly") + + #Смотрю, какие значения n_max_divergence_between_reads_during_disjointig_construction и n_repeat_graph_ovlp_divergence Flye использовал, когда он запускался по умолчанию. + f_infile = open(s_path_to_the_output_folder + "/Test_gene_assembly_to_determine_Flye_default_parameters/flye.log", "r") + + for s_line in f_infile: + #[2023-07-10 10:52:57] DEBUG: Max divergence threshold set to 0.218233 + o_regular_expression_results = re.search(r"Max divergence threshold set to ([\d\.]+)", s_line) + if o_regular_expression_results: + n_max_divergence_between_reads_during_disjointig_construction__when_Flye_is_run_with_default_parameters = float(o_regular_expression_results.group(1)) + #[2023-07-10 19:24:50] DEBUG: repeat_graph_ovlp_divergence=0.08 + o_regular_expression_results = re.search(r"repeat_graph_ovlp_divergence=([\d\.]+)", s_line) + if o_regular_expression_results: + n_repeat_graph_ovlp_divergence__when_Flye_is_run_with_default_parameters = float(o_regular_expression_results.group(1)) + + f_infile.close() + + if (n_max_divergence_between_reads_during_disjointig_construction__when_Flye_is_run_with_default_parameters == -100) or (n_repeat_graph_ovlp_divergence__when_Flye_is_run_with_default_parameters == -100): + f_log.write("Error. 
Couldn't parse the results of Flye run with default parameters. If you don't know the cause of this, please report this problem to https://github.com/shelkmike/Mabs/issues\n") + sys.exit() + + n_number_of_the_point_under_analysis = 0 #порядковый номер точки, которую я анализирую. Считается от 1. + + #Тут я описываю функцию, которой на вход даются значения assemble_ovlp_divergence и repeat_graph_ovlp_divergence, а выдаёт функция -AG. С минусом спереди — потому, что scipy.optimize.minimize ищет минимум, а не максимум. Поэтому для максимизации AG нужно минимизировать -AG. Два параметра, дающихся на вход, я даю через список (потому что так нужно scipy.optimize.minimize). + def function_two_Flye_parameters_to_minus_AG(l_two_input_parameters): + global n_number_of_the_point_under_analysis #Без этой строки возникает ошибка "local variable 'n_number_of_the_point_under_analysis' referenced before assignment", потому что без этой строки нельзя модифицировать ("n_number_of_the_point_under_analysis += 1") глобальную переменную внутри функции. + + #Числа округляю до 5-го знака, чтобы не было всяких 0.12000000000000001 + n_assemble_ovlp_divergence = round(l_two_input_parameters[0], 5) + n_repeat_graph_ovlp_divergence = round(l_two_input_parameters[1], 5) - d_max_divergence_to_AG[n_max_divergence] = n_AG_for_point_3 - - o_current_time_and_date = datetime.datetime.now() - s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("AG for max_divergence " + str(n_max_divergence) + " is " + str(n_AG_for_point_3) + "\n\n") - - #теперь последовательно выбираю остальные 8 точек методом золотого сечения и меряю AG для них. - while n_number_of_the_point_under_analysis < 10: #"<", а не "<=", потому что увеличение номера точки здесь делается в начале цикла. n_number_of_the_point_under_analysis += 1 - #Смотрю, какая из двух центральных точек (вторая или третья) имеют меньшее значение AG. 
Если вторая имеет меньшее ли равное третьей, то выкидываю первую точку и сужаю интервал. Если третья имеет меньшее, чем вторая, то выкидываю четвёртую точку и сужаю интервал. При равных значениях выкидывыю левую, потому что равные значения могут быть из-за того, что в обеих точках AG = 0 из-за того, что при сборке по ридам с большим количеством ошибок обе центральные точки имели слишком маленький max_divergence, из-за чего Flye не смог собрать ни одного дисджойнтига и поэтому выдал пустой файл assembly.fasta. - if n_AG_for_point_2 <= n_AG_for_point_3: - n_point_1 = n_point_2 - n_point_2 = n_point_3 - #n_point_4 не меняется - n_point_3 = round(10**(math.log10(n_point_4) - ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(math.log10(n_point_4) - math.log10(n_point_1))), 6) - - n_AG_for_point_1 = n_AG_for_point_2 - n_AG_for_point_2 = n_AG_for_point_3 - #n_AG_for_point_4 не меняется - n_AG_for_point_3 = -100 #плейсхолдер. Всё равно это значение я сейчас посчитаю. - - #Анализирую третью точку. - n_max_divergence = n_point_3 - - o_current_time_and_date = datetime.datetime.now() - s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Mabs-flye started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. 
Max_divergence in this point is " + str(n_max_divergence) + "\n") - - #если пользователь не указывал размер генома - if s_genome_size_estimate == "auto": - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye --nano-raw " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + " --threads " + str(n_number_of_cpu_threads_to_use) + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_max_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_max_divergence) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) - #если пользователь указал размер генома - else: - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye --nano-raw " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + " --threads " + str(n_number_of_cpu_threads_to_use) + " --genome-size " + s_genome_size_estimate + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_max_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_max_divergence) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) - - #Смотрю, получился ли файл assembly.fasta. Его может не быть, если Flye не собрал ни одного дисджойнтига — в таком случае Flye прекращает работу преждевременно, не выдавая файла assembly.fasta. То, что Flye не выдал ни одного дисджойнтига, может быть связано с тем, что для ридов с большим количеством ошибок Mabs-flye попробовал очень маленький max_divergence. В случае, если файла assembly.fasta нет, я, даже не запуская скрипт calculate_AG.py, сразу считаю, что AG=0. 
- if not os.path.isfile(s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + "/assembly.fasta"): - n_AG_for_point_3 = 0 - else: - #"--number_of_busco_orthogroups all" использую потому, что в папке BUSCO_dataset_to_use уже оставлены только те ортогруппы, которые нужно использовать. - os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + " --assembly " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + "/assembly.fasta " + s_long_reads_option_for_calculate_AG + " " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe true --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) - - #Беру AG, посчитанный скриптом calculate_AG.py - if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + "/AG.txt"): - f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + "/AG.txt", "r") - s_line_1 = f_infile.readline() - #AG is 487 - o_regular_expression_results = re.search(r"AG is (\d+)", s_line_1) - n_AG_for_point_3 = int(o_regular_expression_results.group(1)) - else: - f_logs.write("Error. Couldn't calculate AG. 
See stderr and stdout for the reason why.") - sys.exit() - - d_max_divergence_to_AG[n_max_divergence] = n_AG_for_point_3 - - o_current_time_and_date = datetime.datetime.now() - s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("AG for max_divergence " + str(n_max_divergence) + " is " + str(n_AG_for_point_3) + "\n\n") - - elif n_AG_for_point_2 > n_AG_for_point_3: - #n_point_1 не меняется - n_point_4 = n_point_3 - n_point_3 = n_point_2 - n_point_2 = round(10**(math.log10(n_point_1) + ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(math.log10(n_point_4) - math.log10(n_point_1))), 6) - - #n_AG_for_point_1 не меняется - n_AG_for_point_4 = n_AG_for_point_3 - n_AG_for_point_3 = n_AG_for_point_2 - n_AG_for_point_2 = -100 #плейсхолдер. Всё равно это значение я сейчас посчитаю. - - #Анализирую вторую точку. - n_max_divergence = n_point_2 - - o_current_time_and_date = datetime.datetime.now() - s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Mabs-flye started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. 
Max_divergence in this point is " + str(n_max_divergence) + "\n") - - #если пользователь не указывал размер генома - if s_genome_size_estimate == "auto": - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye --nano-raw " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + " --threads " + str(n_number_of_cpu_threads_to_use) + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_max_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_max_divergence) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) - #если пользователь указал размер генома - else: - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye --nano-raw " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + " --threads " + str(n_number_of_cpu_threads_to_use) + " --genome-size " + s_genome_size_estimate + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_max_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_max_divergence) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) - - #Смотрю, получился ли файл assembly.fasta. Его может не быть, если Flye не собрал ни одного дисджойнтига — в таком случае Flye прекращает работу преждевременно, не выдавая файла assembly.fasta. То, что Flye не выдал ни одного дисджойнтига, может быть связано с тем, что для ридов с большим количеством ошибок Mabs-flye попробовал очень маленький max_divergence. В случае, если файла assembly.fasta нет, я, даже не запуская скрипт calculate_AG.py, сразу считаю, что AG=0. 
- if not os.path.isfile(s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + "/assembly.fasta"): - n_AG_for_point_2 = 0 + o_current_time_and_date = datetime.datetime.now() + s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") + f_log.write(s_current_time_and_date + "\n") + f_log.write("Mabs-flye started to analyze point " + str(n_number_of_the_point_under_analysis) + ". This point is: assemble_ovlp_divergence = " + str(n_assemble_ovlp_divergence) + ", repeat_graph_ovlp_divergence = " + str(n_repeat_graph_ovlp_divergence) + ", assemble_divergence_relative = 0\n") + + #если пользователь не указывал размер генома + if s_genome_size_estimate == "auto": + os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye " + s_flye_option_to_provide_reads_with + " " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Gene_assembly_for_point_" + str(n_number_of_the_point_under_analysis) + " --threads " + str(n_number_of_cpu_threads_to_use) + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_assemble_ovlp_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_repeat_graph_ovlp_divergence) + ",assemble_divergence_relative=0 " + " " + s_additional_flye_parameters) + #если пользователь указал размер генома + else: + os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye " + s_flye_option_to_provide_reads_with + " " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --out-dir " + s_path_to_the_output_folder + "/Gene_assembly_for_point_" + str(n_number_of_the_point_under_analysis) + " --threads " + str(n_number_of_cpu_threads_to_use) + " --genome-size " + s_genome_size_estimate + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_assemble_ovlp_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_repeat_graph_ovlp_divergence) + ",assemble_divergence_relative=0 " + 
s_additional_flye_parameters) + + #Смотрю, получился ли файл assembly.fasta. Его может не быть, если Flye не собрал ни одного дисджойнтига — в таком случае Flye прекращает работу преждевременно, не выдавая файла assembly.fasta. То, что Flye не выдал ни одного дисджойнтига, может быть связано с тем, что для ридов с большим количеством ошибок Mabs-flye попробовал очень маленький assemble_ovlp_divergence. В случае, если файла assembly.fasta нет, я, даже не запуская скрипт calculate_AG.py, сразу считаю, что AG=0. + if not os.path.isfile(s_path_to_the_output_folder + "/Gene_assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta"): + n_AG = 0 + else: + #Считаю AG + os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + " --assembly " + s_path_to_the_output_folder + "/Gene_assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta " + s_long_reads_option_for_calculate_AG + " " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe true --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) + + #Беру AG, посчитанный скриптом calculate_AG.py + if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + "/AG.txt"): + f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + "/AG.txt", "r") + s_line_1 = f_infile.readline() + f_infile.close() + #AG is 487 + o_regular_expression_results = re.search(r"AG is (\d+)", s_line_1) + n_AG = int(o_regular_expression_results.group(1)) else: - #"--number_of_busco_orthogroups all" использую потому, что в папке 
BUSCO_dataset_to_use уже оставлены только те ортогруппы, которые нужно использовать. - os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + " --assembly " + s_path_to_the_output_folder + "/Gene_assembly_for_max_divergence_" + str(n_max_divergence) + "/assembly.fasta " + s_long_reads_option_for_calculate_AG + " " + s_path_to_all_long_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe true --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) - - #Беру AG, посчитанный скриптом calculate_AG.py - if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + "/AG.txt"): - f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_max_divergence_" + str(n_max_divergence) + "/AG.txt", "r") - s_line_1 = f_infile.readline() - #AG is 487 - o_regular_expression_results = re.search(r"AG is (\d+)", s_line_1) - n_AG_for_point_2 = int(o_regular_expression_results.group(1)) - else: - f_logs.write("Error. Couldn't calculate AG. See stderr and stdout for the reason why.") - - d_max_divergence_to_AG[n_max_divergence] = n_AG_for_point_2 - - o_current_time_and_date = datetime.datetime.now() - s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("AG for max_divergence " + str(n_max_divergence) + " is " + str(n_AG_for_point_2) + "\n\n") - + f_log.write("Error. Couldn't calculate AG. 
See stderr and stdout for the reason why.") + sys.exit() + + o_current_time_and_date = datetime.datetime.now() + s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") + f_log.write(s_current_time_and_date + "\n") + f_log.write("AG for point " + str(n_number_of_the_point_under_analysis) + " is " + str(n_AG) + "\n\n") + + return(-n_AG) + + #Теперь, собственно, делаю сборку, проверяя максимум n_maximum_number_of_points_to_try точек + + o_current_time_and_date = datetime.datetime.now() + s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") + f_log.write(s_current_time_and_date + "\n") + f_log.write("Mabs-flye will try at most " + str(n_maximum_number_of_points_to_try) + " points\n\n") - #После того, как посчитал AG для всех 10 точек, я смотрю, какая из них имела наибольший AG. Тот max_divergence, который соответствует этой точке, я и считаю оптимальным для сборки Flye. Если две точки дают одинаковый AG, то, для определённости, выбираю ту из них, которая имеет больший max_divergence. + """ + Максимальное возможное значение repeat_graph_ovlp_divergence ставлю в 0.5. Сомневаюсь, что могут быть полезны более высокие значение. Поэтому, чтобы Mabs-flye терял меньше времени на проверку малозначащих точек, поставил такое ограничение. 
+ """ + o_optimization_results = scipy.optimize.minimize(fun = function_two_Flye_parameters_to_minus_AG, x0 = [n_max_divergence_between_reads_during_disjointig_construction__when_Flye_is_run_with_default_parameters, n_repeat_graph_ovlp_divergence__when_Flye_is_run_with_default_parameters], method = "Nelder-Mead", bounds = ((0, 1), (0, 0.5)), options = {"maxfev" : n_maximum_number_of_points_to_try, "initial_simplex" : [[n_max_divergence_between_reads_during_disjointig_construction__when_Flye_is_run_with_default_parameters, n_repeat_graph_ovlp_divergence__when_Flye_is_run_with_default_parameters], [2 * n_max_divergence_between_reads_during_disjointig_construction__when_Flye_is_run_with_default_parameters, 2 * n_repeat_graph_ovlp_divergence__when_Flye_is_run_with_default_parameters], [n_max_divergence_between_reads_during_disjointig_construction__when_Flye_is_run_with_default_parameters, n_repeat_graph_ovlp_divergence__when_Flye_is_run_with_default_parameters / 2]]}) #Не понял, чем maxfev отличается от maxiter. Но оптимизация останавливается на n_maximum_number_of_points_to_try именно если указать это число как maxfev, а не как maxiter. - n_max_divergence_that_provides_maximum_AG = -100 - n_maximum_AG = -100 - for n_max_divergence in d_max_divergence_to_AG: - if d_max_divergence_to_AG[n_max_divergence] > n_maximum_AG: - n_max_divergence_that_provides_maximum_AG = n_max_divergence - n_maximum_AG = d_max_divergence_to_AG[n_max_divergence] - - if (d_max_divergence_to_AG[n_max_divergence] == n_maximum_AG) and (n_max_divergence > n_max_divergence_that_provides_maximum_AG): - n_max_divergence_that_provides_maximum_AG = n_max_divergence + n_optimal_assemble_ovlp_divergence = o_optimization_results.x[0] + n_optimal_repeat_graph_ovlp_divergence = o_optimization_results.x[1] + n_maximum_AG = - int(o_optimization_results.fun) #конвертирую в int, потому что scipy.optimize.minimize выдаёт это число во float (хоть это всегда и целый float). А в логи я его хочу записать в виде int. 
o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("The optimal max_divergence is " + str(n_max_divergence_that_provides_maximum_AG) + ". When assembling only genes, it provides AG = " + str(n_maximum_AG) + ". Now Mabs-flye starts to assemble the genome using all reads and max_divergence = " + str(n_max_divergence_that_provides_maximum_AG) + "\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("The optimal combination of Flye parameters as determined by Mabs-flye is: assemble_ovlp_divergence = " + str(n_optimal_assemble_ovlp_divergence) + ", repeat_graph_ovlp_divergence = " + str(n_optimal_repeat_graph_ovlp_divergence) + ", assemble_divergence_relative = 0. When assembling only genes, it provides AG = " + str(n_maximum_AG) + ". Now Mabs-flye starts to assemble the genome using all reads with the optimal combination of parameters.\n\n") - #Теперь делаю сборку Flye по всем ридам, используя n_max_divergence_that_provides_maximum_AG. + #Теперь делаю сборку Flye по всем ридам, используя найденную оптимальную комбинацию параметров. 
#если пользователь не указывал размер генома if s_genome_size_estimate == "auto": - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye --nano-raw " + s_path_to_the_file_with_all_long_reads + " --out-dir " + s_path_to_the_output_folder + "/The_best_assembly --threads " + str(n_number_of_cpu_threads_to_use) + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_max_divergence_that_provides_maximum_AG) + ",repeat_graph_ovlp_divergence=" + str(n_max_divergence_that_provides_maximum_AG) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) + os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye " + s_flye_option_to_provide_reads_with + " " + s_path_to_the_file_with_all_long_reads + " --out-dir " + s_path_to_the_output_folder + "/The_best_assembly --threads " + str(n_number_of_cpu_threads_to_use) + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_optimal_assemble_ovlp_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_optimal_repeat_graph_ovlp_divergence) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) #если пользователь указал размер генома else: - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye --nano-raw " + s_path_to_the_file_with_all_long_reads + " --out-dir " + s_path_to_the_output_folder + "/The_best_assembly --threads " + str(n_number_of_cpu_threads_to_use) + " --genome-size " + s_genome_size_estimate + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_max_divergence_that_provides_maximum_AG) + ",repeat_graph_ovlp_divergence=" + str(n_max_divergence_that_provides_maximum_AG) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) + os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Flye/bin/flye " + s_flye_option_to_provide_reads_with + " " + s_path_to_the_file_with_all_long_reads + " --out-dir " + s_path_to_the_output_folder + "/The_best_assembly --threads " + 
str(n_number_of_cpu_threads_to_use) + " --genome-size " + s_genome_size_estimate + " --no-alt-contigs --extra-params assemble_ovlp_divergence=" + str(n_optimal_assemble_ovlp_divergence) + ",repeat_graph_ovlp_divergence=" + str(n_optimal_repeat_graph_ovlp_divergence) + ",assemble_divergence_relative=0 " + s_additional_flye_parameters) o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Mabs-flye finished. The contigs are in the file " + s_path_to_the_output_folder + "/The_best_assembly/assembly.fasta. Now I recommend to use a separate tool, for example HyPo (https://github.com/kensung-lab/hypo), to polish these contigs with accurate (HiFi, Illumina or MGI) reads.") + f_log.write(s_current_time_and_date + "\n") + f_log.write("Mabs-flye finished. The contigs are in the file " + s_path_to_the_output_folder + "/The_best_assembly/assembly.fasta. Now I recommend to use a separate tool, for example HyPo (https://github.com/kensung-lab/hypo), to polish these contigs with accurate (HiFi, Illumina or MGI) reads.") diff --git a/mabs-hifiasm.py b/mabs-hifiasm.py index 356058e..a63db78 100755 --- a/mabs-hifiasm.py +++ b/mabs-hifiasm.py @@ -19,6 +19,7 @@ import sys import os import re +import time import datetime import urllib.request #import ssl @@ -67,6 +68,9 @@ if not os.path.isfile(s_path_to_the_folder_where_Mabs_lies + "/Additional/get_single_end_reads_from_DIAMOND_results.py"): l_unavailable_files_and_folders.append("The file get_single_end_reads_from_DIAMOND_results.py should be in the subfolder \"Additional\" of the folder where Mabs lies.") + + if not os.path.isfile(s_path_to_the_folder_where_Mabs_lies + "/Additional/calculate_N50.py"): + l_unavailable_files_and_folders.append("The file calculate_N50.py should be in the subfolder \"Additional\" of the folder where Mabs lies.") if not 
os.path.isdir(s_path_to_the_folder_where_Mabs_lies + "/Test_datasets"): l_unavailable_files_and_folders.append("The subfolder \"Test_datasets\" should be in the folder where Mabs lies.") @@ -94,7 +98,7 @@ s_additional_hifiasm_parameters = "" #дополнительные параметры Hifiasm. - s_Mabs_version = "2.19" + s_Mabs_version = "2.24" l_errors_in_command_line = [] #список ошибок в командной строке. Если пользователь совершил много ошибок, то Mabs-hifiasm напишет про них все, а не только про первую встреченную. @@ -169,7 +173,7 @@ s_path_to_a_local_busco_dataset = s_path_to_the_output_folder + "/" + s_busco_dataset_name_online #путь к месту, где будет лежать скачанный архивированный gzip файл с датасетом BUSCO. - #проверяю, доступен ли адрес http://mikeshelk.site/Data/BUSCO_datasets/Latest/. Он может быть недоступен из-за каких-то проблем с сервером. Если не доступен, то рекомендую пользователю скачать базу с http://busco-data.ezlab.org/v5/data/lineages/ и использовать опцию --local_busco_dataset. Проверку делаю примерно как написано на https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python . А если доступен, то делаю ещё одну проверку — на то, есть ли нужный файл в папке http://mikeshelk.site/Data/BUSCO_datasets/Latest/ + #проверяю, доступен ли адрес http://mikeshelk.site/Data/BUSCO_datasets/Latest/. Он может быть недоступен из-за каких-то проблем с сервером. Если не доступен, то пробую ещё два раза с интервалом в 5 секунд. Если адрес так и не станет доступным, то рекомендую пользователю скачать базу с http://busco-data.ezlab.org/v5/data/lineages/ и использовать опцию --local_busco_dataset. Проверку делаю примерно как написано на https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python . 
А если доступен, то делаю ещё одну проверку — на то, есть ли нужный файл в папке http://mikeshelk.site/Data/BUSCO_datasets/Latest/ try: s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() #проверяю, доступен ли нужный файл, и если доступен, то качаю его. @@ -179,7 +183,27 @@ l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") except: - l_errors_in_command_line.append("Unfortunately, http://mikeshelk.site/Data/BUSCO_datasets/Latest/ is currently not accessible. To test Mabs-hifiasm, download the file http://busco-data.ezlab.org/v5/data/lineages/saccharomycetes_odb10.2020-08-05.tar.gz and run the following command:\nmabs-hifiasm.py --pacbio_hifi_reads [PATH TO THE FOLDER WITH MABS]/Test_datasets/pacbio_hifi_test_reads.fastq.gz --local_busco_dataset saccharomycetes_odb10.2020-08-05.tar.gz") + time.sleep(5) + try: + s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() + #проверяю, доступен ли нужный файл, и если доступен, то качаю его. + try: + urllib.request.urlretrieve("http://mikeshelk.site/Data/BUSCO_datasets/Latest/" + s_busco_dataset_name_online, s_path_to_a_local_busco_dataset) + except: + l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") + + except: + time.sleep(5) + try: + s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() + #проверяю, доступен ли нужный файл, и если доступен, то качаю его. 
+ try: + urllib.request.urlretrieve("http://mikeshelk.site/Data/BUSCO_datasets/Latest/" + s_busco_dataset_name_online, s_path_to_a_local_busco_dataset) + except: + l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") + + except: + l_errors_in_command_line.append("Unfortunately, http://mikeshelk.site/Data/BUSCO_datasets/Latest/ is currently not accessible. To test Mabs-hifiasm, download the file http://busco-data.ezlab.org/v5/data/lineages/saccharomycetes_odb10.2020-08-05.tar.gz and run the following command:\nmabs-hifiasm.py --pacbio_hifi_reads [PATH TO THE FOLDER WITH MABS]/Test_datasets/pacbio_hifi_test_reads.fastq.gz --local_busco_dataset saccharomycetes_odb10.2020-08-05.tar.gz") if len(l_errors_in_command_line) != 0: @@ -328,10 +352,9 @@ s_path_to_a_local_busco_dataset = s_path_to_the_output_folder + "/" + s_busco_dataset_name_online #путь к месту, где будет лежать скачанный архивированный gzip файл с датасетом BUSCO. - #проверяю, доступен ли адрес http://mikeshelk.site/Data/BUSCO_datasets/Latest/. Он может быть недоступен из-за каких-то проблем с сервером. Если не доступен, то рекомендую пользователю скачать базу с http://busco-data.ezlab.org/v5/data/lineages/ и использовать опцию --local_busco_dataset. Проверку делаю примерно как написано на https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python . А если доступен, то делаю ещё одну проверку — на то, есть ли нужный файл в папке http://mikeshelk.site/Data/BUSCO_datasets/Latest/ + #проверяю, доступен ли адрес http://mikeshelk.site/Data/BUSCO_datasets/Latest/. Он может быть недоступен из-за каких-то проблем с сервером. Если не доступен, то пробую ещё два раза с интервалом в 5 секунд. Если адрес так и не станет доступным, то рекомендую пользователю скачать базу с http://busco-data.ezlab.org/v5/data/lineages/ и использовать опцию --local_busco_dataset. 
Проверку делаю примерно как написано на https://stackoverflow.com/questions/1949318/checking-if-a-website-is-up-via-python . А если доступен, то делаю ещё одну проверку — на то, есть ли нужный файл в папке http://mikeshelk.site/Data/BUSCO_datasets/Latest/ try: s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() - #проверяю, доступен ли нужный файл, и если доступен, то качаю его. try: urllib.request.urlretrieve("http://mikeshelk.site/Data/BUSCO_datasets/Latest/" + s_busco_dataset_name_online, s_path_to_a_local_busco_dataset) @@ -339,7 +362,27 @@ l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") except: - l_errors_in_command_line.append("http://mikeshelk.site/Data/BUSCO_datasets/Latest/ is not accessible. Please, download a BUSCO dataset from http://busco-data.ezlab.org/v5/data/lineages/ and use \"--local_busco_dataset\" instead of \"--download_busco_dataset\".") + time.sleep(5) + try: + s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() + #проверяю, доступен ли нужный файл, и если доступен, то качаю его. + try: + urllib.request.urlretrieve("http://mikeshelk.site/Data/BUSCO_datasets/Latest/" + s_busco_dataset_name_online, s_path_to_a_local_busco_dataset) + except: + l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") + + except: + time.sleep(5) + try: + s_dummy_variable = urllib.request.urlopen("http://mikeshelk.site/Data/BUSCO_datasets/Latest/").getcode() + #проверяю, доступен ли нужный файл, и если доступен, то качаю его. 
+ try: + urllib.request.urlretrieve("http://mikeshelk.site/Data/BUSCO_datasets/Latest/" + s_busco_dataset_name_online, s_path_to_a_local_busco_dataset) + except: + l_errors_in_command_line.append("The file " + s_busco_dataset_name_online + " does not exist at http://mikeshelk.site/Data/BUSCO_datasets/Latest/ .") + + except: + l_errors_in_command_line.append("http://mikeshelk.site/Data/BUSCO_datasets/Latest/ is not accessible. Please, download a BUSCO dataset from http://busco-data.ezlab.org/v5/data/lineages/ and use \"--local_busco_dataset\" instead of \"--download_busco_dataset\".") #если пользователь использовал --local_busco_dataset o_regular_expression_results = re.search(r" --local_busco_dataset (\S+)", s_command_line_reduced) @@ -460,13 +503,13 @@ sys.exit() - f_logs = open(s_path_to_the_output_folder + "/mabs_logs.txt","w",buffering=1) #f_logs это общий файл с логами Mabs-hifiasm, в отличие от трёх дополнительных файлов с логами, которые ведут три отдельных экземпляра Mabs-hifiasm. buffering=1 означает, что буферизация идёт только на уровне строк. + f_log = open(s_path_to_the_output_folder + "/mabs_log.txt","w",buffering=1) #f_log это общий файл с логами Mabs-hifiasm, в отличие от трёх дополнительных файлов с логами, которые ведут три отдельных экземпляра Mabs-hifiasm. buffering=1 означает, что буферизация идёт только на уровне строк. 
o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Started Mabs-hifiasm\n\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("Started Mabs-hifiasm\n\n") - f_logs.write("You have run Mabs-hifiasm of version " + s_Mabs_version + " with the following command: " + s_command_line + "\n\n") + f_log.write("You have run Mabs-hifiasm of version " + s_Mabs_version + " with the following command: " + s_command_line + "\n\n") #Это строка, в которой указаны пути ко всем ридам, которые нужно давать Modified_hifiasm, а также, если пользователь указал размер генома, то и размер генома. Например, "--hg-size 1g --h1 hic_reads_R1.fastq --hi2 hic_reads_R1.fastq --ul nanopore_reads.fastq hifi_reads.fastq", если был указан размер генома, и были указаны и риды Hi-C, и ультрадлинные риды Нанопора, и риды HiFi. Или, например, просто "hifi_reads.fastq" если размер генома не был указан, и были только риды HiFi. Эта строка нужна, чтобы Mabs-hifiasm было проще передавать аргументы командной строки Modified_hifiasm. Иначе, передача аргументов несколько осложнена, потому что в зависимости от того, какие опции дал Mabs-hifiasm пользователь, программа Modified_hifiasm нужно передавать разное количество аргументов. s_command_line_arguments_with_reads_for_Modified_hifiasm = s_path_to_pacbio_hifi_reads @@ -486,9 +529,9 @@ #если пользователь делает сборку тестового набора ридов Mabs-hifiasm, то нужно написать подробности этого тестового набора. 
if (len(sys.argv) == 2) and re.search(r"\s\-\-run_test", s_command_line): - f_logs.write("As a test, Mabs-hifiasm will assemble the first chromosome of Saccharomyces cerevisiae, which is approximately 200 kbp long, using 40x PacBio HiFi reads.\n\n") - f_logs.write("The command \"mabs-hifiasm.py --run_test\" is equivalent to the command \"mabs-hifiasm.py --pacbio_hifi_reads " + s_path_to_the_folder_where_Mabs_lies + "/Test_datasets/pacbio_hifi_reads__test_set__for_diploid_assembly.fastq.gz --download_busco_dataset saccharomycetes_odb10.2020-08-05.tar.gz\"\n") - f_logs.write("If after Mabs-hifiasm finishes you see a file ./Mabs_results/The_best_assembly/assembly.fasta which has a size of approximately 200 kilobytes, then the test succeeded.\n\n") + f_log.write("As a test, Mabs-hifiasm will assemble the first chromosome of Saccharomyces cerevisiae, which is approximately 200 kbp long, using 40x PacBio HiFi reads.\n\n") + f_log.write("The command \"mabs-hifiasm.py --run_test\" is equivalent to the command \"mabs-hifiasm.py --pacbio_hifi_reads " + s_path_to_the_folder_where_Mabs_lies + "/Test_datasets/pacbio_hifi_reads__test_set__for_diploid_assembly.fastq.gz --download_busco_dataset saccharomycetes_odb10.2020-08-05.tar.gz\"\n") + f_log.write("If after Mabs-hifiasm finishes you see a file ./Mabs_results/The_best_assembly/assembly.fasta which has a size of approximately 200 kilobytes, then the test succeeded.\n\n") #если пользователь сказал скачать файл с базой BUSCO или сам дал файл (но не папку), то разархивирую файл и меняю значение переменной s_path_to_a_local_busco_dataset с пути к файлу на путь к папке. if os.path.isfile(s_path_to_a_local_busco_dataset): @@ -500,7 +543,7 @@ s_path_to_a_local_busco_dataset = s_path_to_the_output_folder + "/" + s_busco_dataset_name #Оставляю из базы BUSCO только нужное количество (s_number_of_busco_orthogroups_to_use) ортогрупп — тех, которые имеют наиболее консервативные последовательности. 
Если пользователь указал использовать все ортогруппы, то Mabs-hifiasm использует все. Если пользователь указал больше ортогрупп, чем есть в этом наборе BUSCO, то Mabs-hifiasm использует все и пишет Warning в основной файл с логами. - mabs_function_preprocess_busco_dataset.function_preprocess_busco_dataset(s_path_to_a_local_busco_dataset, s_number_of_busco_orthogroups_to_use, s_path_to_the_output_folder, f_logs) + mabs_function_preprocess_busco_dataset.function_preprocess_busco_dataset(s_path_to_a_local_busco_dataset, s_number_of_busco_orthogroups_to_use, s_path_to_the_output_folder, f_log) #делаю ссылку на файл "ancestral", давая ему расширение .fasta. Затем делаю базу данных DIAMOND. #с помощью os.path.abspath() я получают абсолютный путь. Если он относительный, то это может создать проблемы в работоспособности мягкой ссылки. @@ -523,47 +566,55 @@ else: s_output_extension = "fasta" + #Проверяю, что DIAMOND выдал файл. Файла может не быть, если у DIAMOND были какие-то проблемы при запуске (см. https://github.com/shelkmike/Mabs/issues/3) + if not os.path.exists(s_path_to_the_output_folder + "/diamond_results_for_alignment_of_pacbio_hifi_reads_to_busco_proteins.txt"): + print("Mabs-hifiasm has stopped because there was an error during DIAMOND execution.") + sys.exit() + os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/Additional/get_single_end_reads_from_DIAMOND_results.py " + s_path_to_pacbio_hifi_reads + " " + s_path_to_the_output_folder + "/diamond_results_for_alignment_of_pacbio_hifi_reads_to_busco_proteins.txt " + s_path_to_the_output_folder + "/pacbio_hifi_reads_that_have_matches_to_busco_proteins." + s_output_extension) s_path_to_pacbio_hifi_reads_that_correspond_to_busco_genes = s_path_to_the_output_folder + "/pacbio_hifi_reads_that_have_matches_to_busco_proteins." + s_output_extension - #Теперь, собственно, начинаю проверку 10 точек методом золотого сечения. Параметр, который я оптимизирую, это параметр "-s" Hifiasm. 
Стартовый интервал -s: [0;1]. n_point_1 это самая левая в данный момент точка (то есть, с наименьшим -s), n_point_4 это самая правая (то есть, с наибольшим -s), а n_point_2 и n_point_3 это две промежуточные, положение которых, собственно, и определяется золотым сечением. - n_point_1 = 0 #Нижняя граница пробуемых -s. - n_point_4 = 1 #Верхняя граница пробуемых -s. - n_point_2 = round(n_point_1 + ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(n_point_4 - n_point_1), 3) #округлю до третьего знака после запятой, иначе у Питона иногда вылезают числа вроде 0.144200000001 - n_point_3 = round(n_point_4 - ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(n_point_4 - n_point_1), 3) + #Теперь, собственно, начинаю проверку 10 точек методом золотого сечения. Параметр, который я оптимизирую, это параметр "-s" Hifiasm. Стартовый интервал -s: [0;1]. n_golden_section_point_1 это самая левая в данный момент точка (то есть, с наименьшим -s), n_golden_section_point_4 это самая правая (то есть, с наибольшим -s), а n_golden_section_point_2 и n_golden_section_point_3 это две промежуточные, положение которых, собственно, и определяется золотым сечением. + #ВАЖНО: возможно, в будущем нужно изменить терминологию. Потому что сейчас можно перепутать обозначения четырёх точек метода золотого сечения и те "точки", которые в n_number_of_the_point_under_analysis + n_golden_section_point_1 = 0 #Нижняя граница пробуемых -s. + n_golden_section_point_4 = 1 #Верхняя граница пробуемых -s. 
+ n_golden_section_point_2 = round(n_golden_section_point_1 + ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(n_golden_section_point_4 - n_golden_section_point_1), 3) #округлю до третьего знака после запятой, иначе у Питона иногда вылезают числа вроде 0.144200000001 + n_golden_section_point_3 = round(n_golden_section_point_4 - ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(n_golden_section_point_4 - n_golden_section_point_1), 3) #Для 0 и 1 (двух крайних точек стартового интервала) я не делаю измерений, потому что метод золотого сечения этого не требует. #Это список, в который для каждого проверенного -s будет записан AG. Ключ это -s, а значение это AG. d_s_to_AG = {} #Например, [0.362] = 762. + #Это список, в который для каждого проверенного -s будет записан N50. Ключ это -s, а значение это N50. + d_s_to_N50 = {} #Например, [0.362] = 12345678. #Анализирую вторую точку. n_number_of_the_point_under_analysis = 1 - n_s = n_point_2 + n_s = n_golden_section_point_2 o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Mabs-hifiasm started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. -s in this point is " + str(n_s) + "\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("Mabs-hifiasm started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. -s in this point is " + str(n_s) + "\n") - os.mkdir(s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s)) + os.mkdir(s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis)) #Делаю сборку, после чего конвертирую файл p_ctg.gfa в FASTA, делая файл assembly.fasta . 
- os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Modified_hifiasm/modified_hifiasm -s " + str(n_s) + " -o " + s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly --only-primary --n-hap " + str(n_ploidy) + " -t " + str(n_number_of_cpu_threads_to_use) + " " + s_additional_hifiasm_parameters + " " + s_command_line_arguments_with_reads_for_Modified_hifiasm) + os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Modified_hifiasm/modified_hifiasm -s " + str(n_s) + " -o " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly --only-primary --n-hap " + str(n_ploidy) + " -t " + str(n_number_of_cpu_threads_to_use) + " " + s_additional_hifiasm_parameters + " " + s_command_line_arguments_with_reads_for_Modified_hifiasm) #Название выходного файла зависит от того, давал ли пользователь риды Hi-C или нет. #если пользователь не дал риды Hi-C if (s_path_to_hic_short_reads_R1 == ""): - s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.bp.p_ctg.gfa" + s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.bp.p_ctg.gfa" #если пользователь дал риды Hi-C. if (s_path_to_hic_short_reads_R1 != ""): - s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.hic.p_ctg.gfa" + s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.hic.p_ctg.gfa" #теперь из файла GFA с первичными контигами делаю файл FASTA с ними. 
f_infile = open(s_path_to_gfa_with_primary_contigs, "r") - f_outfile = open(s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.fasta", "w") + f_outfile = open(s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta", "w") for s_line in f_infile: #S ptg000001l AGTTTACGTTGAACAACCTCCAGGGTTTGT... o_regular_expression_results = re.search(r"^[sS]\s+(\S+)\s+(\S+)", s_line) @@ -572,58 +623,65 @@ f_infile.close() f_outfile.close() - s_path_to_the_last_assembly_folder = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/" #путь к последней папке со сборкой. Нужен, чтобы из неё перемещать файлы с расширениями .bin и .utg в новую папку со сборкой. Их присутствие ускоряет сборку. + s_path_to_the_last_assembly_folder = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/" #путь к последней папке со сборкой. Нужен, чтобы из неё перемещать файлы с расширениями .bin и .utg в новую папку со сборкой. Их присутствие ускоряет сборку. #"--number_of_busco_orthogroups all" использую потому, что в папке BUSCO_dataset_to_use уже оставлены только те ортогруппы, которые нужно использовать. 
- os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + " --assembly " + s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.fasta --pacbio_hifi_reads " + s_path_to_pacbio_hifi_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe false --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) + os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + " --assembly " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta --pacbio_hifi_reads " + s_path_to_pacbio_hifi_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe false --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) #Беру AG, посчитанный скриптом calculate_AG.py - if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + "/AG.txt"): - f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + "/AG.txt", "r") + if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + "/AG.txt"): + f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + "/AG.txt", "r") s_line_1 = f_infile.readline() + f_infile.close() #AG is 487 o_regular_expression_results = re.search(r"AG is (\d+)", s_line_1) - n_AG_for_point_2 = int(o_regular_expression_results.group(1)) + 
n_AG_for_golden_section_point_2 = int(o_regular_expression_results.group(1)) else: - f_logs.write("Error. Couldn't calculate AG. See stderr and stdout for the reason why.") + f_log.write("Error. Couldn't calculate AG. See stderr and stdout for the reason why.") sys.exit() - d_s_to_AG[n_s] = n_AG_for_point_2 + d_s_to_AG[n_s] = n_AG_for_golden_section_point_2 + + #Считаю N50 + s_command_output = subprocess.getoutput("python3 " + s_path_to_the_folder_where_Mabs_lies + "/Additional/calculate_N50.py " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta") #скрипт calculate_N50.py выдаёт N50. + s_command_output = re.sub(r"\n", r"", s_command_output) #удаляю символ переноса строки + n_N50_for_golden_section_point_2 = int(s_command_output) + d_s_to_N50[n_s] = n_N50_for_golden_section_point_2 o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("AG for -s " + str(n_s) + " is " + str(n_AG_for_point_2) + "\n\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("For -s = " + str(n_s) + ": AG = " + str(n_AG_for_golden_section_point_2) + " and N50 = " + str(n_N50_for_golden_section_point_2) + "\n\n") #Анализирую третью точку. n_number_of_the_point_under_analysis += 1 - n_s = n_point_3 + n_s = n_golden_section_point_3 o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Mabs-hifiasm started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. -s in this point is " + str(n_s) + "\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("Mabs-hifiasm started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. 
-s in this point is " + str(n_s) + "\n") - os.mkdir(s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s)) + os.mkdir(s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis)) #Делаю сборку, после чего конвертирую файл p_ctg.gfa в FASTA, делая файл assembly.fasta . #Перемещаю из прошлой папки со сборкой в эту файлы, названия которых имеют форму *.bin или *utg*. Присутствие этих файлов ускоряет сборку. - os.system("mv " + s_path_to_the_last_assembly_folder + "/*.bin " + s_path_to_the_last_assembly_folder + "/*utg* " + s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/") + os.system("mv " + s_path_to_the_last_assembly_folder + "/*.bin " + s_path_to_the_last_assembly_folder + "/*utg* " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/") - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Modified_hifiasm/modified_hifiasm -s " + str(n_s) + " -o " + s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly --only-primary --n-hap " + str(n_ploidy) + " -t " + str(n_number_of_cpu_threads_to_use) + " " + s_additional_hifiasm_parameters + " " + s_command_line_arguments_with_reads_for_Modified_hifiasm) + os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Modified_hifiasm/modified_hifiasm -s " + str(n_s) + " -o " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly --only-primary --n-hap " + str(n_ploidy) + " -t " + str(n_number_of_cpu_threads_to_use) + " " + s_additional_hifiasm_parameters + " " + s_command_line_arguments_with_reads_for_Modified_hifiasm) #Название выходного файла зависит от того, давал ли пользователь риды Hi-C или нет. 
#если пользователь не дал риды Hi-C if (s_path_to_hic_short_reads_R1 == ""): - s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.bp.p_ctg.gfa" + s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.bp.p_ctg.gfa" #если пользователь дал риды Hi-C. if (s_path_to_hic_short_reads_R1 != ""): - s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.hic.p_ctg.gfa" + s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.hic.p_ctg.gfa" #теперь из файла GFA с первичными контигами делаю файл FASTA с ними. f_infile = open(s_path_to_gfa_with_primary_contigs, "r") - f_outfile = open(s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.fasta", "w") + f_outfile = open(s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta", "w") for s_line in f_infile: #S ptg000001l AGTTTACGTTGAACAACCTCCAGGGTTTGT... o_regular_expression_results = re.search(r"^[sS]\s+(\S+)\s+(\S+)", s_line) @@ -632,73 +690,85 @@ f_infile.close() f_outfile.close() - s_path_to_the_last_assembly_folder = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/" #путь к последней папке со сборкой. Нужен, чтобы перемещать из неё в новую папку со сборкой файлы, названия которых имеют форму *.bin или *utg*. Присутствие этих файлов ускоряет сборку. + s_path_to_the_last_assembly_folder = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/" #путь к последней папке со сборкой. Нужен, чтобы перемещать из неё в новую папку со сборкой файлы, названия которых имеют форму *.bin или *utg*. Присутствие этих файлов ускоряет сборку. 
#"--number_of_busco_orthogroups all" использую потому, что в папке BUSCO_dataset_to_use уже оставлены только те ортогруппы, которые нужно использовать. - os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + " --assembly " + s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.fasta --pacbio_hifi_reads " + s_path_to_pacbio_hifi_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe false --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) + os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + " --assembly " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta --pacbio_hifi_reads " + s_path_to_pacbio_hifi_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe false --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) #Беру AG, посчитанный скриптом calculate_AG.py - if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + "/AG.txt"): - f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + "/AG.txt", "r") + if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + "/AG.txt"): + f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + "/AG.txt", "r") s_line_1 = f_infile.readline() + 
f_infile.close() #AG is 487 o_regular_expression_results = re.search(r"AG is (\d+)", s_line_1) - n_AG_for_point_3 = int(o_regular_expression_results.group(1)) + n_AG_for_golden_section_point_3 = int(o_regular_expression_results.group(1)) else: - f_logs.write("Error. Couldn't calculate AG. See stderr and stdout for the reason why.") + f_log.write("Error. Couldn't calculate AG. See stderr and stdout for the reason why.") sys.exit() - d_s_to_AG[n_s] = n_AG_for_point_3 + d_s_to_AG[n_s] = n_AG_for_golden_section_point_3 + + #Считаю N50 + s_command_output = subprocess.getoutput("python3 " + s_path_to_the_folder_where_Mabs_lies + "/Additional/calculate_N50.py " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta") #скрипт calculate_N50.py выдаёт N50. + s_command_output = re.sub(r"\n", r"", s_command_output) #удаляю символ переноса строки + n_N50_for_golden_section_point_3 = int(s_command_output) + d_s_to_N50[n_s] = n_N50_for_golden_section_point_3 o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("AG for -s " + str(n_s) + " is " + str(n_AG_for_point_3) + "\n\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("For -s = " + str(n_s) + ": AG = " + str(n_AG_for_golden_section_point_3) + " and N50 = " + str(n_N50_for_golden_section_point_3) + "\n\n") #теперь последовательно выбираю остальные 8 точек методом золотого сечения и меряю AG для них. while n_number_of_the_point_under_analysis < 10: #"<", а не "<=", потому что увеличение номера точки здесь делается в начале цикла. n_number_of_the_point_under_analysis += 1 - #Смотрю, какая из двух центральных точек (вторая или третья) имеют меньшее значение AG. Если вторая имеет меньшее ли равное третьей, то выкидываю первую точку и сужаю интервал. 
Если третья имеет меньшее, чем вторая, то выкидываю четвёртую точку и сужаю интервал. При равных значениях выкидывыю правую. Правую выкидываю потому, что по моим впечатлениям оптимальный -s чаще бывает ближе к 0, чем к 1. - if n_AG_for_point_2 < n_AG_for_point_3: - n_point_1 = n_point_2 - n_point_2 = n_point_3 - #n_point_4 не меняется - n_point_3 = round((n_point_4 - ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(n_point_4 - n_point_1)), 3) + #Смотрю, какая из двух центральных точек (вторая или третья) имеют меньшее значение AG. Если вторая имеет меньшее, то выкидываю первую точку и сужаю интервал. Если третья имеет меньшее, чем вторая, то выкидываю четвёртую точку и сужаю интервал. При равных значениях AG делаю такое же сравнение, но для N50. Если и N50 равные, то выкидываю правую точку. Правую выкидываю потому, что по моим впечатлениям оптимальный -s чаще бывает ближе к 0, чем к 1. + if (n_AG_for_golden_section_point_2 < n_AG_for_golden_section_point_3) or ((n_AG_for_golden_section_point_2 == n_AG_for_golden_section_point_3) and (n_N50_for_golden_section_point_2 < n_N50_for_golden_section_point_3)): + n_golden_section_point_1 = n_golden_section_point_2 + n_golden_section_point_2 = n_golden_section_point_3 + #n_golden_section_point_4 не меняется + n_golden_section_point_3 = round((n_golden_section_point_4 - ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(n_golden_section_point_4 - n_golden_section_point_1)), 3) + + n_AG_for_golden_section_point_1 = n_AG_for_golden_section_point_2 + n_N50_for_golden_section_point_1 = n_N50_for_golden_section_point_2 + + n_AG_for_golden_section_point_2 = n_AG_for_golden_section_point_3 + n_N50_for_golden_section_point_2 = n_N50_for_golden_section_point_3 - n_AG_for_point_1 = n_AG_for_point_2 - n_AG_for_point_2 = n_AG_for_point_3 - #n_AG_for_point_4 не меняется - n_AG_for_point_3 = -100 #плейсхолдер. Всё равно это значение я сейчас посчитаю. 
+ #n_AG_for_golden_section_point_4 и n_N50_for_golden_section_point_4 не меняется + n_AG_for_golden_section_point_3 = -100 #плейсхолдер. Всё равно это значение я сейчас посчитаю. + n_N50_for_golden_section_point_3 = -100 #плейсхолдер. Всё равно это значение я сейчас посчитаю. #Анализирую третью точку. - n_s = n_point_3 + n_s = n_golden_section_point_3 o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Mabs-hifiasm started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. -s in this point is " + str(n_s) + "\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("Mabs-hifiasm started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. -s in this point is " + str(n_s) + "\n") - os.mkdir(s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s)) + os.mkdir(s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis)) #Делаю сборку, после чего конвертирую файл p_ctg.gfa в FASTA, делая файл assembly.fasta . #Перемещаю из прошлой папки со сборкой в эту файлы, названия которых имеют форму *.bin или *utg*. Присутствие этих файлов ускоряет сборку. 
- os.system("mv " + s_path_to_the_last_assembly_folder + "/*.bin " + s_path_to_the_last_assembly_folder + "/*utg* " + s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/") + os.system("mv " + s_path_to_the_last_assembly_folder + "/*.bin " + s_path_to_the_last_assembly_folder + "/*utg* " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/") - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Modified_hifiasm/modified_hifiasm -s " + str(n_s) + " -o " + s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly --only-primary --n-hap " + str(n_ploidy) + " -t " + str(n_number_of_cpu_threads_to_use) + " " + s_additional_hifiasm_parameters + " " + s_command_line_arguments_with_reads_for_Modified_hifiasm) + os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Modified_hifiasm/modified_hifiasm -s " + str(n_s) + " -o " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly --only-primary --n-hap " + str(n_ploidy) + " -t " + str(n_number_of_cpu_threads_to_use) + " " + s_additional_hifiasm_parameters + " " + s_command_line_arguments_with_reads_for_Modified_hifiasm) #Название выходного файла зависит от того, давал ли пользователь риды Hi-C или нет. #если пользователь не дал риды Hi-C if (s_path_to_hic_short_reads_R1 == ""): - s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.bp.p_ctg.gfa" + s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.bp.p_ctg.gfa" #если пользователь дал риды Hi-C. 
if (s_path_to_hic_short_reads_R1 != ""): - s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.hic.p_ctg.gfa" + s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.hic.p_ctg.gfa" #теперь из файла GFA с первичными контигами делаю файл FASTA с ними. f_infile = open(s_path_to_gfa_with_primary_contigs, "r") - f_outfile = open(s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.fasta", "w") + f_outfile = open(s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta", "w") for s_line in f_infile: #S ptg000001l AGTTTACGTTGAACAACCTCCAGGGTTTGT... o_regular_expression_results = re.search(r"^[sS]\s+(\S+)\s+(\S+)", s_line) @@ -707,68 +777,80 @@ f_infile.close() f_outfile.close() - s_path_to_the_last_assembly_folder = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/" #путь к последней папке со сборкой. Нужен, чтобы перемещать из неё в новую папку со сборкой файлы, названия которых имеют форму *.bin или *utg*. Присутствие этих файлов ускоряет сборку. + s_path_to_the_last_assembly_folder = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/" #путь к последней папке со сборкой. Нужен, чтобы перемещать из неё в новую папку со сборкой файлы, названия которых имеют форму *.bin или *utg*. Присутствие этих файлов ускоряет сборку. #"--number_of_busco_orthogroups all" использую потому, что в папке BUSCO_dataset_to_use уже оставлены только те ортогруппы, которые нужно использовать. 
- os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + " --assembly " + s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.fasta --pacbio_hifi_reads " + s_path_to_pacbio_hifi_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe false --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) + os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + " --assembly " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta --pacbio_hifi_reads " + s_path_to_pacbio_hifi_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe false --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) #Беру AG, посчитанный скриптом calculate_AG.py - if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + "/AG.txt"): - f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + "/AG.txt", "r") + if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + "/AG.txt"): + f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + "/AG.txt", "r") s_line_1 = f_infile.readline() + f_infile.close() #AG is 487 o_regular_expression_results = re.search(r"AG is (\d+)", s_line_1) - n_AG_for_point_3 = int(o_regular_expression_results.group(1)) + 
n_AG_for_golden_section_point_3 = int(o_regular_expression_results.group(1)) else: - f_logs.write("Error. Couldn't calculate AG. See stderr and stdout for the reason why.") + f_log.write("Error. Couldn't calculate AG. See stderr and stdout for the reason why.") sys.exit() - d_s_to_AG[n_s] = n_AG_for_point_3 + d_s_to_AG[n_s] = n_AG_for_golden_section_point_3 + + #Считаю N50 + s_command_output = subprocess.getoutput("python3 " + s_path_to_the_folder_where_Mabs_lies + "/Additional/calculate_N50.py " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta") #скрипт calculate_N50.py выдаёт N50. + s_command_output = re.sub(r"\n", r"", s_command_output) #удаляю символ переноса строки + n_N50_for_golden_section_point_3 = int(s_command_output) + d_s_to_N50[n_s] = n_N50_for_golden_section_point_3 o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("AG for -s " + str(n_s) + " is " + str(n_AG_for_point_3) + "\n\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("For -s = " + str(n_s) + ": AG = " + str(n_AG_for_golden_section_point_3) + " and N50 = " + str(n_N50_for_golden_section_point_3) + "\n\n") + + elif (n_AG_for_golden_section_point_2 > n_AG_for_golden_section_point_3) or ((n_AG_for_golden_section_point_2 == n_AG_for_golden_section_point_3) and (n_N50_for_golden_section_point_2 >= n_N50_for_golden_section_point_3)): + #n_golden_section_point_1 не меняется + n_golden_section_point_4 = n_golden_section_point_3 + n_golden_section_point_3 = n_golden_section_point_2 + n_golden_section_point_2 = round((n_golden_section_point_1 + ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(n_golden_section_point_4 - n_golden_section_point_1)), 3) - elif n_AG_for_point_2 >= n_AG_for_point_3: - #n_point_1 не меняется - n_point_4 = n_point_3 - n_point_3 = n_point_2 - n_point_2 = 
round((n_point_1 + ((math.sqrt(5) - 1) / (math.sqrt(5) + 1))*(n_point_4 - n_point_1)), 3) + #n_AG_for_golden_section_point_1 и n_N50_for_golden_section_point_1 не меняются + n_AG_for_golden_section_point_4 = n_AG_for_golden_section_point_3 + n_N50_for_golden_section_point_4 = n_N50_for_golden_section_point_3 - #n_AG_for_point_1 не меняется - n_AG_for_point_4 = n_AG_for_point_3 - n_AG_for_point_3 = n_AG_for_point_2 - n_AG_for_point_2 = -100 #плейсхолдер. Всё равно это значение я сейчас посчитаю. + n_AG_for_golden_section_point_3 = n_AG_for_golden_section_point_2 + n_N50_for_golden_section_point_3 = n_N50_for_golden_section_point_2 + + n_AG_for_golden_section_point_2 = -100 #плейсхолдер. Всё равно это значение я сейчас посчитаю. + n_N50_for_golden_section_point_2 = -100 #плейсхолдер. Всё равно это значение я сейчас посчитаю. #Анализирую вторую точку. - n_s = n_point_2 + n_s = n_golden_section_point_2 o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Mabs-hifiasm started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. -s in this point is " + str(n_s) + "\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("Mabs-hifiasm started to analyze point " + str(n_number_of_the_point_under_analysis) + " of 10. -s in this point is " + str(n_s) + "\n") - os.mkdir(s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s)) + os.mkdir(s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis)) #Делаю сборку, после чего конвертирую файл p_ctg.gfa в FASTA, делая файл assembly.fasta . #Перемещаю из прошлой папки со сборкой в эту файлы, названия которых имеют форму *.bin или *utg*. Присутствие этих файлов ускоряет сборку. 
- os.system("mv " + s_path_to_the_last_assembly_folder + "/*.bin " + s_path_to_the_last_assembly_folder + "/*utg* " + s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/") + os.system("mv " + s_path_to_the_last_assembly_folder + "/*.bin " + s_path_to_the_last_assembly_folder + "/*utg* " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/") - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Modified_hifiasm/modified_hifiasm -s " + str(n_s) + " -o " + s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly --only-primary --n-hap " + str(n_ploidy) + " -t " + str(n_number_of_cpu_threads_to_use) + " " + s_additional_hifiasm_parameters + " " + s_command_line_arguments_with_reads_for_Modified_hifiasm) + os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Modified_hifiasm/modified_hifiasm -s " + str(n_s) + " -o " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly --only-primary --n-hap " + str(n_ploidy) + " -t " + str(n_number_of_cpu_threads_to_use) + " " + s_additional_hifiasm_parameters + " " + s_command_line_arguments_with_reads_for_Modified_hifiasm) #Название выходного файла зависит от того, давал ли пользователь риды Hi-C или нет. #если пользователь не дал риды Hi-C if (s_path_to_hic_short_reads_R1 == ""): - s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.bp.p_ctg.gfa" + s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.bp.p_ctg.gfa" #если пользователь дал риды Hi-C. 
if (s_path_to_hic_short_reads_R1 != ""): - s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.hic.p_ctg.gfa" + s_path_to_gfa_with_primary_contigs = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.hic.p_ctg.gfa" #теперь из файла GFA с первичными контигами делаю файл FASTA с ними. f_infile = open(s_path_to_gfa_with_primary_contigs, "r") - f_outfile = open(s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.fasta", "w") + f_outfile = open(s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta", "w") for s_line in f_infile: #S ptg000001l AGTTTACGTTGAACAACCTCCAGGGTTTGT... o_regular_expression_results = re.search(r"^[sS]\s+(\S+)\s+(\S+)", s_line) @@ -778,48 +860,63 @@ f_outfile.close() #"--number_of_busco_orthogroups all" использую потому, что в папке BUSCO_dataset_to_use уже оставлены только те ортогруппы, которые нужно использовать. 
- os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + " --assembly " + s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/assembly.fasta --pacbio_hifi_reads " + s_path_to_pacbio_hifi_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe false --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) + os.system("python3 " + s_path_to_the_folder_where_Mabs_lies + "/calculate_AG.py --output_folder " + s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + " --assembly " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta --pacbio_hifi_reads " + s_path_to_pacbio_hifi_reads_that_correspond_to_busco_genes + " --number_of_busco_orthogroups all --local_busco_dataset " + s_path_to_the_output_folder + "/BUSCO_dataset_to_use --use_proovframe false --max_intron_length " + s_maximum_allowed_intron_length + " --threads " + str(n_number_of_cpu_threads_to_use)) - s_path_to_the_last_assembly_folder = s_path_to_the_output_folder + "/Assembly_for_-s_" + str(n_s) + "/" #путь к последней папке со сборкой. Нужен, чтобы перемещать из неё в новую папку со сборкой файлы, названия которых имеют форму *.bin или *utg*. Присутствие этих файлов ускоряет сборку. + s_path_to_the_last_assembly_folder = s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/" #путь к последней папке со сборкой. Нужен, чтобы перемещать из неё в новую папку со сборкой файлы, названия которых имеют форму *.bin или *utg*. Присутствие этих файлов ускоряет сборку. 
#Беру AG, посчитанный скриптом calculate_AG.py - if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + "/AG.txt"): - f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_-s_" + str(n_s) + "/AG.txt", "r") + if os.path.isfile(s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + "/AG.txt"): + f_infile = open(s_path_to_the_output_folder + "/AG_calculation_for_point_" + str(n_number_of_the_point_under_analysis) + "/AG.txt", "r") s_line_1 = f_infile.readline() + f_infile.close() #AG is 487 o_regular_expression_results = re.search(r"AG is (\d+)", s_line_1) - n_AG_for_point_2 = int(o_regular_expression_results.group(1)) + n_AG_for_golden_section_point_2 = int(o_regular_expression_results.group(1)) else: - f_logs.write("Error. Couldn't calculate AG. See stderr and stdout for the reason why.") + f_log.write("Error. Couldn't calculate AG. See stderr and stdout for the reason why.") + + d_s_to_AG[n_s] = n_AG_for_golden_section_point_2 - d_s_to_AG[n_s] = n_AG_for_point_2 + #Считаю N50 + s_command_output = subprocess.getoutput("python3 " + s_path_to_the_folder_where_Mabs_lies + "/Additional/calculate_N50.py " + s_path_to_the_output_folder + "/Assembly_for_point_" + str(n_number_of_the_point_under_analysis) + "/assembly.fasta") #скрипт calculate_N50.py выдаёт N50. 
+ s_command_output = re.sub(r"\n", r"", s_command_output) #удаляю символ переноса строки + n_N50_for_golden_section_point_2 = int(s_command_output) + d_s_to_N50[n_s] = n_N50_for_golden_section_point_2 o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("AG for -s " + str(n_s) + " is " + str(n_AG_for_point_2) + "\n\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("For -s = " + str(n_s) + ": AG = " + str(n_AG_for_golden_section_point_2) + " and N50 = " + str(n_N50_for_golden_section_point_2) + "\n\n") - #После того, как посчитал AG для всех 10 точек, я смотрю, какая из них дала лучший AG. После этого делаю сборку для этого значения "-s", но на этот раз без использования параметра Modified_hifiasm "--only-primary", потому что тут я хочу сделать все файлы, в том числе файлы с фазированной сборкой — может быть, они будут полезны пользователю. Если две точки дают одинаковый AG, то, для определённости, выбираю ту из них, которая имеет меньший -s. - n_s_that_provides_maximum_AG = -100 - n_maximum_AG = -100 + + #После того, как посчитал AG для всех 10 точек, я смотрю, какая из них дала наибольший AG. Если наибольший AG принадлежит сразу нескольким точкам, то выбираю ту из них, которая дала больший N50. Если несколько точек дают одинаковый AG и N50, то, для определённости, выбираю ту из них, которая имеет меньший -s. После этого делаю сборку для этого значения "-s", но на этот раз без использования параметра Modified_hifiasm "--only-primary", потому что тут я хочу сделать все файлы, в том числе файлы с фазированной сборкой — может быть, они будут полезны пользователю. 
+ + n_s_that_makes_the_best_assembly = -100 + n_AG_in_the_best_point = -100 + n_N50_in_the_best_point = -100 for n_s in d_s_to_AG: - if d_s_to_AG[n_s] > n_maximum_AG: - n_s_that_provides_maximum_AG = n_s - n_maximum_AG = d_s_to_AG[n_s] - - if (d_s_to_AG[n_s] == n_maximum_AG) and (n_s < n_s_that_provides_maximum_AG): - n_s_that_provides_maximum_AG = n_s + if d_s_to_AG[n_s] > n_AG_in_the_best_point: + n_s_that_makes_the_best_assembly = n_s + n_AG_in_the_best_point = d_s_to_AG[n_s] + n_N50_in_the_best_point = d_s_to_N50[n_s] + elif (d_s_to_AG[n_s] == n_AG_in_the_best_point) and (d_s_to_N50[n_s] > n_N50_in_the_best_point): + n_s_that_makes_the_best_assembly = n_s + n_AG_in_the_best_point = d_s_to_AG[n_s] + n_N50_in_the_best_point = d_s_to_N50[n_s] + elif (d_s_to_AG[n_s] == n_AG_in_the_best_point) and (d_s_to_N50[n_s] == n_N50_in_the_best_point) and (n_s < n_s_that_makes_the_best_assembly): + + n_s_that_makes_the_best_assembly = n_s o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("The optimal -s is " + str(n_s_that_provides_maximum_AG) + ", it provides AG = " + str(n_maximum_AG) + ". Now performing a full assembly for this value of -s.\n\n") + f_log.write(s_current_time_and_date + "\n") + f_log.write("The optimal -s is " + str(n_s_that_makes_the_best_assembly) + ", it provides AG = " + str(n_AG_in_the_best_point) + " and N50 = " + str(n_N50_in_the_best_point) + ". Now performing a full assembly for this value of -s.\n\n") os.mkdir(s_path_to_the_output_folder + "/The_best_assembly") #Перемещаю из прошлой папки со сборкой в эту файлы, названия которых имеют форму *.bin или *utg*. Присутствие этих файлов ускоряет сборку. 
os.system("mv " + s_path_to_the_last_assembly_folder + "/*.bin " + s_path_to_the_last_assembly_folder + "/*utg* " + s_path_to_the_output_folder + "/The_best_assembly/") - os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Modified_hifiasm/modified_hifiasm -s " + str(n_s_that_provides_maximum_AG) + " -o " + s_path_to_the_output_folder + "/The_best_assembly/assembly --n-hap " + str(n_ploidy) + " -t " + str(n_number_of_cpu_threads_to_use) + " " + s_additional_hifiasm_parameters + " " + s_command_line_arguments_with_reads_for_Modified_hifiasm) + os.system(s_path_to_the_folder_where_Mabs_lies + "/Additional/Modified_hifiasm/modified_hifiasm -s " + str(n_s_that_makes_the_best_assembly) + " -o " + s_path_to_the_output_folder + "/The_best_assembly/assembly --n-hap " + str(n_ploidy) + " -t " + str(n_number_of_cpu_threads_to_use) + " " + s_additional_hifiasm_parameters + " " + s_command_line_arguments_with_reads_for_Modified_hifiasm) #Название выходного файла зависит от того, давал ли пользователь риды Hi-C или нет. #если пользователь не дал риды Hi-C @@ -842,8 +939,8 @@ o_current_time_and_date = datetime.datetime.now() s_current_time_and_date = o_current_time_and_date.strftime("%H:%M:%S %Y-%m-%d") - f_logs.write(s_current_time_and_date + "\n") - f_logs.write("Mabs-hifiasm finished. The contigs are in the file " + s_path_to_the_output_folder + "/The_best_assembly/assembly.fasta") + f_log.write(s_current_time_and_date + "\n") + f_log.write("Mabs-hifiasm finished. The contigs are in the file " + s_path_to_the_output_folder + "/The_best_assembly/assembly.fasta")