+ Coverage for binette/__init__.py: + 100% +
+ ++ 0 statements + + + +
++ « prev + ^ index + » next + + coverage.py v7.3.2, + created at 2023-12-04 22:59 +0000 +
+ +diff --git a/coverage-badge.svg b/coverage-badge.svg new file mode 100644 index 0000000..af2f09e --- /dev/null +++ b/coverage-badge.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/coverage_html.js b/coverage_html.js new file mode 100644 index 0000000..5934882 --- /dev/null +++ b/coverage_html.js @@ -0,0 +1,624 @@ +// Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0 +// For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt + +// Coverage.py HTML report browser code. +/*jslint browser: true, sloppy: true, vars: true, plusplus: true, maxerr: 50, indent: 4 */ +/*global coverage: true, document, window, $ */ + +coverage = {}; + +// General helpers +function debounce(callback, wait) { + let timeoutId = null; + return function(...args) { + clearTimeout(timeoutId); + timeoutId = setTimeout(() => { + callback.apply(this, args); + }, wait); + }; +}; + +function checkVisible(element) { + const rect = element.getBoundingClientRect(); + const viewBottom = Math.max(document.documentElement.clientHeight, window.innerHeight); + const viewTop = 30; + return !(rect.bottom < viewTop || rect.top >= viewBottom); +} + +function on_click(sel, fn) { + const elt = document.querySelector(sel); + if (elt) { + elt.addEventListener("click", fn); + } +} + +// Helpers for table sorting +function getCellValue(row, column = 0) { + const cell = row.cells[column] // nosemgrep: eslint.detect-object-injection + if (cell.childElementCount == 1) { + const child = cell.firstElementChild + if (child instanceof HTMLTimeElement && child.dateTime) { + return child.dateTime + } else if (child instanceof HTMLDataElement && child.value) { + return child.value + } + } + return cell.innerText || cell.textContent; +} + +function rowComparator(rowA, rowB, column = 0) { + let valueA = getCellValue(rowA, column); + let valueB = getCellValue(rowB, column); + if (!isNaN(valueA) && !isNaN(valueB)) { + return valueA - valueB + } + return 
valueA.localeCompare(valueB, undefined, {numeric: true}); +} + +function sortColumn(th) { + // Get the current sorting direction of the selected header, + // clear state on other headers and then set the new sorting direction + const currentSortOrder = th.getAttribute("aria-sort"); + [...th.parentElement.cells].forEach(header => header.setAttribute("aria-sort", "none")); + if (currentSortOrder === "none") { + th.setAttribute("aria-sort", th.dataset.defaultSortOrder || "ascending"); + } else { + th.setAttribute("aria-sort", currentSortOrder === "ascending" ? "descending" : "ascending"); + } + + const column = [...th.parentElement.cells].indexOf(th) + + // Sort all rows and afterwards append them in order to move them in the DOM + Array.from(th.closest("table").querySelectorAll("tbody tr")) + .sort((rowA, rowB) => rowComparator(rowA, rowB, column) * (th.getAttribute("aria-sort") === "ascending" ? 1 : -1)) + .forEach(tr => tr.parentElement.appendChild(tr) ); +} + +// Find all the elements with data-shortcut attribute, and use them to assign a shortcut key. +coverage.assign_shortkeys = function () { + document.querySelectorAll("[data-shortcut]").forEach(element => { + document.addEventListener("keypress", event => { + if (event.target.tagName.toLowerCase() === "input") { + return; // ignore keypress from search filter + } + if (event.key === element.dataset.shortcut) { + element.click(); + } + }); + }); +}; + +// Create the events for the filter box. +coverage.wire_up_filter = function () { + // Cache elements. + const table = document.querySelector("table.index"); + const table_body_rows = table.querySelectorAll("tbody tr"); + const no_rows = document.getElementById("no_rows"); + + // Observe filter keyevents. 
+ document.getElementById("filter").addEventListener("input", debounce(event => { + // Keep running total of each metric, first index contains number of shown rows + const totals = new Array(table.rows[0].cells.length).fill(0); + // Accumulate the percentage as fraction + totals[totals.length - 1] = { "numer": 0, "denom": 0 }; // nosemgrep: eslint.detect-object-injection + + // Hide / show elements. + table_body_rows.forEach(row => { + if (!row.cells[0].textContent.includes(event.target.value)) { + // hide + row.classList.add("hidden"); + return; + } + + // show + row.classList.remove("hidden"); + totals[0]++; + + for (let column = 1; column < totals.length; column++) { + // Accumulate dynamic totals + cell = row.cells[column] // nosemgrep: eslint.detect-object-injection + if (column === totals.length - 1) { + // Last column contains percentage + const [numer, denom] = cell.dataset.ratio.split(" "); + totals[column]["numer"] += parseInt(numer, 10); // nosemgrep: eslint.detect-object-injection + totals[column]["denom"] += parseInt(denom, 10); // nosemgrep: eslint.detect-object-injection + } else { + totals[column] += parseInt(cell.textContent, 10); // nosemgrep: eslint.detect-object-injection + } + } + }); + + // Show placeholder if no rows will be displayed. + if (!totals[0]) { + // Show placeholder, hide table. + no_rows.style.display = "block"; + table.style.display = "none"; + return; + } + + // Hide placeholder, show table. + no_rows.style.display = null; + table.style.display = null; + + const footer = table.tFoot.rows[0]; + // Calculate new dynamic sum values based on visible rows. + for (let column = 1; column < totals.length; column++) { + // Get footer cell element. + const cell = footer.cells[column]; // nosemgrep: eslint.detect-object-injection + + // Set value into dynamic footer cell element. + if (column === totals.length - 1) { + // Percentage column uses the numerator and denominator, + // and adapts to the number of decimal places. 
+ const match = /\.([0-9]+)/.exec(cell.textContent); + const places = match ? match[1].length : 0; + const { numer, denom } = totals[column]; // nosemgrep: eslint.detect-object-injection + cell.dataset.ratio = `${numer} ${denom}`; + // Check denom to prevent NaN if filtered files contain no statements + cell.textContent = denom + ? `${(numer * 100 / denom).toFixed(places)}%` + : `${(100).toFixed(places)}%`; + } else { + cell.textContent = totals[column]; // nosemgrep: eslint.detect-object-injection + } + } + })); + + // Trigger change event on setup, to force filter on page refresh + // (filter value may still be present). + document.getElementById("filter").dispatchEvent(new Event("input")); +}; + +coverage.INDEX_SORT_STORAGE = "COVERAGE_INDEX_SORT_2"; + +// Loaded on index.html +coverage.index_ready = function () { + coverage.assign_shortkeys(); + coverage.wire_up_filter(); + document.querySelectorAll("[data-sortable] th[aria-sort]").forEach( + th => th.addEventListener("click", e => sortColumn(e.target)) + ); + + // Look for a localStorage item containing previous sort settings: + const stored_list = localStorage.getItem(coverage.INDEX_SORT_STORAGE); + + if (stored_list) { + const {column, direction} = JSON.parse(stored_list); + const th = document.querySelector("[data-sortable]").tHead.rows[0].cells[column]; // nosemgrep: eslint.detect-object-injection + th.setAttribute("aria-sort", direction === "ascending" ? 
"descending" : "ascending"); + th.click() + } + + // Watch for page unload events so we can save the final sort settings: + window.addEventListener("unload", function () { + const th = document.querySelector('[data-sortable] th[aria-sort="ascending"], [data-sortable] [aria-sort="descending"]'); + if (!th) { + return; + } + localStorage.setItem(coverage.INDEX_SORT_STORAGE, JSON.stringify({ + column: [...th.parentElement.cells].indexOf(th), + direction: th.getAttribute("aria-sort"), + })); + }); + + on_click(".button_prev_file", coverage.to_prev_file); + on_click(".button_next_file", coverage.to_next_file); + + on_click(".button_show_hide_help", coverage.show_hide_help); +}; + +// -- pyfile stuff -- + +coverage.LINE_FILTERS_STORAGE = "COVERAGE_LINE_FILTERS"; + +coverage.pyfile_ready = function () { + // If we're directed to a particular line number, highlight the line. + var frag = location.hash; + if (frag.length > 2 && frag[1] === "t") { + document.querySelector(frag).closest(".n").classList.add("highlight"); + coverage.set_sel(parseInt(frag.substr(2), 10)); + } else { + coverage.set_sel(0); + } + + on_click(".button_toggle_run", coverage.toggle_lines); + on_click(".button_toggle_mis", coverage.toggle_lines); + on_click(".button_toggle_exc", coverage.toggle_lines); + on_click(".button_toggle_par", coverage.toggle_lines); + + on_click(".button_next_chunk", coverage.to_next_chunk_nicely); + on_click(".button_prev_chunk", coverage.to_prev_chunk_nicely); + on_click(".button_top_of_page", coverage.to_top); + on_click(".button_first_chunk", coverage.to_first_chunk); + + on_click(".button_prev_file", coverage.to_prev_file); + on_click(".button_next_file", coverage.to_next_file); + on_click(".button_to_index", coverage.to_index); + + on_click(".button_show_hide_help", coverage.show_hide_help); + + coverage.filters = undefined; + try { + coverage.filters = localStorage.getItem(coverage.LINE_FILTERS_STORAGE); + } catch(err) {} + + if (coverage.filters) { + coverage.filters = 
JSON.parse(coverage.filters); + } + else { + coverage.filters = {run: false, exc: true, mis: true, par: true}; + } + + for (cls in coverage.filters) { + coverage.set_line_visibilty(cls, coverage.filters[cls]); // nosemgrep: eslint.detect-object-injection + } + + coverage.assign_shortkeys(); + coverage.init_scroll_markers(); + coverage.wire_up_sticky_header(); + + document.querySelectorAll("[id^=ctxs]").forEach( + cbox => cbox.addEventListener("click", coverage.expand_contexts) + ); + + // Rebuild scroll markers when the window height changes. + window.addEventListener("resize", coverage.build_scroll_markers); +}; + +coverage.toggle_lines = function (event) { + const btn = event.target.closest("button"); + const category = btn.value + const show = !btn.classList.contains("show_" + category); + coverage.set_line_visibilty(category, show); + coverage.build_scroll_markers(); + coverage.filters[category] = show; + try { + localStorage.setItem(coverage.LINE_FILTERS_STORAGE, JSON.stringify(coverage.filters)); + } catch(err) {} +}; + +coverage.set_line_visibilty = function (category, should_show) { + const cls = "show_" + category; + const btn = document.querySelector(".button_toggle_" + category); + if (btn) { + if (should_show) { + document.querySelectorAll("#source ." + category).forEach(e => e.classList.add(cls)); + btn.classList.add(cls); + } + else { + document.querySelectorAll("#source ." + category).forEach(e => e.classList.remove(cls)); + btn.classList.remove(cls); + } + } +}; + +// Return the nth line div. +coverage.line_elt = function (n) { + return document.getElementById("t" + n)?.closest("p"); +}; + +// Set the selection. b and e are line numbers. +coverage.set_sel = function (b, e) { + // The first line selected. + coverage.sel_begin = b; + // The next line not selected. + coverage.sel_end = (e === undefined) ? 
b+1 : e; +}; + +coverage.to_top = function () { + coverage.set_sel(0, 1); + coverage.scroll_window(0); +}; + +coverage.to_first_chunk = function () { + coverage.set_sel(0, 1); + coverage.to_next_chunk(); +}; + +coverage.to_prev_file = function () { + window.location = document.getElementById("prevFileLink").href; +} + +coverage.to_next_file = function () { + window.location = document.getElementById("nextFileLink").href; +} + +coverage.to_index = function () { + location.href = document.getElementById("indexLink").href; +} + +coverage.show_hide_help = function () { + const helpCheck = document.getElementById("help_panel_state") + helpCheck.checked = !helpCheck.checked; +} + +// Return a string indicating what kind of chunk this line belongs to, +// or null if not a chunk. +coverage.chunk_indicator = function (line_elt) { + const classes = line_elt?.className; + if (!classes) { + return null; + } + const match = classes.match(/\bshow_\w+\b/); + if (!match) { + return null; + } + return match[0]; +}; + +coverage.to_next_chunk = function () { + const c = coverage; + + // Find the start of the next colored chunk. + var probe = c.sel_end; + var chunk_indicator, probe_line; + while (true) { + probe_line = c.line_elt(probe); + if (!probe_line) { + return; + } + chunk_indicator = c.chunk_indicator(probe_line); + if (chunk_indicator) { + break; + } + probe++; + } + + // There's a next chunk, `probe` points to it. + var begin = probe; + + // Find the end of this chunk. + var next_indicator = chunk_indicator; + while (next_indicator === chunk_indicator) { + probe++; + probe_line = c.line_elt(probe); + next_indicator = c.chunk_indicator(probe_line); + } + c.set_sel(begin, probe); + c.show_selection(); +}; + +coverage.to_prev_chunk = function () { + const c = coverage; + + // Find the end of the prev colored chunk. 
+ var probe = c.sel_begin-1; + var probe_line = c.line_elt(probe); + if (!probe_line) { + return; + } + var chunk_indicator = c.chunk_indicator(probe_line); + while (probe > 1 && !chunk_indicator) { + probe--; + probe_line = c.line_elt(probe); + if (!probe_line) { + return; + } + chunk_indicator = c.chunk_indicator(probe_line); + } + + // There's a prev chunk, `probe` points to its last line. + var end = probe+1; + + // Find the beginning of this chunk. + var prev_indicator = chunk_indicator; + while (prev_indicator === chunk_indicator) { + probe--; + if (probe <= 0) { + return; + } + probe_line = c.line_elt(probe); + prev_indicator = c.chunk_indicator(probe_line); + } + c.set_sel(probe+1, end); + c.show_selection(); +}; + +// Returns 0, 1, or 2: how many of the two ends of the selection are on +// the screen right now? +coverage.selection_ends_on_screen = function () { + if (coverage.sel_begin === 0) { + return 0; + } + + const begin = coverage.line_elt(coverage.sel_begin); + const end = coverage.line_elt(coverage.sel_end-1); + + return ( + (checkVisible(begin) ? 1 : 0) + + (checkVisible(end) ? 1 : 0) + ); +}; + +coverage.to_next_chunk_nicely = function () { + if (coverage.selection_ends_on_screen() === 0) { + // The selection is entirely off the screen: + // Set the top line on the screen as selection. 
+ + // This will select the top-left of the viewport + // As this is most likely the span with the line number we take the parent + const line = document.elementFromPoint(0, 0).parentElement; + if (line.parentElement !== document.getElementById("source")) { + // The element is not a source line but the header or similar + coverage.select_line_or_chunk(1); + } else { + // We extract the line number from the id + coverage.select_line_or_chunk(parseInt(line.id.substring(1), 10)); + } + } + coverage.to_next_chunk(); +}; + +coverage.to_prev_chunk_nicely = function () { + if (coverage.selection_ends_on_screen() === 0) { + // The selection is entirely off the screen: + // Set the lowest line on the screen as selection. + + // This will select the bottom-left of the viewport + // As this is most likely the span with the line number we take the parent + const line = document.elementFromPoint(document.documentElement.clientHeight-1, 0).parentElement; + if (line.parentElement !== document.getElementById("source")) { + // The element is not a source line but the header or similar + coverage.select_line_or_chunk(coverage.lines_len); + } else { + // We extract the line number from the id + coverage.select_line_or_chunk(parseInt(line.id.substring(1), 10)); + } + } + coverage.to_prev_chunk(); +}; + +// Select line number lineno, or if it is in a colored chunk, select the +// entire chunk +coverage.select_line_or_chunk = function (lineno) { + var c = coverage; + var probe_line = c.line_elt(lineno); + if (!probe_line) { + return; + } + var the_indicator = c.chunk_indicator(probe_line); + if (the_indicator) { + // The line is in a highlighted chunk. + // Search backward for the first line. 
+ var probe = lineno; + var indicator = the_indicator; + while (probe > 0 && indicator === the_indicator) { + probe--; + probe_line = c.line_elt(probe); + if (!probe_line) { + break; + } + indicator = c.chunk_indicator(probe_line); + } + var begin = probe + 1; + + // Search forward for the last line. + probe = lineno; + indicator = the_indicator; + while (indicator === the_indicator) { + probe++; + probe_line = c.line_elt(probe); + indicator = c.chunk_indicator(probe_line); + } + + coverage.set_sel(begin, probe); + } + else { + coverage.set_sel(lineno); + } +}; + +coverage.show_selection = function () { + // Highlight the lines in the chunk + document.querySelectorAll("#source .highlight").forEach(e => e.classList.remove("highlight")); + for (let probe = coverage.sel_begin; probe < coverage.sel_end; probe++) { + coverage.line_elt(probe).querySelector(".n").classList.add("highlight"); + } + + coverage.scroll_to_selection(); +}; + +coverage.scroll_to_selection = function () { + // Scroll the page if the chunk isn't fully visible. + if (coverage.selection_ends_on_screen() < 2) { + const element = coverage.line_elt(coverage.sel_begin); + coverage.scroll_window(element.offsetTop - 60); + } +}; + +coverage.scroll_window = function (to_pos) { + window.scroll({top: to_pos, behavior: "smooth"}); +}; + +coverage.init_scroll_markers = function () { + // Init some variables + coverage.lines_len = document.querySelectorAll("#source > p").length; + + // Build html + coverage.build_scroll_markers(); +}; + +coverage.build_scroll_markers = function () { + const temp_scroll_marker = document.getElementById("scroll_marker") + if (temp_scroll_marker) temp_scroll_marker.remove(); + // Don't build markers if the window has no scroll bar. 
+ if (document.body.scrollHeight <= window.innerHeight) { + return; + } + + const marker_scale = window.innerHeight / document.body.scrollHeight; + const line_height = Math.min(Math.max(3, window.innerHeight / coverage.lines_len), 10); + + let previous_line = -99, last_mark, last_top; + + const scroll_marker = document.createElement("div"); + scroll_marker.id = "scroll_marker"; + document.getElementById("source").querySelectorAll( + "p.show_run, p.show_mis, p.show_exc, p.show_exc, p.show_par" + ).forEach(element => { + const line_top = Math.floor(element.offsetTop * marker_scale); + const line_number = parseInt(element.querySelector(".n a").id.substr(1)); + + if (line_number === previous_line + 1) { + // If this solid missed block just make previous mark higher. + last_mark.style.height = `${line_top + line_height - last_top}px`; + } else { + // Add colored line in scroll_marker block. + last_mark = document.createElement("div"); + last_mark.id = `m${line_number}`; + last_mark.classList.add("marker"); + last_mark.style.height = `${line_height}px`; + last_mark.style.top = `${line_top}px`; + scroll_marker.append(last_mark); + last_top = line_top; + } + + previous_line = line_number; + }); + + // Append last to prevent layout calculation + document.body.append(scroll_marker); +}; + +coverage.wire_up_sticky_header = function () { + const header = document.querySelector("header"); + const header_bottom = ( + header.querySelector(".content h2").getBoundingClientRect().top - + header.getBoundingClientRect().top + ); + + function updateHeader() { + if (window.scrollY > header_bottom) { + header.classList.add("sticky"); + } else { + header.classList.remove("sticky"); + } + } + + window.addEventListener("scroll", updateHeader); + updateHeader(); +}; + +coverage.expand_contexts = function (e) { + var ctxs = e.target.parentNode.querySelector(".ctxs"); + + if (!ctxs.classList.contains("expanded")) { + var ctxs_text = ctxs.textContent; + var width = Number(ctxs_text[0]); + 
ctxs.textContent = ""; + for (var i = 1; i < ctxs_text.length; i += width) { + key = ctxs_text.substring(i, i + width).trim(); + ctxs.appendChild(document.createTextNode(contexts[key])); + ctxs.appendChild(document.createElement("br")); + } + ctxs.classList.add("expanded"); + } +}; + +document.addEventListener("DOMContentLoaded", () => { + if (document.body.classList.contains("indexfile")) { + coverage.index_ready(); + } else { + coverage.pyfile_ready(); + } +}); diff --git a/d_0bf1b44744b37667___init___py.html b/d_0bf1b44744b37667___init___py.html new file mode 100644 index 0000000..39439da --- /dev/null +++ b/d_0bf1b44744b37667___init___py.html @@ -0,0 +1,97 @@ + + +
+ ++ « prev + ^ index + » next + + coverage.py v7.3.2, + created at 2023-12-04 22:59 +0000 +
+ ++ « prev + ^ index + » next + + coverage.py v7.3.2, + created at 2023-12-04 22:59 +0000 +
+ +1import logging
+2import os
+3from collections import defaultdict
+4import pyfastx
+ +6import itertools
+7import networkx as nx
+8from typing import List, Dict, Iterable, Tuple, Set
+ +10class Bin:
+11 counter = 0
+ +13 def __init__(self, contigs: Iterable[str], origin: str, name: str) -> None:
+14 """
+15 Initialize a Bin object.
+ +17 :param contigs: Iterable of contig names belonging to the bin.
+18 :param origin: Origin/source of the bin.
+19 :param name: Name of the bin.
+20 """
+21 Bin.counter += 1
+ +23 self.origin = origin
+24 self.name = name
+25 self.id = Bin.counter
+26 self.contigs = set(contigs)
+27 self.hash = hash(str(sorted(self.contigs)))
+ +29 self.length = None
+30 self.N50 = None
+ +32 self.completeness = None
+33 self.contamination = None
+34 self.score = None
+ +36 def __eq__(self, other: 'Bin') -> bool:
+37 """
+38 Compare the Bin object with another object for equality.
+ +40 :param other: The object to compare with.
+41 :return: True if the objects are equal, False otherwise.
+42 """
+43 return self.contigs == other.contigs
+ +45 def __hash__(self) -> int:
+46 """
+47 Compute the hash value of the Bin object.
+ +49 :return: The hash value.
+50 """
+51 return self.hash
+ +53 def __str__(self) -> str:
+54 """
+55 Return a string representation of the Bin object.
+ +57 :return: The string representation of the Bin object.
+58 """
+59 return f"{self.origin}_{self.id} ({len(self.contigs)} contigs)"
+ +61 def overlaps_with(self, other: 'Bin') -> Set[str]:
+62 """
+63 Find the contigs that overlap between this bin and another bin.
+ +65 :param other: The other Bin object.
+66 :return: A set of contig names that overlap between the bins.
+67 """
+68 return self.contigs & other.contigs
+ +70 def __and__(self, other: 'Bin') -> 'Bin':
+71 """
+72 Perform a logical AND operation between this bin and another bin.
+ +74 :param other: The other Bin object.
+75 :return: A new Bin object representing the intersection of the bins.
+76 """
+77 contigs = self.contigs & other.contigs
+78 name = f"{self.name} & {other.name}"
+79 origin = f"{self.origin} & {other.origin}"
+ +81 return Bin(contigs, origin, name)
+ + +84 def add_length(self, length: int) -> None:
+85 """
+86 Add the length attribute to the Bin object if the provided length is a positive integer.
+ +88 :param length: The length value to add.
+89 :return: None
+90 """
+91 if isinstance(length, int) and length > 0:
+92 self.length = length
+93 else:
+94 raise ValueError("Length should be a positive integer.")
+ +96 def add_N50(self, n50: int) -> None:
+97 """
+98 Add the N50 attribute to the Bin object.
+ +100 :param n50: The N50 value to add.
+101 :return: None
+102 """
+103 if isinstance(n50, int) and n50 >= 0:
+104 self.N50 = n50
+105 else:
+106 raise ValueError("N50 should be a positive integer.")
+ + +109 def add_quality(self, completeness: float, contamination: float, contamination_weight: float) -> None:
+110 """
+111 Set the quality attributes of the bin.
+ +113 :param completeness: The completeness value.
+114 :param contamination: The contamination value.
+115 :param contamination_weight: The weight assigned to contamination in the score calculation.
+116 """
+117 self.completeness = completeness
+118 self.contamination = contamination
+119 self.score = completeness - contamination_weight * contamination
+ +121 def intersection(self, *others: 'Bin') -> 'Bin':
+122 """
+123 Compute the intersection of the bin with other bins.
+ +125 :param others: Other bins to compute the intersection with.
+126 :return: A new Bin representing the intersection of the bins.
+127 """
+128 other_contigs = (o.contigs for o in others)
+129 contigs = self.contigs.intersection(*other_contigs)
+130 name = f"{self.name} & {' & '.join([other.name for other in others])}"
+131 origin = "intersec"
+ +133 return Bin(contigs, origin, name)
+ +135 def difference(self, *others: 'Bin') -> 'Bin':
+136 """
+137 Compute the difference between the bin and other bins.
+ +139 :param others: Other bins to compute the difference with.
+140 :return: A new Bin representing the difference between the bins.
+141 """
+142 other_contigs = (o.contigs for o in others)
+143 contigs = self.contigs.difference(*other_contigs)
+144 name = f"{self.name} - {' - '.join([other.name for other in others])}"
+145 origin = "diff"
+ +147 return Bin(contigs, origin, name)
+ +149 def union(self, *others: 'Bin') -> 'Bin':
+150 """
+151 Compute the union of the bin with other bins.
+ +153 :param others: Other bins to compute the union with.
+154 :return: A new Bin representing the union of the bins.
+155 """
+156 other_contigs = (o.contigs for o in others)
+157 contigs = self.contigs.union(*other_contigs)
+158 name = f"{self.name} | {' | '.join([other.name for other in others])}"
+159 origin = "union"
+ +161 return Bin(contigs, origin, name)
+ + +164def get_bins_from_directory(bin_dir: str, set_name: str) -> List[Bin]:
+165 """
+166 Retrieves a list of Bin objects from a directory containing bin FASTA files.
+ +168 :param bin_dir: The directory path containing bin FASTA files.
+169 :param set_name: The name of the set the bins belong to.
+ +171 :return: A list of Bin objects created from the bin FASTA files.
+172 """
+173 bins = []
+ +175 for bin_fasta_file in os.listdir(bin_dir):
+ +177 bin_fasta_path = os.path.join(bin_dir, bin_fasta_file)
+178 bin_name = bin_fasta_file
+ +180 contigs = {name for name, _ in pyfastx.Fasta(bin_fasta_path, build_index=False)}
+ +182 bin_obj = Bin(contigs, set_name, bin_name)
+ +184 bins.append(bin_obj)
+ +186 return bins
+ + + +190def parse_bin_directories(bin_name_to_bin_dir: Dict[str, str]) -> Dict[str, list]:
+191 """
+192 Parses multiple bin directories and returns a dictionary mapping bin names to a list of Bin objects.
+ +194 :param bin_name_to_bin_dir: A dictionary mapping bin names to their respective bin directories.
+ +196 :return: A dictionary mapping bin names to a list of Bin objects created from the bin directories.
+197 """
+198 bin_name_to_bins = {}
+ +200 for name, bin_dir in bin_name_to_bin_dir.items():
+201 bin_name_to_bins[name] = get_bins_from_directory(bin_dir, name)
+ +203 return bin_name_to_bins
+ + +206def parse_contig2bin_tables(bin_name_to_bin_tables: Dict[str, str]) -> Dict[str, list]:
+207 """
+208 Parses multiple contig-to-bin tables and returns a dictionary mapping bin names to a list of Bin objects.
+ +210 :param bin_name_to_bin_tables: A dictionary mapping bin names to their respective contig-to-bin tables.
+ +212 :return: A dictionary mapping bin names to a list of Bin objects created from the contig-to-bin tables.
+213 """
+214 bin_name_to_bins = {}
+ +216 for name, contig2bin_table in bin_name_to_bin_tables.items():
+217 bin_name_to_bins[name] = get_bins_from_contig2bin_table(contig2bin_table, name)
+ +219 return bin_name_to_bins
+ + +222def get_bins_from_contig2bin_table(contig2bin_table: str, set_name: str) -> List[Bin]:
+223 """
+224 Retrieves a list of Bin objects from a contig-to-bin table.
+ +226 :param contig2bin_table: The path to the contig-to-bin table.
+227 :param set_name: The name of the set the bins belong to.
+ +229 :return: A list of Bin objects created from the contig-to-bin table.
+230 """
+231 bin_name2contigs = defaultdict(set)
+232 with open(contig2bin_table) as fl:
+233 for line in fl:
+234 if line.startswith("#") or line.startswith("@"):
+235 logging.debug(f"Ignoring a line from {contig2bin_table}: {line}")
+236 continue
+237 contig_name = line.strip().split("\t")[0]
+238 bin_name = line.strip().split("\t")[1]
+239 bin_name2contigs[bin_name].add(contig_name)
+ +241 bins = []
+242 for bin_name, contigs in bin_name2contigs.items():
+243 bin_obj = Bin(contigs, set_name, bin_name)
+244 bins.append(bin_obj)
+245 return bins
+ + +248def from_bin_sets_to_bin_graph(bin_name_to_bin_set: Dict[str, set]) -> nx.Graph:
+249 """
+250 Creates a bin graph from a dictionary of bin sets.
+ +252 :param bin_name_to_bin_set: A dictionary mapping bin names to their respective bin sets.
+ +254 :return: A networkx Graph representing the bin graph of overlapping bins.
+255 """
+256 G = nx.Graph()
+ +258 for set1_name, set2_name in itertools.combinations(bin_name_to_bin_set, 2):
+259 set1 = bin_name_to_bin_set[set1_name]
+260 set2 = bin_name_to_bin_set[set2_name]
+ +262 for bin1, bin2 in itertools.product(set1, set2):
+ +264 if bin1.overlaps_with(bin2):
+265 G.add_edge(bin1, bin2)
+266 return G
+ + + +270def get_all_possible_combinations(clique: Iterable) -> Iterable[Tuple]:
+271 """
+272 Generates all possible combinations of elements from a given clique.
+ +274 :param clique: An iterable representing a clique.
+ +276 :return: An iterable of tuples representing all possible combinations of elements from the clique.
+277 """
+278 return (c for r in range(2, len(clique) + 1) for c in itertools.combinations(clique, r))
+ + +281def get_intersection_bins(G: nx.Graph) -> Set[Bin]:
+282 """
+283 Retrieves the intersection bins from a given graph.
+ +285 :param G: A networkx Graph representing the graph.
+ +287 :return: A set of Bin objects representing the intersection bins.
+288 """
+289 intersect_bins = set()
+ +291 for clique in nx.clique.find_cliques(G):
+292 bins_combinations = get_all_possible_combinations(clique)
+293 for bins in bins_combinations:
+294 if max((b.completeness for b in bins)) < 20:
+295 logging.debug("completeness is not good enough to create a new bin on intersection")
+296 logging.debug(f"{[(str(b), b.completeness, b.contamination) for b in bins]}")
+297 continue
+ +299 intersec_bin = bins[0].intersection(*bins[1:])
+ +301 if intersec_bin.contigs:
+302 intersect_bins.add(intersec_bin)
+ +304 return intersect_bins
+ + +307def get_difference_bins(G: nx.Graph) -> Set[Bin]:
+308 """
+309 Retrieves the difference bins from a given graph.
+ +311 :param G: A networkx Graph representing the graph.
+ +313 :return: A set of Bin objects representing the difference bins.
+314 """
+315 difference_bins = set()
+ +317 for clique in nx.clique.find_cliques(G):
+ +319 bins_combinations = get_all_possible_combinations(clique)
+320 for bins in bins_combinations:
+ +322 for bin_a in bins:
+ +324 bin_diff = bin_a.difference(*(b for b in bins if b != bin_a))
+325 if bin_a.completeness < 20:
+326 logging.debug(f"completeness of {bin_a} is not good enough to do difference... ")
+327 logging.debug(f"{[(str(b), b.completeness, b.contamination) for b in bins]}")
+328 continue
+ +330 if bin_diff.contigs:
+331 difference_bins.add(bin_diff)
+ +333 return difference_bins
def get_union_bins(G: nx.Graph, max_conta: int = 50) -> Set[Bin]:
    """
    Build union bins from every combination of connected bins in the graph.

    :param G: A networkx Graph representing the graph of bins.
    :param max_conta: Maximum allowed contamination value for a bin to be included in the union.

    :return: A set of Bin objects representing the union bins.
    """
    union_bins = set()
    for clique in nx.clique.find_cliques(G):
        for combination in get_all_possible_combinations(clique):
            # Skip combinations containing an overly contaminated bin:
            # merging them would only propagate the contamination.
            if max((b.contamination for b in combination)) > max_conta:
                logging.debug("Some bin are too contaminated to make a useful union bin")
                logging.debug(f"{[(str(b), b.completeness, b.contamination) for b in combination]}")
                continue

            remaining = set(combination)
            seed_bin = remaining.pop()
            merged_bin = seed_bin.union(*remaining)

            if merged_bin.contigs:
                union_bins.add(merged_bin)

    return union_bins
def select_best_bins(bins: "List[Bin]") -> "List[Bin]":
    """
    Selects the best bins from a list of bins based on their scores, N50 values, and IDs.

    Bins are greedily picked in order of decreasing quality; each time a bin is
    selected, every bin overlapping it (sharing contigs) is removed from the pool.

    :param bins: A list or set of Bin objects.

    :return: A list of selected, non-overlapping Bin objects.
    """
    logging.info("Sorting bins")
    # Sort on score, N50, and ID. Smaller ID values are preferred to select original bins first.
    sorted_bins = sorted(bins, key=lambda x: (x.score, x.N50, -x.id), reverse=True)

    logging.info("Selecting bins")
    # BUGFIX: work on a local set. The original used `bins -= ...`, which raises
    # TypeError when `bins` is a list (as the annotation promises) and silently
    # mutated the caller's set otherwise.
    candidate_pool = set(bins)
    selected_bins = []
    for b in sorted_bins:
        if b in candidate_pool:
            # Drop the selected bin and everything overlapping it from the pool.
            candidate_pool -= {b2 for b2 in candidate_pool if b.overlaps_with(b2)}
            selected_bins.append(b)

    logging.info(f"Selected {len(selected_bins)} bins")
    return selected_bins
def dereplicate_bin_sets(bin_sets):
    """
    Dereplicates bins from different bin sets to obtain a non-redundant bin set.

    :param bin_sets: An iterable of bin sets.

    :return: A set containing every bin seen in any input set, without duplicates.
    """
    non_redundant_bins = set()
    for bin_set in bin_sets:
        non_redundant_bins.update(bin_set)
    return non_redundant_bins
+ + +401def get_contigs_in_bins(bins: List[Bin]) -> Set[str]:
+402 """
+403 Retrieves all contigs present in the given list of bins.
+ +405 :param bins: A list of Bin objects.
+ +407 :return: A set of contigs present in the bins.
+408 """
+409 return set().union(*(b.contigs for b in bins))
+ + +412def rename_bin_contigs(bins: List[Bin], contig_to_index: dict):
+413 """
+414 Renames the contigs in the bins based on the provided mapping.
+ +416 :param bins: A list of Bin objects.
+417 :param contig_to_index: A dictionary mapping old contig names to new index names.
+418 """
+419 for b in bins:
+420 b.contigs = {contig_to_index[contig] for contig in b.contigs}
+421 b.hash = hash(str(sorted(b.contigs)))
def create_intermediate_bins(bin_set_name_to_bins: Dict[str, Set[Bin]]) -> Set[Bin]:
    """
    Creates intermediate bins from a dictionary of bin sets.

    Builds the graph of overlapping bins, then derives new candidate bins from
    intersections, symmetric differences and unions of connected bins.

    :param bin_set_name_to_bins: A dictionary mapping bin set names to corresponding bins.

    :return: A set of intermediate bins created from intersections, differences, and unions.
    """
    logging.info("Making bin graph...")
    overlap_graph = from_bin_sets_to_bin_graph(bin_set_name_to_bins)

    logging.info("Creating intersection bins...")
    intersection_bins = get_intersection_bins(overlap_graph)
    logging.info(f"{len(intersection_bins)} bins created on intersections.")

    logging.info("Creating difference bins...")
    difference_bins = get_difference_bins(overlap_graph)
    logging.info(f"{len(difference_bins)} bins created based on symmetric difference.")

    logging.info("Creating union bins...")
    union_bins = get_union_bins(overlap_graph)
    logging.info(f"{len(union_bins)} bins created on unions.")

    return difference_bins | intersection_bins | union_bins
+ ++ « prev + ^ index + » next + + coverage.py v7.3.2, + created at 2023-12-04 22:59 +0000 +
+ +1#!/usr/bin/env python3
+2import concurrent.futures as cf
+3import logging
+4import os
+5from collections import Counter
+6from itertools import islice
+7from typing import Dict, Iterable, List, Tuple, Iterator
+ +9import numpy as np
+10import pandas as pd
# Suppress unnecessary TensorFlow warnings before checkm2 (which loads TF models) is imported.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"  # "2" hides TF INFO and WARNING messages
logging.getLogger("tensorflow").setLevel(logging.FATAL)
+ + +17from checkm2 import keggData, modelPostprocessing, modelProcessing
+18from binette.bin_manager import Bin
def get_bins_metadata_df(bins: List, contig_to_cds_count: Dict[str, int], contig_to_aa_counter: Dict[str, Counter], contig_to_aa_length: Dict[str, int]) -> pd.DataFrame:
    """
    Generate a DataFrame containing metadata for a list of bins.

    :param bins: A list of bin objects.
    :param contig_to_cds_count: A dictionary mapping contig names to CDS counts.
    :param contig_to_aa_counter: A dictionary mapping contig names to amino acid composition (Counter object).
    :param contig_to_aa_length: A dictionary mapping contig names to total amino acid length.
    :return: A DataFrame indexed by bin name with CDS count, amino acid length and
             per-amino-acid counts, columns ordered as checkm2 expects.
    """
    metadata_order = keggData.KeggCalculator().return_proper_order("Metadata")

    # One row per bin, aggregating the per-contig statistics.
    rows = []
    for bin_obj in bins:
        # Sum the amino acid composition over every contig of the bin.
        aa_composition = Counter()
        for contig in bin_obj.contigs:
            if contig in contig_to_aa_counter:
                aa_composition += contig_to_aa_counter[contig]

        row = {
            "Name": bin_obj.id,
            "CDS": sum(contig_to_cds_count[c] for c in bin_obj.contigs if c in contig_to_cds_count),
            "AALength": sum(contig_to_aa_length[c] for c in bin_obj.contigs if c in contig_to_aa_length),
        }
        row.update(dict(aa_composition))
        rows.append(row)

    # Amino acids absent from a bin become 0 and every metadata column is cast to int.
    metadata_df = (
        pd.DataFrame(rows, columns=["Name"] + metadata_order)
        .fillna(0)
        .astype({col: int for col in metadata_order})
        .set_index("Name", drop=False)
    )
    return metadata_df
def get_diamond_feature_per_bin_df(bins: List, contig_to_kegg_counter: Dict[str, Counter]) -> Tuple[pd.DataFrame, int]:
    """
    Generate a DataFrame containing Diamond feature counts per bin and completeness
    information for pathways, categories, and modules.

    :param bins: A list of bin objects.
    :param contig_to_kegg_counter: A dictionary mapping contig names to KEGG annotation counters.
    :return: A tuple containing the DataFrame and the number of default KEGG orthologs (KOs).
    """
    KeggCalc = keggData.KeggCalculator()
    defaultKOs = KeggCalc.return_default_values_from_category("KO_Genes")

    # Sum KO annotations over the contigs of each bin; contigs without
    # annotations simply contribute nothing.
    bin_to_ko_counter = {}
    for bin_obj in bins:
        ko_counter = Counter()
        for contig in bin_obj.contigs:
            if contig in contig_to_kegg_counter:
                ko_counter += contig_to_kegg_counter[contig]
        bin_to_ko_counter[bin_obj.id] = ko_counter

    ko_count_per_bin_df = pd.DataFrame(bin_to_ko_counter, index=defaultKOs).transpose().fillna(0).astype(int)
    ko_count_per_bin_df["Name"] = ko_count_per_bin_df.index

    logging.info("Calculating completeness of pathways and modules.")
    logging.debug("Calculating pathway completeness information")
    KO_pathways = KeggCalc.calculate_KO_group("KO_Pathways", ko_count_per_bin_df.copy())

    logging.debug("Calculating category completeness information")
    KO_categories = KeggCalc.calculate_KO_group("KO_Categories", ko_count_per_bin_df.copy())

    logging.debug("Calculating module completeness information")
    KO_modules = KeggCalc.calculate_module_completeness(ko_count_per_bin_df.copy())

    diamond_complete_results = pd.concat([ko_count_per_bin_df, KO_pathways, KO_modules, KO_categories], axis=1)
    return diamond_complete_results, len(defaultKOs)
def compute_N50(list_of_lengths) -> int:
    """
    Calculate N50 for a sequence of numbers.

    Walks the lengths in increasing order and returns the length at which the
    running total reaches half of the overall sum.

    :param list_of_lengths: List of numbers.
    :return: N50 value (0 for an empty input).
    """
    ascending_lengths = sorted(list_of_lengths)
    half_total = sum(ascending_lengths) / 2

    running_total = 0
    n50 = 0
    for n50 in ascending_lengths:
        if running_total + n50 >= half_total:
            break
        running_total += n50
    return n50
def add_bin_size_and_N50(bins: Iterable[Bin], contig_to_size: Dict[str, int]):
    """
    Add bin size and N50 to a list of bin objects.

    :param bins: Iterable of bin objects.
    :param contig_to_size: Dictionary mapping contig names to their sizes.
    """
    for bin_obj in bins:
        contig_lengths = [contig_to_size[contig] for contig in bin_obj.contigs]
        bin_obj.add_length(sum(contig_lengths))
        bin_obj.add_N50(compute_N50(contig_lengths))
def add_bin_metrics(bins: List, contig_info: Dict, contamination_weight: float, threads: int = 1):
    """
    Add metrics to a list of bins: length, N50, completeness and contamination.

    :param bins: List of bin objects.
    :param contig_info: Dictionary containing contig information.
    :param contamination_weight: Weight for contamination assessment.
    :param threads: Number of threads for parallel processing (default is 1).

    :return: The same collection of bin objects, enriched in place.
    """
    postProcessor = modelPostprocessing.modelProcessor(threads)

    logging.info("Assess bin length and N50")
    add_bin_size_and_N50(bins, contig_info["contig_to_length"])

    logging.info("Assess bin quality")
    assess_bins_quality_by_chunk(
        bins,
        contig_info["contig_to_kegg_counter"],
        contig_info["contig_to_cds_count"],
        contig_info["contig_to_aa_counter"],
        contig_info["contig_to_aa_length"],
        contamination_weight,
        postProcessor,
    )
    return bins
def chunks(iterable: Iterable, size: int) -> Iterator[Tuple]:
    """
    Generate adjacent chunks of data from an iterable.

    :param iterable: The iterable to be divided into chunks.
    :param size: The size of each chunk.
    :return: An iterator that produces tuples of elements in chunks
             (the final chunk may be shorter).
    """
    it = iter(iterable)
    while True:
        piece = tuple(islice(it, size))
        if not piece:
            return
        yield piece
def assess_bins_quality_by_chunk(bins: List,
                                 contig_to_kegg_counter: Dict,
                                 contig_to_cds_count: Dict,
                                 contig_to_aa_counter: Dict,
                                 contig_to_aa_length: Dict,
                                 contamination_weight: float,
                                 postProcessor: modelPostprocessing.modelProcessor = None,
                                 threads: int = 1,
                                 chunk_size: int = 2500):
    """
    Assess the quality of bins chunk by chunk.

    Processing bins in fixed-size chunks keeps the per-call feature matrices small.

    :param bins: List of bin objects.
    :param contig_to_kegg_counter: Dictionary mapping contig names to KEGG counters.
    :param contig_to_cds_count: Dictionary mapping contig names to CDS counts.
    :param contig_to_aa_counter: Dictionary mapping contig names to amino acid counters.
    :param contig_to_aa_length: Dictionary mapping contig names to amino acid lengths.
    :param contamination_weight: Weight for contamination assessment.
    :param postProcessor: post-processor from checkm2.
    :param threads: Number of threads for parallel processing (default is 1).
    :param chunk_size: The size of each chunk.
    """
    for chunk_index, bin_chunk in enumerate(chunks(bins, chunk_size)):
        chunk_bins = set(bin_chunk)
        logging.debug(f"chunk {chunk_index}: assessing quality of {len(chunk_bins)}")
        assess_bins_quality(
            bins=chunk_bins,
            contig_to_kegg_counter=contig_to_kegg_counter,
            contig_to_cds_count=contig_to_cds_count,
            contig_to_aa_counter=contig_to_aa_counter,
            contig_to_aa_length=contig_to_aa_length,
            contamination_weight=contamination_weight,
            postProcessor=postProcessor,
            threads=threads,
        )
def assess_bins_quality(
    bins: List,
    contig_to_kegg_counter: Dict,
    contig_to_cds_count: Dict,
    contig_to_aa_counter: Dict,
    contig_to_aa_length: Dict,
    contamination_weight: float,
    postProcessor: modelPostprocessing.modelProcessor = None,
    threads: int = 1,):
    """
    Assess the quality of bins.

    This function assesses the quality of bins based on various criteria and assigns
    completeness and contamination scores. This code is taken from checkm2 and adjusted.

    :param bins: List of bin objects.
    :param contig_to_kegg_counter: Dictionary mapping contig names to KEGG counters.
    :param contig_to_cds_count: Dictionary mapping contig names to CDS counts.
    :param contig_to_aa_counter: Dictionary mapping contig names to amino acid counters.
    :param contig_to_aa_length: Dictionary mapping contig names to amino acid lengths.
    :param contamination_weight: Weight for contamination assessment.
    :param postProcessor: A post-processor from checkm2; created on the fly when None.
    :param threads: Number of threads for parallel processing (default is 1).
    """
    if postProcessor is None:
        postProcessor = modelPostprocessing.modelProcessor(threads)

    metadata_df = get_bins_metadata_df(bins, contig_to_cds_count, contig_to_aa_counter, contig_to_aa_length)

    diamond_complete_results, ko_list_length = get_diamond_feature_per_bin_df(bins, contig_to_kegg_counter)
    diamond_complete_results = diamond_complete_results.drop(columns=["Name"])

    feature_vectors = pd.concat([metadata_df, diamond_complete_results], axis=1)
    feature_vectors = feature_vectors.sort_values(by="Name")

    # 4: Call general model & specific models and derive predictions
    modelProc = modelProcessing.modelProcessor(threads)

    # BUGFIX: np.float was deprecated in NumPy 1.20 and removed in 1.24; it was a
    # plain alias of the builtin float, which is the documented replacement.
    vector_array = feature_vectors.iloc[:, 1:].values.astype(float)

    logging.info("Predicting completeness and contamination using the general model.")
    general_results_comp, general_results_cont = modelProc.run_prediction_general(vector_array)

    logging.info("Predicting completeness using the specific model.")
    # Expected input length for the specific model: KO columns plus the metadata
    # columns, minus the Name column.
    specific_model_vector_len = (ko_list_length + len(metadata_df.columns)) - 1

    # also retrieve scaled data for CSM calculations
    specific_results_comp, scaled_features = modelProc.run_prediction_specific(vector_array, specific_model_vector_len)

    logging.info("Using cosine similarity to reference data to select an appropriate predictor model.")
    final_comp, final_cont, models_chosen, csm_array = postProcessor.calculate_general_specific_ratio(
        vector_array[:, 20], scaled_features, general_results_comp, general_results_cont, specific_results_comp
    )

    final_results = feature_vectors[["Name"]].copy()
    final_results["Completeness"] = np.round(final_comp, 2)
    final_results["Contamination"] = np.round(final_cont, 2)

    # Write the predictions back onto the bin objects (rows are indexed by bin id).
    for bin_obj in bins:
        completeness = final_results.loc[bin_obj.id, "Completeness"]
        contamination = final_results.loc[bin_obj.id, "Contamination"]
        bin_obj.add_quality(completeness, contamination, contamination_weight)
++ « prev + ^ index + » next + + coverage.py v7.3.2, + created at 2023-12-04 22:59 +0000 +
+ +1#!/usr/bin/env python
+2"""
+3Module : Main
+4Description : The main entry point for the program.
+5Copyright : (c) Jean Mainguy, 28 nov. 2022
+6License : MIT
+7Maintainer : Jean Mainguy
+8Portability : POSIX
+9"""
+ +11from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+ +13import sys
+14import logging
+15import os
+16import pkg_resources
+ +18from binette import contig_manager, cds, diamond, bin_quality, bin_manager, io_manager as io
+19from typing import List, Dict, Set, Tuple
def init_logging(verbose, debug):
    """Initialise logging.

    Debug wins over verbose; without either flag the level is WARNING.
    """
    if debug:
        level = logging.DEBUG
    else:
        level = logging.INFO if verbose else logging.WARNING

    logging.basicConfig(
        level=level,
        format="%(asctime)s %(levelname)s - %(message)s",
        datefmt="[%Y-%m-%d %H:%M:%S]",
    )

    logging.info("Program started")
    logging.info(f'command line: {" ".join(sys.argv)}')
def parse_arguments(args):
    """Parse script arguments.

    :param args: Raw argument list (typically ``sys.argv[1:]``).
    :return: The parsed argparse namespace.
    """
    # Version comes from the installed package metadata.
    program_version = pkg_resources.get_distribution("Binette").version

    parser = ArgumentParser(
        description=f"Binette version={program_version}",
        formatter_class=ArgumentDefaultsHelpFormatter,
    )
    # TODO: add argument categories to better visualize required vs optional args.
    # Bins can be given either as directories of fasta files or as
    # contig2bin tables — exactly one of the two is required.
    input_arg = parser.add_mutually_exclusive_group(required=True)

    input_arg.add_argument(
        "-d",
        "--bin_dirs",
        nargs="+",
        help="list of bin folders containing each bin in a fasta file.",
    )

    input_arg.add_argument(
        "-b",
        "--contig2bin_tables",
        nargs="+",
        help="list of contig2bin table with two columns separated\
            with a tabulation: contig, bin",
    )

    parser.add_argument("-c", "--contigs", required=True, help="Contigs in fasta format.")

    parser.add_argument(
        "-m",
        "--min_completeness",
        default=10,
        type=int,
        help="Minimum completeness required for final bin selections.",
    )

    parser.add_argument("-t", "--threads", default=1, type=int, help="Number of threads.")

    parser.add_argument("-o", "--outdir", default="results", help="Output directory.")

    parser.add_argument(
        "-w",
        "--contamination_weight",
        default=5,
        type=float,
        help="Bin are scored as follow: completeness - weight * contamination. "
        "A low contamination_weight favor complete bins over low contaminated bins.",
    )

    parser.add_argument(
        "-e",
        "--extension",
        default="fasta",
        help="Extension of fasta files in bin folders "
        "(necessary when --bin_dirs is used).",
    )

    parser.add_argument(
        "--checkm2_db",
        help="Provide a path for the CheckM2 diamond database. "
        "By default the database set via <checkm2 database> is used.",
    )

    parser.add_argument("-v", "--verbose", help="increase output verbosity", action="store_true")

    parser.add_argument("--debug", help="active debug mode", action="store_true")

    parser.add_argument("--resume", help="active resume mode", action="store_true")

    parser.add_argument("--low_mem", help="low mem mode", action="store_true")

    parser.add_argument("--version", action="version", version=program_version)

    args = parser.parse_args(args)
    return args
def parse_input_files(bin_dirs: List[str], contig2bin_tables: List[str], contigs_fasta: str) -> Tuple[Dict[str, List], Set, Set[str], Dict[str, int]]:
    """
    Parses input files to retrieve information related to bins and contigs.

    :param bin_dirs: List of paths to directories containing bin FASTA files.
    :param contig2bin_tables: List of paths to contig-to-bin tables.
    :param contigs_fasta: Path to the contigs FASTA file.

    :return: A tuple containing:
        - Dictionary mapping bin set names to their bins.
        - Set of dereplicated original bins.
        - Set of contig names present in at least one bin.
        - Dictionary mapping binned contig names to their lengths.
    """
    # Fixed the return annotation: original_bins and contigs_in_bins are sets
    # (dereplicate_bin_sets / get_contigs_in_bins return sets), not lists.
    if bin_dirs:
        logging.info("Parsing bin directories.")
        bin_name_to_bin_dir = io.infer_bin_name_from_bin_inputs(bin_dirs)
        bin_set_name_to_bins = bin_manager.parse_bin_directories(bin_name_to_bin_dir)
    else:
        logging.info("Parsing bin2contig files.")
        bin_name_to_bin_table = io.infer_bin_name_from_bin_inputs(contig2bin_tables)
        bin_set_name_to_bins = bin_manager.parse_contig2bin_tables(bin_name_to_bin_table)

    logging.info(f"{len(bin_set_name_to_bins)} bin sets processed:")
    for bin_set_id, bins in bin_set_name_to_bins.items():
        logging.info(f" {bin_set_id} - {len(bins)} bins")

    original_bins = bin_manager.dereplicate_bin_sets(bin_set_name_to_bins.values())
    contigs_in_bins = bin_manager.get_contigs_in_bins(original_bins)

    logging.info(f"Parsing contig fasta file: {contigs_fasta}")
    contigs_object = contig_manager.parse_fasta_file(contigs_fasta)
    # Only keep lengths of contigs that belong to a bin, to save memory.
    contig_to_length = {seq.name: len(seq) for seq in contigs_object if seq.name in contigs_in_bins}

    return bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length
def manage_protein_alignement(faa_file: str, contigs_fasta: str, contig_to_length: Dict[str, List],
                              contigs_in_bins: Dict[str, List], diamond_result_file: str,
                              checkm2_db: str, threads: int, resume: bool, low_mem: bool) -> Tuple[Dict[str, int], Dict[str, int]]:
    """
    Predicts or reuses proteins prediction and runs diamond on them.

    :param faa_file: The path to the .faa file.
    :param contigs_fasta: The path to the contigs FASTA file.
    :param contig_to_length: Dictionary mapping contig names to their lengths.
    :param contigs_in_bins: Contigs that belong to at least one bin.
    :param diamond_result_file: The path to the diamond result file.
    :param checkm2_db: The path to the CheckM2 database (falls back to the checkm2 install).
    :param threads: Number of threads for parallel processing.
    :param resume: Boolean indicating whether to resume the process.
    :param low_mem: Boolean indicating whether to use low memory mode.

    :return: A tuple containing dictionaries - contig_to_kegg_counter and contig_to_genes.
    """
    if resume:
        # Reuse the protein predictions (and diamond results) from a previous run.
        logging.info(f"Parsing faa file: {faa_file}.")
        contig_to_genes = cds.parse_faa_file(faa_file)
        io.check_contig_consistency(contig_to_length, contig_to_genes, contigs_fasta, faa_file)
    else:
        # Predict genes only for contigs that are part of a bin.
        binned_contigs = (s for s in contig_manager.parse_fasta_file(contigs_fasta) if s.name in contigs_in_bins)
        contig_to_genes = cds.predict(binned_contigs, faa_file, threads)

        # Use the user-provided database, or the one configured in the checkm2 install.
        db_path = checkm2_db if checkm2_db else diamond.get_checkm2_db()
        log_file = f"{os.path.splitext(diamond_result_file)[0]}.log"

        diamond.run(
            faa_file,
            diamond_result_file,
            db_path,
            log_file,
            threads,
            low_mem=low_mem,
        )

    logging.info("Parsing diamond results.")
    contig_to_kegg_counter = diamond.get_contig_to_kegg_id(diamond_result_file)

    # Check contigs from diamond vs input assembly consistency
    io.check_contig_consistency(contig_to_length, contig_to_kegg_counter, contigs_fasta, diamond_result_file)

    return contig_to_kegg_counter, contig_to_genes
def select_bins_and_write_them(all_bins: Set[bin_manager.Bin], contigs_fasta: str, final_bin_report: str, min_completeness: float,
                               index_to_contig: dict, outdir: str, debug: bool) -> List[bin_manager.Bin]:
    """
    Selects and writes bins based on specific criteria.

    :param all_bins: Set of Bin objects.
    :param contigs_fasta: Path to the contigs FASTA file.
    :param final_bin_report: Path to write the final bin report.
    :param min_completeness: Minimum completeness threshold for bin selection.
    :param index_to_contig: Dictionary mapping indices to contig names.
    :param outdir: Output directory to save final bins and reports.
    :param debug: Debug mode flag.
    :return: Selected bins that meet the completeness threshold.
    """
    outdir_final_bin_set = os.path.join(outdir, "final_bins")
    os.makedirs(outdir_final_bin_set, exist_ok=True)

    if debug:
        # Dump every candidate bin (with contigs) and the contig index for inspection.
        all_bins_for_debug = set(all_bins)
        all_bin_compo_file = os.path.join(outdir, "all_bins_quality_reports.tsv")
        logging.info(f"Writing all bins in {all_bin_compo_file}")
        io.write_bin_info(all_bins_for_debug, all_bin_compo_file, add_contigs=True)

        with open(os.path.join(outdir, "index_to_contig.tsv"), 'w') as flout:
            flout.write('\n'.join((f'{i}\t{c}' for i, c in index_to_contig.items())))

    logging.info("Selecting best bins")
    selected_bins = bin_manager.select_best_bins(all_bins)
    logging.info(f"Bin Selection: {len(selected_bins)} selected bins")

    logging.info(f"Filtering bins: only bins with completeness >= {min_completeness} are kept")
    selected_bins = [b for b in selected_bins if b.completeness >= min_completeness]
    logging.info(f"Filtering bins: {len(selected_bins)} selected bins")

    logging.info(f"Writing selected bins in {final_bin_report}")
    # Translate contig indices back to their original names before writing.
    for b in selected_bins:
        b.contigs = {index_to_contig[c_index] for c_index in b.contigs}

    io.write_bin_info(selected_bins, final_bin_report)
    io.write_bins_fasta(selected_bins, contigs_fasta, outdir_final_bin_set)

    return selected_bins
+ + + +265def log_selected_bin_info(selected_bins: List[bin_manager.Bin], hq_min_completeness: float, hq_max_conta: float):
+266 """
+267 Log information about selected bins based on quality thresholds.
+ +269 :param selected_bins: List of Bin objects to analyze.
+270 :param hq_min_completeness: Minimum completeness threshold for high-quality bins.
+271 :param hq_max_conta: Maximum contamination threshold for high-quality bins.
+ +273 This function logs information about selected bins that meet specified quality thresholds.
+274 It counts the number of high-quality bins based on completeness and contamination values.
+275 """
+ +277 # Log completeness and contamination in debug log
+278 logging.debug("High quality bins:")
+279 for sb in selected_bins:
+280 if sb.completeness >= hq_min_completeness and sb.contamination <= hq_max_conta:
+281 logging.debug(f"> {sb} completeness={sb.completeness}, contamination={sb.contamination}")
+ +283 # Count high-quality bins and single-contig high-quality bins
+284 hq_bins = len([sb for sb in selected_bins if sb.completeness >= hq_min_completeness and sb.contamination <= hq_max_conta])
+ +286 # Log information about high-quality bins
+287 thresholds = f"(completeness >= {hq_min_completeness} and contamination <= {hq_max_conta})"
+288 logging.info(f"{hq_bins}/{len(selected_bins)} selected bins have a high quality {thresholds}.")
def main():
    """Orchestrate the execution of the program."""
    # sys.argv is passed explicitly so that parse_arguments stays testable.
    args = parse_arguments(sys.argv[1:])

    init_logging(args.verbose, args.debug)

    # High quality threshold used just to log number of high quality bins.
    hq_max_conta = 5
    hq_min_completeness = 90

    # Temporary files #
    out_tmp_dir = os.path.join(args.outdir, "temporary_files")
    os.makedirs(out_tmp_dir, exist_ok=True)

    faa_file = os.path.join(out_tmp_dir, "assembly_proteins.faa")
    diamond_result_file = os.path.join(out_tmp_dir, "diamond_result.tsv")

    # Output files #
    final_bin_report = os.path.join(args.outdir, "final_bins_quality_reports.tsv")

    if args.resume:
        # Resume mode requires the intermediate files from a previous run to exist.
        io.check_resume_file(faa_file, diamond_result_file)

    # Parse bins and contigs, then annotate proteins with diamond (or reuse results).
    bin_set_name_to_bins, original_bins, contigs_in_bins, contig_to_length = parse_input_files(args.bin_dirs, args.contig2bin_tables, args.contigs)

    contig_to_kegg_counter, contig_to_genes = manage_protein_alignement(faa_file=faa_file, contigs_fasta=args.contigs, contig_to_length=contig_to_length,
                                                                        contigs_in_bins=contigs_in_bins,
                                                                        diamond_result_file=diamond_result_file, checkm2_db=args.checkm2_db,
                                                                        threads=args.threads, resume=args.resume, low_mem=args.low_mem)

    # Use contig index instead of contig name to save memory
    contig_to_index, index_to_contig = contig_manager.make_contig_index(contigs_in_bins)

    contig_to_kegg_counter = contig_manager.apply_contig_index(contig_to_index, contig_to_kegg_counter)
    contig_to_genes = contig_manager.apply_contig_index(contig_to_index, contig_to_genes)
    contig_to_length = contig_manager.apply_contig_index(contig_to_index, contig_to_length)

    bin_manager.rename_bin_contigs(original_bins, contig_to_index)

    # Extract cds metadata ##
    logging.info("Compute cds metadata.")
    contig_metadat = cds.get_contig_cds_metadata(contig_to_genes, args.threads)

    contig_metadat["contig_to_kegg_counter"] = contig_to_kegg_counter
    contig_metadat["contig_to_length"] = contig_to_length

    logging.info("Add size and assess quality of input bins")
    bin_quality.add_bin_metrics(original_bins, contig_metadat, args.contamination_weight, args.threads)

    logging.info("Create intermediate bins:")
    new_bins = bin_manager.create_intermediate_bins(bin_set_name_to_bins)

    logging.info("Assess quality for supplementary intermediate bins.")
    new_bins = bin_quality.add_bin_metrics(new_bins, contig_metadat, args.contamination_weight, args.threads)

    logging.info("Dereplicating input bins and new bins")
    all_bins = original_bins | new_bins

    # Greedy selection, completeness filtering, and writing of the final bin set.
    selected_bins = select_bins_and_write_them(all_bins, args.contigs, final_bin_report, args.min_completeness, index_to_contig, args.outdir, args.debug)

    log_selected_bin_info(selected_bins, hq_min_completeness, hq_max_conta)

    return 0
++ « prev + ^ index + » next + + coverage.py v7.3.2, + created at 2023-12-04 22:59 +0000 +
+ +2import concurrent.futures as cf
+3import multiprocessing.pool
+4import logging
+5from collections import Counter, defaultdict
+6from typing import Dict, List, Iterator, Tuple
+ +8import pyfastx
+9import pyrodigal
+10from tqdm import tqdm
def get_contig_from_cds_name(cds_name: str) -> str:
    """
    Extract the contig name from a CDS name.

    CDS names follow the pattern "<contig>_<gene_index>", so everything before
    the last underscore is the contig name.

    :param cds_name: The name of the CDS.
    :return: The name of the contig ("" when the name contains no underscore).
    """
    *contig_parts, _gene_index = cds_name.split("_")
    return "_".join(contig_parts)
def predict(contigs_iterator: Iterator, outfaa: str, threads: int = 1) -> Dict[str, List[str]]:
    """
    Predict open reading frames with Pyrodigal.

    :param contigs_iterator: An iterator of contig sequences.
    :param outfaa: The output file path for predicted protein sequences (in FASTA format).
    :param threads: Number of CPU threads to use (default is 1).

    :return: A dictionary mapping contig names to lists of translated genes.
    """
    try:
        # pyrodigal >= 3 renamed OrfFinder to GeneFinder.
        orf_finder = pyrodigal.GeneFinder(meta="meta")
    except AttributeError:
        orf_finder = pyrodigal.OrfFinder(meta="meta")

    logging.info(f"Predicting cds sequences with Pyrodigal using {threads} threads.")

    with multiprocessing.pool.ThreadPool(processes=threads) as pool:
        task_args = ((orf_finder.find_genes, seq) for seq in contigs_iterator)
        contig_and_genes = pool.starmap(predict_genes, task_args)

    write_faa(outfaa, contig_and_genes)

    return {
        contig_id: [gene.translate() for gene in pyrodigal_genes]
        for contig_id, pyrodigal_genes in contig_and_genes
    }
def predict_genes(find_genes, seq):
    """
    Run a gene finder on a single sequence record.

    :param find_genes: Callable taking the raw sequence string and returning predicted genes.
    :param seq: A sequence record exposing ``name`` and ``seq`` attributes.
    :return: A (sequence name, predicted genes) tuple.
    """
    genes = find_genes(seq.seq)
    return seq.name, genes
def write_faa(outfaa: str, contig_to_genes: List[Tuple]) -> None:
    """
    Write predicted protein sequences to a FASTA file.

    :param outfaa: The output file path for predicted protein sequences (in FASTA format).
    :param contig_to_genes: Pairs of (contig id, pyrodigal genes object). Despite the
        name, this is NOT a dict: ``predict`` passes the list of tuples produced by
        ``starmap`` (annotation fixed accordingly).
    """
    logging.info("Writing predicted protein sequences.")
    with open(outfaa, "w") as fl:
        for contig_id, genes in contig_to_genes:
            # Delegate FASTA formatting to pyrodigal; record ids are derived
            # from contig_id — confirm exact naming against pyrodigal docs.
            genes.write_translations(fl, contig_id)
def parse_faa_file(faa_file: str) -> Dict[str, List]:
    """
    Parse a FASTA file containing protein sequences and organize them by contig.

    :param faa_file: Path to the input FASTA file.
    :return: A dictionary mapping contig names to lists of protein sequences.
    """
    contig_to_genes = defaultdict(list)
    for cds_name, protein_seq in pyfastx.Fastx(faa_file):
        # Group sequences under the contig their CDS name points to.
        contig_to_genes[get_contig_from_cds_name(cds_name)].append(protein_seq)

    return dict(contig_to_genes)
def get_aa_composition(genes: List[str]) -> Counter:
    """
    Calculate the amino acid composition of a list of protein sequences.

    :param genes: A list of protein sequences.
    :return: A Counter object representing the amino acid composition.
    """
    composition = Counter()
    for protein_seq in genes:
        composition.update(protein_seq)
    return composition
def get_contig_cds_metadata_flat(contig_to_genes: Dict[str, List[str]]) -> Tuple[Dict[str, int], Dict[str, Counter], Dict[str, int]]:
    """
    Calculate metadata for contigs, including CDS count, amino acid composition, and total amino acid length.

    :param contig_to_genes: A dictionary mapping contig names to lists of protein sequences.
    :return: A tuple containing dictionaries for CDS count, amino acid composition, and total amino acid length.
    """
    contig_to_cds_count = {contig: len(genes) for contig, genes in contig_to_genes.items()}

    # BUGFIX: these log messages were emitted *after* the computation they announce,
    # so the progress bars appeared without explanation; log before each step instead.
    logging.info("Calculating amino acid composition.")
    contig_to_aa_counter = {contig: get_aa_composition(genes) for contig, genes in tqdm(contig_to_genes.items(), unit="contig")}

    logging.info("Calculating total amino acid length.")
    contig_to_aa_length = {contig: sum(counter.values()) for contig, counter in tqdm(contig_to_aa_counter.items(), unit="contig")}

    return contig_to_cds_count, contig_to_aa_counter, contig_to_aa_length
def get_contig_cds_metadata(contig_to_genes: Dict[str, List[str]], threads: int) -> Dict[str, Dict]:
    """
    Calculate metadata for contigs in parallel, including CDS count, amino acid composition, and total amino acid length.

    :param contig_to_genes: A dictionary mapping contig names to lists of protein sequences.
    :param threads: Number of CPU threads to use.
    :return: A dictionary with keys "contig_to_cds_count", "contig_to_aa_counter" and
        "contig_to_aa_length". (Fixed: the annotation previously claimed Tuple[...],
        but the function has always returned this dict.)
    """
    contig_to_cds_count = {contig: len(genes) for contig, genes in contig_to_genes.items()}

    contig_to_future = {}
    logging.info(f"Collecting contig amino acid composition using {threads} threads.")
    # Despite the historical name `tpe`, this is a ProcessPoolExecutor: the work is
    # CPU-bound counting, so processes sidestep the GIL.
    with cf.ProcessPoolExecutor(max_workers=threads) as tpe:
        for contig, genes in tqdm(contig_to_genes.items()):
            contig_to_future[contig] = tpe.submit(get_aa_composition, genes)

    # Fixed: log BEFORE gathering results so the messages precede the tqdm bars.
    logging.info("Calculating amino acid composition in parallel.")
    contig_to_aa_counter = {contig: future.result() for contig, future in tqdm(contig_to_future.items(), unit="contig")}

    logging.info("Calculating total amino acid length in parallel.")
    contig_to_aa_length = {contig: sum(counter.values()) for contig, counter in tqdm(contig_to_aa_counter.items(), unit="contig")}

    contig_info = {
        "contig_to_cds_count": contig_to_cds_count,
        "contig_to_aa_counter": contig_to_aa_counter,
        "contig_to_aa_length": contig_to_aa_length,
    }

    return contig_info
++ « prev + ^ index + » next + + coverage.py v7.3.2, + created at 2023-12-04 22:59 +0000 +
+ +1import pyfastx
+2from typing import Dict, Tuple
def parse_fasta_file(fasta_file: str) -> pyfastx.Fasta:
    """
    Parse a FASTA file and return a pyfastx.Fasta object.

    :param fasta_file: The path to the FASTA file.

    :return: A pyfastx.Fasta object representing the parsed FASTA file.
    """
    # build_index=True enables random access to sequences by name.
    return pyfastx.Fasta(fasta_file, build_index=True)
def make_contig_index(contigs: list) -> Tuple[Dict[str, int], Dict[int, str]]:
    """
    Create an index mapping for contigs.

    :param contigs: A list of contig names.

    :return: A tuple containing the contig index mapping dictionaries (contig_to_index, index_to_contig).
    """
    contig_to_index: Dict[str, int] = {}
    for position, contig_name in enumerate(contigs):
        contig_to_index[contig_name] = position
    # Invert the (possibly deduplicated) forward mapping to keep both views in sync.
    index_to_contig = {position: contig_name for contig_name, position in contig_to_index.items()}
    return contig_to_index, index_to_contig
def apply_contig_index(contig_to_index: Dict[str, int], contig_to_info: Dict[str, str]) -> Dict[int, str]:
    """
    Apply the contig index mapping to the contig info dictionary.

    :param contig_to_index: A dictionary mapping contig names to their corresponding index.
    :param contig_to_info: A dictionary mapping contig names to their associated information.

    :return: A dictionary mapping contig indices to their associated information.
    """
    indexed_info: Dict[int, str] = {}
    for contig_name, info in contig_to_info.items():
        indexed_info[contig_to_index[contig_name]] = info
    return indexed_info
++ « prev + ^ index + » next + + coverage.py v7.3.2, + created at 2023-12-04 22:59 +0000 +
+ +1import subprocess
+2import logging
+3import sys
+4import shutil
+5import re
+6import pandas as pd
+7from collections import Counter
+ +9from checkm2 import keggData
def get_checkm2_db() -> str:
    """
    Get the path to the CheckM2 database.

    :return: The path to the CheckM2 database.
    :raises SystemExit: If checkm2 is not on PATH, exits with an error, or its
        output does not contain a database path.
    """
    if shutil.which("checkm2") is None:
        logging.error("Make sure checkm2 is on your system path.")
        sys.exit(1)

    checkm2_database_raw = subprocess.run(["checkm2", "database", "--current"], text=True, stderr=subprocess.PIPE)

    if checkm2_database_raw.returncode != 0:
        logging.error(f"Something went wrong with checkm2:\n=======\n{checkm2_database_raw.stderr}========")
        sys.exit(1)

    # Fixed: raw string with an escaped dot. The previous pattern ".dmnd" let the
    # dot match any character, so e.g. "/path/dbXdmnd" would also have matched.
    reg_result = re.search(r"INFO: (/.*\.dmnd)", checkm2_database_raw.stderr)

    try:
        db_path = reg_result.group(1)
    except AttributeError:
        # reg_result is None when no path was found in the stderr output.
        logging.error(f"Something went wrong when retrieving checkm2 db path:\n{checkm2_database_raw.stderr}")
        sys.exit(1)

    return db_path
def check_tool_exists(tool_name: str):
    """
    Check if a specified tool is on the system's PATH.

    :param tool_name: The name of the tool to check for.
    :type tool_name: str
    :raises FileNotFoundError: If the tool is not found on the system's PATH.
    """
    tool_location = shutil.which(tool_name)
    if tool_location is not None:
        return
    raise FileNotFoundError(f"The '{tool_name}' tool is not found on your system PATH.")
def run(
    faa_file: str, output: str, db: str, log: str, threads: int = 1, query_cover: int = 80, subject_cover: int = 80,
    percent_id: int = 30, evalue: float = 1e-05, low_mem: bool = False):
    """
    Run Diamond with specified parameters.

    :param faa_file: Path to the input protein sequence file (FASTA format).
    :param output: Path to the Diamond output file.
    :param db: Path to the Diamond database.
    :param log: Path to the log file.
    :param threads: Number of CPU threads to use (default is 1).
    :param query_cover: Minimum query coverage percentage (default is 80).
    :param subject_cover: Minimum subject coverage percentage (default is 80).
    :param percent_id: Minimum percent identity (default is 30).
    :param evalue: Maximum e-value threshold (default is 1e-05).
    :param low_mem: Use low memory mode if True (default is False).
    :raises FileNotFoundError: If diamond is not found on the system PATH.
    :raises SystemExit: If diamond exits with a non-zero return code.
    """
    check_tool_exists("diamond")

    # Smaller block size lowers diamond's memory footprint at the cost of speed.
    blocksize = 0.5 if low_mem else 2

    cmd = (
        "diamond blastp --outfmt 6 --max-target-seqs 1 "
        f"--query {faa_file} "
        f"-o {output} "
        f"--threads {threads} "
        f"--db {db} "
        f"--query-cover {query_cover} "
        f"--subject-cover {subject_cover} "
        f"--id {percent_id} "
        f"--evalue {evalue} --block-size {blocksize} 2> {log}"
    )

    logging.info("Running diamond")
    logging.info(cmd)

    # SECURITY NOTE: shell=True is needed for the '2> {log}' redirection, but the
    # interpolated paths are not shell-quoted — paths containing spaces or shell
    # metacharacters will break or be interpreted by the shell.
    # Fixed: the completed-process handle no longer shadows this function's name.
    result = subprocess.run(cmd, shell=True)

    if result.returncode != 0:
        logging.error(f"An error occurred while running DIAMOND. Check log file: {log}")
        sys.exit(1)

    logging.info("Finished Running DIAMOND")
def get_contig_to_kegg_id(diamond_result_file: str) -> dict:
    """
    Get a dictionary mapping contig IDs to KEGG annotations from a Diamond result file.

    :param diamond_result_file: Path to the Diamond result file.
    :return: A dictionary mapping contig IDs to KEGG annotations.
    """
    hits_df = pd.read_csv(diamond_result_file, sep="\t", usecols=[0, 1], names=["ProteinID", "annotation"])
    # Each annotation is '<Ref100 hit>~<KEGG annotation>'; split once on the tilde.
    hits_df[["Ref100_hit", "Kegg_annotation"]] = hits_df["annotation"].str.split(
        "~", n=1, expand=True
    )

    # Keep only hits whose KEGG annotation belongs to the default KO gene set.
    kegg_calculator = keggData.KeggCalculator()
    default_kos = kegg_calculator.return_default_values_from_category("KO_Genes")
    hits_df = hits_df.loc[hits_df["Kegg_annotation"].isin(default_kos.keys())]

    # Contig name = protein ID minus its trailing '_<gene index>' component.
    hits_df["contig"] = hits_df["ProteinID"].str.split("_", n=-1).str[:-1].str.join("_")

    grouped = (
        hits_df.groupby("contig").agg({"Kegg_annotation": Counter}).reset_index()
    )

    return dict(zip(grouped["contig"], grouped["Kegg_annotation"]))
++ « prev + ^ index + » next + + coverage.py v7.3.2, + created at 2023-12-04 22:59 +0000 +
+ +1import logging
+2import os
+3import pyfastx
+4from typing import List, Dict
+5import csv
+ +7from binette.bin_manager import Bin
def infer_bin_name_from_bin_inputs(input_bins: List[str]) -> Dict[str, str]:
    """
    Infer bin names from a list of bin input directories.

    Strips the longest common prefix and longest common suffix from every path;
    the remaining middle part uniquely identifies each input.

    :param input_bins: List of input bin directories.
    :return: Dictionary mapping inferred bin names to their corresponding directories.
    """
    logging.debug("Inferring bin names from input bins:")

    commonprefix_len = len(os.path.commonprefix(input_bins))
    # commonprefix() on the reversed strings yields the common suffix, reversed.
    reversed_strings = [s[::-1] for s in input_bins]
    commonsufix_len = len(os.path.commonprefix(reversed_strings))

    bin_name_to_bin_dir = {d[commonprefix_len: len(d) - commonsufix_len]: d for d in input_bins}

    logging.debug(f"Input bins: {' '.join(input_bins)}")
    # Fixed: these two debug messages previously had their labels swapped
    # (the "prefix" line logged the suffix and vice versa).
    logging.debug(f"Common prefix to remove: {os.path.commonprefix(input_bins)}")
    logging.debug(f"Common suffix to remove: {os.path.commonprefix(reversed_strings)[::-1]}")
    logging.debug(f"bin_name_to_bin_dir: {bin_name_to_bin_dir}")

    return bin_name_to_bin_dir
def write_bin_info(bins: List["Bin"], output: str, add_contigs: bool = False):
    """
    Write bin information to a TSV file.

    :param bins: List of Bin objects (must expose id, origin, name, completeness,
        contamination, score, length, N50 and contigs attributes).
    :param output: Output file path for writing the TSV.
    :param add_contigs: Flag indicating whether to include contig information.
    """

    header = ["bin_id", "origin", "name", "completeness", "contamination", "score", "size", "N50", "contig_count"]
    if add_contigs:
        header.append('contigs')

    bin_infos = []
    # Best bins first: higher score, then higher N50, ties broken by lower id.
    for bin_obj in sorted(bins, key=lambda x: (x.score, x.N50, -x.id), reverse=True):
        bin_info = [
            bin_obj.id,
            bin_obj.origin,
            bin_obj.name,
            bin_obj.completeness,
            bin_obj.contamination,
            bin_obj.score,
            bin_obj.length,
            bin_obj.N50,
            len(bin_obj.contigs),
        ]
        if add_contigs:
            # Fixed: the appended value re-tested add_contigs redundantly inside
            # this branch (`... if add_contigs else ""` could never take the else).
            bin_info.append(";".join(str(c) for c in bin_obj.contigs))

        bin_infos.append(bin_info)

    with open(output, "w", newline="") as fl:
        writer = csv.writer(fl, delimiter="\t")
        writer.writerow(header)
        writer.writerows(bin_infos)
def write_bins_fasta(selected_bins: List[Bin], contigs_fasta: str, outdir: str):
    """
    Write selected bins' contigs to separate FASTA files.

    :param selected_bins: List of Bin objects representing the selected bins.
    :param contigs_fasta: Path to the input FASTA file containing contig sequences.
    :param outdir: Output directory to save the individual bin FASTA files.
    """

    # Indexed access lets us pull each contig's sequence by name.
    fa = pyfastx.Fasta(contigs_fasta, build_index=True)

    for sbin in selected_bins:
        outfile = os.path.join(outdir, f"bin_{sbin.id}.fa")

        with open(outfile, "w") as outfl:
            records = [f">{c}\n{fa[c]}" for c in sbin.contigs]
            outfl.write("\n".join(records) + "\n")
def check_contig_consistency(contigs_from_assembly: List[str],
                             contigs_from_elsewhere: List[str],
                             assembly_file: str,
                             elsewhere_file: str ):
    """
    Check the consistency of contig names between different sources.

    :param contigs_from_assembly: List of contig names from the assembly file.
    :param contigs_from_elsewhere: List of contig names from an external source.
    :param assembly_file: Path to the assembly file.
    :param elsewhere_file: Path to the file from an external source.
    :raises AssertionError: If inconsistencies in contig names are found.
    """
    logging.debug("check_contig_consistency.")
    # Consistent when every external contig also appears in the assembly:
    # the union then adds nothing beyond the assembly set itself.
    are_contigs_consistent = len(set(contigs_from_elsewhere) | set(contigs_from_assembly)) <= len(
        set(contigs_from_assembly)
    )

    issue_contig_count = len(set(contigs_from_elsewhere) - set(contigs_from_assembly))

    # Fixed: the previous backslash line continuation embedded a run of indentation
    # spaces in the middle of the user-visible message (and the local was misspelled).
    message = (
        f"{issue_contig_count} contigs found in file {elsewhere_file} "
        f"were not found in assembly_file ({assembly_file})."
    )
    assert are_contigs_consistent, message
def check_resume_file(faa_file: str, diamond_result_file: str) -> None:
    """
    Check the existence of files required for resuming the process.

    :param faa_file: Path to the protein file.
    :param diamond_result_file: Path to the Diamond result file.
    :raises FileNotFoundError: If the required files don't exist for resuming.
    """

    faa_exists = os.path.isfile(faa_file)
    diamond_exists = os.path.isfile(diamond_result_file)

    # Both files present: resuming is possible, nothing to report.
    if faa_exists and diamond_exists:
        return

    if not faa_exists:
        error_msg = f"Protein file '{faa_file}' does not exist. Resuming is not possible."
        logging.error(error_msg)
        raise FileNotFoundError(error_msg)

    # Reaching here means the protein file exists but the Diamond result does not.
    error_msg = f"Diamond result file '{diamond_result_file}' does not exist. Resuming is not possible."
    logging.error(error_msg)
    raise FileNotFoundError(error_msg)
+ + ++ coverage.py v7.3.2, + created at 2023-12-04 22:59 +0000 +
+Module | +statements | +missing | +excluded | +coverage | +
---|---|---|---|---|
binette/__init__.py | +0 | +0 | +0 | +100% | +
binette/bin_manager.py | +184 | +184 | +0 | +0% | +
binette/bin_quality.py | +112 | +112 | +0 | +0% | +
binette/binette.py | +134 | +134 | +0 | +0% | +
binette/cds.py | +59 | +59 | +0 | +0% | +
binette/contig_manager.py | +11 | +11 | +0 | +0% | +
binette/diamond.py | +47 | +47 | +0 | +0% | +
binette/io_manager.py | +55 | +55 | +0 | +0% | +
Total | +602 | +602 | +0 | +0% | +
+ No items found using the specified filter. +
+