From 957ebc43ba6ac8c31bf0a10088459c60e5c8f057 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Sun, 24 Mar 2024 14:56:13 +0100 Subject: [PATCH 01/17] stats: calculate the divergence point and corresponding fractions --- pairtools/lib/stats.py | 203 ++++++++++++++++++++++++++++++++--------- 1 file changed, 159 insertions(+), 44 deletions(-) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 16ed11a..184a540 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -12,6 +12,31 @@ logger = get_logger() +def flatten_dict(d, parent_key='', sep='/'): + """Flatten a nested dictionary to a flat dictionary. + Parameters + ---------- + d: dict + A nested dictionary to flatten. + parent_key: str + The parent key to be added to the key. + sep: str + The separator to use between the parent key and the key. + Returns + ------- + dict + A flat dictionary. + """ + items = [] + for k, v in d.items(): + new_key = f"{parent_key}{sep}{k}" if parent_key else k + if isinstance(v, dict): + items.extend(flatten_dict(v, new_key, sep=sep).items()) + else: + items.append((new_key, v)) + return dict(items) + + class PairCounter(Mapping): """ A Counter for Hi-C pairs that accumulates various statistics. @@ -25,6 +50,7 @@ class PairCounter(Mapping): _SEP = "\t" _KEY_SEP = "/" + DIST_FREQ_REL_DIFF_THRESHOLD = 0.05 def __init__( self, @@ -198,18 +224,110 @@ def __iter__(self): def __len__(self): return len(self._stat) + + def find_dist_freq_divergence_distance(self, rel_threshold): + """Finds the largest distance at which the frequency of pairs of reads + with different strands deviates from their average by the specified + relative threshold.""" + + out = {} + all_strands = ["++", "--", "-+", "+-"] + + for filter in self.filters: + out[filter] = {} + + dist_freqs_by_strands = { + strands: np.array(list(self._stat[filter]["dist_freq"][strands].values())) + for strands in all_strands} + + # Calculate the average frequency of pairs with different strands + avg_freq_all_strands = np.mean(np.vstack(list(dist_freqs_by_strands.values())), axis=0) + + # Calculate the largest distance at which the frequency of pairs of at least one strand combination deviates from the average by the given threshold + rel_deviations = {strands: np.nan_to_num( + np.abs(dist_freqs_by_strands[strands] - avg_freq_all_strands) + / avg_freq_all_strands) + for strands in all_strands} + + idx_maxs = {strand:0 for strand in all_strands} + for strands in all_strands: + bin_exceeds = rel_deviations[strands] > rel_threshold + if np.any(bin_exceeds): + idx_maxs[strands] = np.max(np.nonzero(bin_exceeds)) + + # Find the largest distance and the strand combination where frequency of pairs deviates from the average by the given threshold: + divergence_bin_idx = 0 + divergence_strands = '??' 
+            divergence_bin = '0-0'
+
+            for strands in all_strands:
+                if (idx_maxs[strands] > divergence_bin_idx):
+                    divergence_bin_idx = idx_maxs[strands]
+                    divergence_strands = strands
+
+                if idx_maxs[strands] < len(self._dist_bins):
+                    divergence_bin = f'{self._dist_bins[divergence_bin_idx]}-{self._dist_bins[divergence_bin_idx+1]}'
+                else:
+                    divergence_bin = f'{self._dist_bins[divergence_bin_idx]}+'
+
+
+            out[filter]["dist_bin"] = divergence_bin
+            out[filter]["strands"] = divergence_strands
+            out[filter]['rel_diff_threshold'] = rel_threshold
+
+            out[filter]['n_cis_pairs_below_divergence_dist'] = {
+                strands:dist_freqs_by_strands[strands][:divergence_bin_idx+1].sum() for strands in all_strands
+            }
+
+            out[filter]['n_cis_pairs_below_divergence_dist_all_strands'] = sum(
+                list(out[filter]['n_cis_pairs_below_divergence_dist'].values()))
+
+            out[filter]['n_cis_pairs_above_divergence_dist'] = {
+                strands:dist_freqs_by_strands[strands][divergence_bin_idx+1:].sum() for strands in all_strands
+            }
+
+            out[filter]['n_cis_pairs_above_divergence_dist_all_strands'] = sum(
+                list(out[filter]['n_cis_pairs_above_divergence_dist'].values()))
+
+            norms = dict(
+                cis=self._stat[filter]['cis'],
+                total_mapped = self._stat[filter]['total_mapped']
+            )
+            if 'total_nodups' in self._stat[filter]:
+                norms['total_nodups'] = self._stat[filter]['total_nodups']
+
+            for key, norm_factor in norms.items():
+                out[filter][f'frac_{key}_in_cis_below_divergence_dist'] = {
+                    strands: n_cis_pairs / norm_factor
+                    for strands, n_cis_pairs in out[filter]['n_cis_pairs_below_divergence_dist'].items()
+                }
+
+                out[filter][f'frac_{key}_in_cis_below_divergence_dist_all_strands'] = sum(
+                    list(out[filter][f'frac_{key}_in_cis_below_divergence_dist'].values()))
+
+
+                out[filter][f'frac_{key}_in_cis_above_divergence_dist'] = {
+                    strands: n_cis_pairs / norm_factor
+                    for strands, n_cis_pairs in out[filter]['n_cis_pairs_above_divergence_dist'].items()
+                }
+
+                out[filter][f'frac_{key}_in_cis_above_divergence_dist_all_strands'] = sum(
+                    list(out[filter][f'frac_{key}_in_cis_above_divergence_dist'].values()))
+
+        return out
+
+
     def calculate_summaries(self):
         """calculate summary statistics (fraction of cis pairs at different cutoffs,
         complexity estimate) based on accumulated counts.
Results are saved into self._stat["filter_name"]['summary"] """ - for key in self.filters.keys(): - self._stat[key]["summary"]["frac_dups"] = ( - (self._stat[key]["total_dups"] / self._stat[key]["total_mapped"]) - if self._stat[key]["total_mapped"] > 0 - else 0 - ) + divergence_stats = self.find_dist_freq_divergence_distance( + self.DIST_FREQ_REL_DIFF_THRESHOLD) + for filter_name in self.filters.keys(): for cis_count in ( "cis", "cis_1kb+", @@ -219,34 +337,43 @@ def calculate_summaries(self): "cis_20kb+", "cis_40kb+", ): - self._stat[key]["summary"][f"frac_{cis_count}"] = ( - (self._stat[key][cis_count] / self._stat[key]["total_nodups"]) - if self._stat[key]["total_nodups"] > 0 + self._stat[filter_name]["summary"][f"frac_{cis_count}"] = ( + (self._stat[filter_name][cis_count] / self._stat[filter_name]["total_nodups"]) + if self._stat[filter_name]["total_nodups"] > 0 else 0 ) - self._stat[key]["summary"][ + self._stat[filter_name]["summary"]["dist_freq_divergence"] = divergence_stats[filter_name] + + self._stat[filter_name]["summary"]["frac_dups"] = ( + (self._stat[filter_name]["total_dups"] / self._stat[filter_name]["total_mapped"]) + if self._stat[filter_name]["total_mapped"] > 0 + else 0 + ) + + self._stat[filter_name]["summary"][ "complexity_naive" ] = estimate_library_complexity( - self._stat[key]["total_mapped"], self._stat[key]["total_dups"], 0 + self._stat[filter_name]["total_mapped"], self._stat[filter_name]["total_dups"], 0 ) - if key == "no_filter" and self._save_bytile_dups: + + if filter_name == "no_filter" and self._save_bytile_dups: # Estimate library complexity with information by tile, if provided: if self._bytile_dups.shape[0] > 0: - self._stat[key]["dups_by_tile_median"] = int( + self._stat[filter_name]["dups_by_tile_median"] = int( round( self._bytile_dups["dup_count"].median() * self._bytile_dups.shape[0] ) ) - if "dups_by_tile_median" in self._stat[key].keys(): - self._stat[key]["summary"][ + if "dups_by_tile_median" in self._stat[filter_name].keys(): + self._stat[filter_name]["summary"][ "complexity_dups_by_tile_median" ] = estimate_library_complexity( - self._stat[key]["total_mapped"], - self._stat[key]["total_dups"], - self._stat[key]["total_dups"] - - self._stat[key]["dups_by_tile_median"], + self._stat[filter_name]["total_mapped"], + self._stat[filter_name]["total_dups"], + self._stat[filter_name]["total_dups"] + - self._stat[filter_name]["dups_by_tile_median"], ) self._summaries_calculated = True @@ -444,24 +571,18 @@ def add_pair( np.searchsorted(self._dist_bins, dist, "right") - 1 ] self._stat[filter]["dist_freq"][strand1 + strand2][dist_bin] += 1 - if dist >= 1000: - self._stat[filter]["cis_1kb+"] += 1 - if dist >= 2000: - self._stat[filter]["cis_2kb+"] += 1 - if dist >= 4000: - self._stat[filter]["cis_4kb+"] += 1 - if dist >= 10000: - self._stat[filter]["cis_10kb+"] += 1 - if dist >= 20000: - self._stat[filter]["cis_20kb+"] += 1 - if dist >= 40000: - self._stat[filter]["cis_40kb+"] += 1 + + for dist_kb in [1, 2, 4, 10, 20, 40]: + if dist >= dist_kb * 1000: + self._stat[filter][f"cis_{dist_kb}kb+"] += 1 + else: self._stat[filter]["trans"] += 1 else: self._stat[filter]["total_single_sided_mapped"] += 1 + def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): """Gather statistics for Hi-C pairs in a dataframe and add to the PairCounter. 
@@ -541,17 +662,21 @@ def add_pairs_from_dataframe(self, df, unmapped_chrom="!"): self._stat[key]["cis"] += df_cis.shape[0] self._stat[key]["trans"] += df_nodups.shape[0] - df_cis.shape[0] + + # Count cis distance frequencies: dist = np.abs(df_cis["pos2"].values - df_cis["pos1"].values) df_cis.loc[:, "bin_idx"] = ( np.searchsorted(self._dist_bins, dist, "right") - 1 ) + for (strand1, strand2, bin_id), strand_bin_count in ( df_cis[["strand1", "strand2", "bin_idx"]].value_counts().items() ): self._stat[key]["dist_freq"][strand1 + strand2][ self._dist_bins[bin_id].item() ] += strand_bin_count + self._stat[key]["cis_1kb+"] += int(np.sum(dist >= 1000)) self._stat[key]["cis_2kb+"] += int(np.sum(dist >= 2000)) self._stat[key]["cis_4kb+"] += int(np.sum(dist >= 4000)) @@ -690,7 +815,7 @@ def flatten(self, filter="no_filter"): for k, v in self._stat[filter].items(): if isinstance(v, int): flat_stat[k] = v - # store nested dicts/arrays in a context dependet manner: + # store nested dicts/arrays in a context dependent manner: # nested categories are stored only if they are non-trivial else: if (k == "dist_freq") and v: @@ -711,15 +836,10 @@ def flatten(self, filter="no_filter"): raise ValueError("There is a mismatch between dist_freq bins in the instance") # store key,value pair: flat_stat[formatted_key] = freqs[dist] - elif (k in ["pair_types", "dedup", "chromsizes"]) and v: + elif (k in ["pair_types", "dedup", "chromsizes", 'summary']) and v: # 'pair_types' and 'dedup' are simple dicts inside, # treat them the exact same way: - for k_item, freq in v.items(): - formatted_key = self._KEY_SEP.join(["{}", "{}"]).format( - k, k_item - ) - # store key,value pair: - flat_stat[formatted_key] = freq + flat_stat.update(flatten_dict(v, k, self._KEY_SEP)) elif (k == "chrom_freq") and v: for (chrom1, chrom2), freq in v.items(): formatted_key = self._KEY_SEP.join(["{}", "{}", "{}"]).format( @@ -727,11 +847,6 @@ def flatten(self, filter="no_filter"): ) # store key,value pair: flat_stat[formatted_key] = freq - elif (k == "summary") and v: - for key, frac in v.items(): - formatted_key = self._KEY_SEP.join(["{}", "{}"]).format(k, key) - # store key,value pair: - flat_stat[formatted_key] = frac # return flattened dict return flat_stat From e627a6a02600d71d3afd3225063ccb49ae0b59b7 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Sun, 24 Mar 2024 17:43:41 +0100 Subject: [PATCH 02/17] stats: refactor code to manipulate nested dicts; pass tests --- pairtools/lib/stats.py | 336 +++++++++++++++++++++++++++-------------- 1 file changed, 219 insertions(+), 117 deletions(-) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 184a540..d415a7d 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -12,31 +12,183 @@ logger = get_logger() -def flatten_dict(d, parent_key='', sep='/'): +def parse_number(s): + if s.isdigit(): + return int(s) + elif s.replace(".", "", 1).isdigit(): + return float(s) + else: + return s + + +def flat_dict_to_nested(input_dict, sep='/'): + output_dict = {} + + for key, value in input_dict.items(): + if type(key) == tuple: + key_parts = key + elif type(key) == str: + key_parts = key.split(sep) + else: + raise ValueError(f"Key type can be either str or tuple. 
Found key {key} of type {type(key)}.")
+
+        current_dict = output_dict
+        for key_part in key_parts[:-1]:
+            current_dict = current_dict.setdefault(key_part, {})
+        current_dict[key_parts[-1]] = value
+
+    return output_dict
+
+
+def nested_dict_to_flat(d, tuple_keys=False, sep='/'):
     """Flatten a nested dictionary to a flat dictionary.
+
     Parameters
     ----------
     d: dict
         A nested dictionary to flatten.
-    parent_key: str
-        The parent key to be added to the key.
+    tuple_keys: bool
+        If True, keys will be joined into tuples. Otherwise, they will be joined into strings.
     sep: str
-        The separator to use between the parent key and the key.
+        The separator to use between the parent key and the key if tuple_keys==False.
     Returns
     -------
     dict
         A flat dictionary.
     """
-    items = []
-    for k, v in d.items():
-        new_key = f"{parent_key}{sep}{k}" if parent_key else k
-        if isinstance(v, dict):
-            items.extend(flatten_dict(v, new_key, sep=sep).items())
+
+    if tuple_keys:
+        join_keys = lambda k1, k2: (k1,) + k2
+    else:
+        join_keys = lambda k1, k2: (k1 + sep + k2) if k2 else k1
+
+    out = {}
+
+    for k1, v1 in d.items():
+        if isinstance(v1, dict):
+            out.update({
+                join_keys(k1, k2): v2
+                for k2, v2 in nested_dict_to_flat(v1, tuple_keys, sep).items()
+            })
         else:
-            items.append((new_key, v))
-    return dict(items)
+            if tuple_keys:
+                out[(k1,)] = v1
+            else:
+                out[k1] = v1
+
+    return out
+
+
+def is_nested_dict(d):
+    """Check if a dictionary is nested.
+
+    Parameters
+    ----------
+    d: dict
+        A dictionary to check.
+    Returns
+    -------
+    bool
+        True if the dictionary is nested, False otherwise.
+    """
+
+    if not isinstance(d, dict):
+        return False
+
+    for v in d.values():
+        if isinstance(v, dict):
+            return True
+
+    return False
+
+
+def is_tuple_keyed_dict(d):
+    """Check if a dictionary is tuple-keyed, i.e. flat with tuple keys.
+
+    Parameters
+    ----------
+    d: dict
+        A dictionary to check.
+    Returns
+    -------
+    bool
+        True if the dictionary is tuple-keyed, False otherwise.
+    """
+
+    if not isinstance(d, dict):
+        return False
+
+    for k, v in d.items():
+        if not isinstance(k, tuple):
+            return False
+        if isinstance(v, dict):
+            return False
+
+    return True
+
+
+def is_str_keyed_dict(d):
+    """Check if a dictionary is string-keyed, i.e. flat with string keys.
+
+    Parameters
+    ----------
+    d: dict
+        A dictionary to check.
+    Returns
+    -------
+    bool
+        True if the dictionary is string-keyed, False otherwise.
+    """
+
+    if not isinstance(d, dict):
+        return False
+
+    for k, v in d.items():
+        if not isinstance(k, str):
+            return False
+        if isinstance(v, dict):
+            return False
+
+    return True
+
+
+def swap_levels_nested_dict(nested_dict, level1, level2, sep='/'):
+    """Swap the order of two levels in a nested dictionary.
+
+    Parameters
+    ----------
+    nested_dict: dict
+        A nested dictionary.
+    level1: int
+        The index of the first level to swap.
+    level2: int
+        The index of the second level to swap.
+    Returns
+    -------
+    dict
+        A nested dictionary with the levels swapped.
+ """ + + if is_tuple_keyed_dict(nested_dict): + out = {} + for k1, v1 in nested_dict.items(): + k1_list = list(k1) + k1_list[level1], k1_list[level2] = k1_list[level2], k1_list[level1] + out[tuple(k1_list)] = v1 + return out + + elif is_nested_dict(nested_dict): + out = nested_dict_to_flat(nested_dict, tuple_keys=True) + out = swap_levels_nested_dict(out, level1, level2) + out = flat_dict_to_nested(out) + return out + + elif is_str_keyed_dict(nested_dict): + out = nested_dict_to_flat(nested_dict, sep=sep) + out = swap_levels_nested_dict(out, level1, level2) + out = {sep.join(k):v for k,v in out.items()} + return out + + else: + raise ValueError("Input dictionary must be either nested, string-keyed or tuple-keyed") + class PairCounter(Mapping): """ A Counter for Hi-C pairs that accumulates various statistics. @@ -103,8 +255,9 @@ def __init__( self._stat[key]["cis"] = 0 self._stat[key]["trans"] = 0 self._stat[key]["pair_types"] = {} + # to be removed: - self._stat[key]["dedup"] = {} + # self._stat[key]["dedup"] = {} self._stat[key]["cis_1kb+"] = 0 self._stat[key]["cis_2kb+"] = 0 @@ -378,6 +531,7 @@ def calculate_summaries(self): self._summaries_calculated = True + @classmethod def from_file(cls, file_handle): """create instance of PairCounter from file @@ -392,96 +546,40 @@ def from_file(cls, file_handle): # fill in from file - file_handle: default_filter = "no_filter" stat_from_file = cls() + raw_stat = {} for l in file_handle: - fields = l.strip().split(cls._SEP) - if len(fields) == 0: + key_val_pair = l.strip().split(cls._SEP) + if len(key_val_pair) == 0: # skip empty lines: continue - if len(fields) != 2: + if len(key_val_pair) != 2: # expect two _SEP separated values per line: raise fileio.ParseError( "{} is not a valid stats file".format(file_handle.name) ) - # extract key and value, then split the key: - putative_key, putative_val = fields[0], fields[1] - key_fields = putative_key.split(cls._KEY_SEP) - # we should impose a rigid structure of .stats or redo it: - if len(key_fields) == 1: - key = key_fields[0] - if key in stat_from_file._stat[default_filter]: - stat_from_file._stat[default_filter][key] = int(fields[1]) - else: - raise fileio.ParseError( - "{} is not a valid stats file: unknown field {} detected".format( - file_handle.name, key - ) - ) - else: - # in this case key must be in ['pair_types','chrom_freq','dist_freq','dedup', 'summary'] - # get the first 'key' and keep the remainders in 'key_fields' - key = key_fields.pop(0) - if key in ["pair_types", "dedup", "summary", "chromsizes"]: - # assert there is only one element in key_fields left: - # 'pair_types', 'dedup', 'summary' and 'chromsizes' treated the same - if len(key_fields) == 1: - try: - stat_from_file._stat[default_filter][key][ - key_fields[0] - ] = int(fields[1]) - except ValueError: - stat_from_file._stat[default_filter][key][ - key_fields[0] - ] = float(fields[1]) - else: - raise fileio.ParseError( - "{} is not a valid stats file: {} section implies 1 identifier".format( - file_handle.name, key - ) - ) + raw_stat[key_val_pair[0]] = parse_number(key_val_pair[1]) + - elif key == "chrom_freq": - # assert remaining key_fields == [chr1, chr2]: - if len(key_fields) == 2: - stat_from_file._stat[default_filter][key][ - tuple(key_fields) - ] = int(fields[1]) - else: - raise fileio.ParseError( - "{} is not a valid stats file: {} section implies 2 identifiers".format( - file_handle.name, key - ) - ) + ## TODO: check if raw_stat does not contain any unknown keys - elif key == "dist_freq": - # assert that last element of 
key_fields is the 'directions' - if len(key_fields) == 2: - # assert 'dirs' in ['++','--','+-','-+'] - dirs = key_fields.pop() - # there is only genomic distance range of the bin that's left: - (bin_range,) = key_fields - # extract left border of the bin "1000000+" or "1500-6000": - dist_bin_left = int( - bin_range.strip("+") - if bin_range.endswith("+") - else bin_range.split("-")[0] - ) - # store corresponding value: - stat_from_file._stat[default_filter][key][dirs][dist_bin_left] = int( - fields[1] - ) - else: - raise fileio.ParseError( - "{} is not a valid stats file: {} section implies 2 identifiers".format( - file_handle.name, key - ) - ) - else: - raise fileio.ParseError( - "{} is not a valid stats file: unknown field {} detected".format( - file_handle.name, key - ) - ) - # return PairCounter from a non-empty dict: + # Convert flat dict to nested dict + stat_from_file._stat[default_filter].update(flat_dict_to_nested(raw_stat, sep=cls._KEY_SEP)) + + stat_from_file._stat[default_filter]['chrom_freq'] = nested_dict_to_flat( + stat_from_file._stat[default_filter]['chrom_freq'], tuple_keys=True) + + bin_to_left_val = lambda bin: int(bin.rstrip('+') if ('+' in bin) else bin.split('-')[0]) + + stat_from_file._stat[default_filter]['dist_freq'] = { + bin_to_left_val(k): v + for k,v in stat_from_file._stat[default_filter]['dist_freq'].items() + } + + stat_from_file._stat[default_filter]['dist_freq'] = swap_levels_nested_dict( + stat_from_file._stat[default_filter]['dist_freq'], 0, 1 + ) + + return stat_from_file @classmethod @@ -717,21 +815,23 @@ def __add__(self, other, filter="no_filter"): # use the empty PairCounter to iterate over: for k, v in sum_stat._stat[filter].items(): if k != "chromsizes" and ( - k not in self._stat[filter] or k not in other._stat[filter] + (k not in self._stat[filter]) or (k not in other._stat[filter]) ): # Skip any missing fields and warn logger.warning( f"{k} not found in at least one of the input stats, skipping" ) continue + # not nested fields are summed trivially: if isinstance(v, int): sum_stat._stat[filter][k] = ( self._stat[filter][k] + other._stat[filter][k] ) + # sum nested dicts/arrays in a context dependet manner: else: - if k in ["pair_types", "dedup", "summary"]: + if k in ["pair_types", "dedup"]: # handy function for summation of a pair of dicts: # https://stackoverflow.com/questions/10461531/merge-and-sum-of-two-dictionaries sum_dicts = lambda dict_x, dict_y: { @@ -742,6 +842,7 @@ def __add__(self, other, filter="no_filter"): sum_stat._stat[filter][k] = sum_dicts( self._stat[filter][k], other._stat[filter][k] ) + elif k == "chrom_freq": # union list of keys (chr1,chr2) with potential duplicates: union_keys_with_dups = list(self._stat[filter][k].keys()) + list( @@ -756,6 +857,7 @@ def __add__(self, other, filter="no_filter"): sum_stat._stat[filter][k][union_key] = self._stat[filter][ k ].get(union_key, 0) + other._stat[filter][k].get(union_key, 0) + elif k == "dist_freq": for dirs in sum_stat[k]: from functools import reduce @@ -771,6 +873,7 @@ def reducer(accumulator, element): {}, ) # sum_stat[k][dirs] = self._stat[filter][k][dirs] + other._stat[filter][k][dirs] + elif k == "chromsizes": if k in self._stat[filter] and k in other._stat[filter]: if self._stat[filter][k] == other._stat[filter][k]: @@ -796,6 +899,8 @@ def reducer(accumulator, element): "One or both stats don't have chromsizes recorded" ) + sum_stat.calculate_summaries() + return sum_stat # we need this to be able to sum(list_of_PairCounters) @@ -836,10 +941,13 @@ def flatten(self, 
filter="no_filter"): raise ValueError("There is a mismatch between dist_freq bins in the instance") # store key,value pair: flat_stat[formatted_key] = freqs[dist] + elif (k in ["pair_types", "dedup", "chromsizes", 'summary']) and v: # 'pair_types' and 'dedup' are simple dicts inside, # treat them the exact same way: - flat_stat.update(flatten_dict(v, k, self._KEY_SEP)) + flat_stat.update( + {k+self._KEY_SEP+k2 : v2 for k2,v2 in nested_dict_to_flat(v, sep=self._KEY_SEP).items()}) + elif (k == "chrom_freq") and v: for (chrom1, chrom2), freq in v.items(): formatted_key = self._KEY_SEP.join(["{}", "{}", "{}"]).format( @@ -857,30 +965,23 @@ def format_yaml(self, filter="no_filter"): from copy import deepcopy - formatted_stat = {key: {} for key in self.filters.keys()} + formatted_stat = {filter_name: {} for filter_name in self.filters.keys()} # Storing statistics for each filter - for key in self.filters.keys(): - for k, v in self._stat[key].items(): - if isinstance(v, int): - formatted_stat[key][k] = v - # store nested dicts/arrays in a context dependet manner: - # nested categories are stored only if they are non-trivial - else: - if (k != "chrom_freq") and v: - # simple dicts inside - # treat them the exact same way: - formatted_stat[key][k] = deepcopy(v) - elif (k == "chrom_freq") and v: - # need to convert tuples of chromosome names to str - freqs = {} - for (chrom1, chrom2), freq in sorted(v.items()): - freqs[ - self._KEY_SEP.join(["{}", "{}"]).format(chrom1, chrom2) - ] = freq - # store key,value pair: - formatted_stat[key][k] = deepcopy(freqs) + for filter_name in self.filters.keys(): + for k, v in self._stat[filter_name].items(): + if (k == "chrom_freq"): + v = {self._KEY_SEP.join(k2):v2 for k2, v2 in v.items()} + if v: + formatted_stat[filter_name][k] = deepcopy(v) # return formatted dict + formatted_stat = nested_dict_to_flat(formatted_stat, tuple_keys=True) + for k in formatted_stat: + v = formatted_stat[k] + if isinstance(v, np.generic): + formatted_stat[k] = v.item() + formatted_stat = flat_dict_to_nested(formatted_stat) + return formatted_stat def save(self, outstream, yaml=False, filter="no_filter"): @@ -915,6 +1016,7 @@ def save(self, outstream, yaml=False, filter="no_filter"): for k, v in data.items(): outstream.write("{}{}{}\n".format(k, self._SEP, v)) + def save_bytile_dups(self, outstream): """save bytile duplication counts to a tab-delimited text file. 
Parameters From c29f92c4ed3ec1652355cc14d2db6011a8934208 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 09:56:38 +0100 Subject: [PATCH 03/17] stats.py: increase n dist bins 2 fold; skip 0-width bins; modify keys --- pairtools/lib/stats.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index d415a7d..0e0d48c 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -208,7 +208,7 @@ def __init__( self, min_log10_dist=0, max_log10_dist=9, - log10_dist_bin_step=0.25, + log10_dist_bin_step=0.125, bytile_dups=False, filters=None, **kwargs, @@ -229,15 +229,17 @@ def __init__( # some variables used for initialization: # genomic distance bining for the ++/--/-+/+- distribution - self._dist_bins = np.r_[ - 0, - np.round( - 10 - ** np.arange( - min_log10_dist, max_log10_dist + 0.001, log10_dist_bin_step - ) - ).astype(np.int_), - ] + self._dist_bins = np.unique( + np.r_[ + 0, + np.round( + 10 + ** np.arange( + min_log10_dist, max_log10_dist + 0.001, log10_dist_bin_step + ) + ).astype(np.int_), + ] + ) # establish structure of an empty _stat: for key in self.filters: @@ -411,7 +413,7 @@ def find_dist_freq_divergence_distance(self, rel_threshold): # Find the largest distance and the strand combination where frequency of pairs deviates from the average by the given threshold: divergence_bin_idx = 0 divergence_strands = '??' - divergence_bin = '0-0' + divergence_dist = '0' for strands in all_strands: if (idx_maxs[strands] > divergence_bin_idx): @@ -419,14 +421,14 @@ def find_dist_freq_divergence_distance(self, rel_threshold): divergence_strands = strands if idx_maxs[strands] < len(self._dist_bins): - divergence_bin = f'{self._dist_bins[divergence_bin_idx]}-{self._dist_bins[divergence_bin_idx+1]}' + divergence_dist = self._dist_bins[divergence_bin_idx+1] else: - divergence_bin = f'{self._dist_bins[divergence_bin_idx]}+' + divergence_dist = np.iinfo(np.int64) - out[filter]["dist_bin"] = divergence_bin - out[filter]["strands"] = divergence_strands - out[filter]['rel_diff_threshold'] = rel_threshold + out[filter]["divergence_dist"] = divergence_dist + out[filter]["divergent_strands"] = divergence_strands + out[filter]['divergence_rel_diff_threshold'] = rel_threshold out[filter]['n_cis_pairs_below_divergence_dist'] = { strands:dist_freqs_by_strands[strands][:divergence_bin_idx+1].sum() for strands in all_strands From 5aac9c42d873af03d5bd839ee71b3894d904672c Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 09:57:16 +0100 Subject: [PATCH 04/17] stats.py: modify keys --- pairtools/lib/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 0e0d48c..3c235f4 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -427,7 +427,7 @@ def find_dist_freq_divergence_distance(self, rel_threshold): out[filter]["divergence_dist"] = divergence_dist - out[filter]["divergent_strands"] = divergence_strands + out[filter]["divergence_strands"] = divergence_strands out[filter]['divergence_rel_diff_threshold'] = rel_threshold out[filter]['n_cis_pairs_below_divergence_dist'] = { From e73c5639fad27b0ef3407211eee1d5251466b477 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 12:09:37 +0100 Subject: [PATCH 05/17] doc: bugfix equations in stats --- doc/stats.rst | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/doc/stats.rst 
b/doc/stats.rst index 49b216a..c47bdbd 100644 --- a/doc/stats.rst +++ b/doc/stats.rst @@ -59,17 +59,23 @@ replacement from a finite pool of fragments in DNA library [1]_ [2]_. With each new sequenced molecule, the expected number of observed unique molecules increases according to a simple equation: -$$ U(N+1) = U(N) + (1 - {U(N) \\over C}), $$ +.. math:: -where $N$ is the number of sequenced molecules, $U(N)$ is the expected number -of observed unique molecules after sequencing $N$ molecules, and C is the library complexity. + U(N+1) = U(N) + (1 - {U(N) \\over C}), + +where :math:`N` is the number of sequenced molecules, :math:`U(N)` is the expected number +of observed unique molecules after sequencing :math:`N` molecules, and :math:`C` is the library complexity. This differential equation yields [1, 2]: -$$ {U(N) \\over C} = 1 - exp( - {N \\over C}), $$ +.. math:: + + {U(N) \\over C} = 1 - exp( - {N \\over C}), which can be solved as -$$ C = \Re(lambert W( - { \exp( - {1 \\over u} ) \\over u} ) ) + {1 \\over u} $$ +.. math:: + + C = \Re(lambert W( - { \exp( - {1 \\over u} ) \\over u} ) ) + {1 \\over u} Library complexity can guide in the choice of sequencing depth of the library and provide an estimate of library quality. From 327bcff5a35cdbb77f9f5645b31a7fdcc20dcdc4 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 12:53:07 +0100 Subject: [PATCH 06/17] update docs --- doc/stats.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/stats.rst b/doc/stats.rst index c47bdbd..5ac5455 100644 --- a/doc/stats.rst +++ b/doc/stats.rst @@ -61,7 +61,7 @@ increases according to a simple equation: .. math:: - U(N+1) = U(N) + (1 - {U(N) \\over C}), + U(N+1) = U(N) + (1 - {U(N) \over C}), where :math:`N` is the number of sequenced molecules, :math:`U(N)` is the expected number of observed unique molecules after sequencing :math:`N` molecules, and :math:`C` is the library complexity. @@ -69,13 +69,13 @@ This differential equation yields [1, 2]: .. math:: - {U(N) \\over C} = 1 - exp( - {N \\over C}), + {U(N) \over C} = 1 - exp( - {N \over C}), which can be solved as .. math:: - C = \Re(lambert W( - { \exp( - {1 \\over u} ) \\over u} ) ) + {1 \\over u} + C = \Re(W_{Lambert}( - { \exp( - {1 \over u} ) \over u} ) ) + {1 \over u} Library complexity can guide in the choice of sequencing depth of the library and provide an estimate of library quality. From 013bcbf01ed797a197f23977b361023fb0eabdcb Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 13:00:39 +0100 Subject: [PATCH 07/17] docs: update math in stats --- doc/stats.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/stats.rst b/doc/stats.rst index 5ac5455..6c55714 100644 --- a/doc/stats.rst +++ b/doc/stats.rst @@ -61,7 +61,7 @@ increases according to a simple equation: .. math:: - U(N+1) = U(N) + (1 - {U(N) \over C}), + U(N+1) = U(N) + \left(1 - \frac{U(N)}{C} \right), where :math:`N` is the number of sequenced molecules, :math:`U(N)` is the expected number of observed unique molecules after sequencing :math:`N` molecules, and :math:`C` is the library complexity. @@ -69,13 +69,13 @@ This differential equation yields [1, 2]: .. math:: - {U(N) \over C} = 1 - exp( - {N \over C}), + {U(N) \over C} = 1 - exp\left( - \frac{N}{C} \right), which can be solved as .. 
math:: - C = \Re(W_{Lambert}( - { \exp( - {1 \over u} ) \over u} ) ) + {1 \over u} + C = \Re \left( W_{Lambert} \left( - \frac{ \exp\left( - \frac{1}{U} \right) } {U} \right) \right) + \frac{1}{U} Library complexity can guide in the choice of sequencing depth of the library and provide an estimate of library quality. From 5ac84ae93549e349e933a2b95ce9004679a03ad6 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 13:13:47 +0100 Subject: [PATCH 08/17] test: fix tests failing due to smaller bin size --- tests/test_stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_stats.py b/tests/test_stats.py index 0a108e0..1b037dd 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -45,7 +45,7 @@ def test_mock_pairsam(): for orientation in ("++", "+-", "-+", "--"): s = stats["no_filter"]["dist_freq"][orientation] for k, val in s.items(): - if orientation == "++" and k in [1, 2, 32]: + if orientation == "++" and k in [1, 2, 42]: assert s[k] == 1 else: assert s[k] == 0 From a2f55c0ad15edfa5dbd2ab868d765e0aa6f1d66a Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 14:09:34 +0100 Subject: [PATCH 09/17] stats: expose n_dist_bins_decade; rename divergence_dist->convergence_dist --- pairtools/cli/stats.py | 16 +++++-- pairtools/lib/stats.py | 100 +++++++++++++++++++++++------------------ 2 files changed, 69 insertions(+), 47 deletions(-) diff --git a/pairtools/cli/stats.py b/pairtools/cli/stats.py index 9001bfc..c7166ac 100644 --- a/pairtools/cli/stats.py +++ b/pairtools/cli/stats.py @@ -28,6 +28,14 @@ " all overlapping statistics. Non-overlapping statistics are appended to" " the end of the file. Supported for tsv stats with single filter.", ) +@click.option( + "--n-dist-bins-decade", + type=int, + default=PairCounter.N_DIST_BINS_DECADE_DEFAULT, + show_default=True, + required=False, + help="Number of bins to split the distance range in log10-space, specified per a factor of 10 difference.", +) @click.option( "--with-chromsizes/--no-chromsizes", is_flag=True, @@ -107,7 +115,7 @@ ) @common_io_options def stats( - input_path, output, merge, bytile_dups, output_bytile_stats, filter, **kwargs + input_path, output, merge, n_dist_bins_decade, bytile_dups, output_bytile_stats, filter, **kwargs ): """Calculate pairs statistics. 
@@ -123,6 +131,7 @@ def stats( input_path, output, merge, + n_dist_bins_decade, bytile_dups, output_bytile_stats, filter, @@ -131,10 +140,10 @@ def stats( def stats_py( - input_path, output, merge, bytile_dups, output_bytile_stats, filter, **kwargs + input_path, output, merge, n_dist_bins_decade, bytile_dups, output_bytile_stats, filter, **kwargs ): if merge: - do_merge(output, input_path, **kwargs) + do_merge(output, input_path, n_dist_bins_decade=n_dist_bins_decade, **kwargs) return if len(input_path) == 0: @@ -181,6 +190,7 @@ def stats_py( filter = None stats = PairCounter( + n_dist_bins_decade=n_dist_bins_decade, bytile_dups=bytile_dups, filters=filter, startup_code=kwargs.get("startup_code", ""), # for evaluation of filters diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py index 3c235f4..8f2ff69 100644 --- a/pairtools/lib/stats.py +++ b/pairtools/lib/stats.py @@ -203,12 +203,15 @@ class PairCounter(Mapping): _SEP = "\t" _KEY_SEP = "/" DIST_FREQ_REL_DIFF_THRESHOLD = 0.05 + N_DIST_BINS_DECADE_DEFAULT = 8 + MIN_LOG10_DIST_DEFAULT = 0 + MAX_LOG10_DIST_DEFAULT = 9 def __init__( self, - min_log10_dist=0, - max_log10_dist=9, - log10_dist_bin_step=0.125, + min_log10_dist=MIN_LOG10_DIST_DEFAULT, + max_log10_dist=MAX_LOG10_DIST_DEFAULT, + n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT, bytile_dups=False, filters=None, **kwargs, @@ -229,6 +232,7 @@ def __init__( # some variables used for initialization: # genomic distance bining for the ++/--/-+/+- distribution + log10_dist_bin_step = 1.0 / n_dist_bins_decade self._dist_bins = np.unique( np.r_[ 0, @@ -303,6 +307,7 @@ def __init__( ) self._summaries_calculated = False + def __getitem__(self, key, filter="no_filter"): if isinstance(key, str): # let's strip any unintentional '/' @@ -373,14 +378,16 @@ def __getitem__(self, key, filter="no_filter"): else: raise ValueError("{} is not a valid key".format(k)) + def __iter__(self): return iter(self._stat) + def __len__(self): return len(self._stat) - def find_dist_freq_divergence_distance(self, rel_threshold): + def find_dist_freq_convergence_distance(self, rel_threshold): """Finds the largest distance at which the frequency of pairs of reads with different strands deviates from their average by the specified relative threshold.""" @@ -411,65 +418,60 @@ def find_dist_freq_divergence_distance(self, rel_threshold): idx_maxs[strands] = np.max(np.nonzero(bin_exceeds)) # Find the largest distance and the strand combination where frequency of pairs deviates from the average by the given threshold: - divergence_bin_idx = 0 - divergence_strands = '??' - divergence_dist = '0' + convergence_bin_idx = 0 + convergence_strands = '??' 
+            convergence_dist = '0'
 
             for strands in all_strands:
-                if (idx_maxs[strands] > divergence_bin_idx):
-                    divergence_bin_idx = idx_maxs[strands]
-                    divergence_strands = strands
+                if (idx_maxs[strands] > convergence_bin_idx):
+                    convergence_bin_idx = idx_maxs[strands]
+                    convergence_strands = strands
 
                 if idx_maxs[strands] < len(self._dist_bins):
-                    divergence_dist = self._dist_bins[divergence_bin_idx+1]
+                    convergence_dist = self._dist_bins[convergence_bin_idx+1]
                 else:
-                    divergence_dist = np.iinfo(np.int64)
+                    convergence_dist = np.iinfo(np.int64).max
 
 
-            out[filter]["divergence_dist"] = divergence_dist
-            out[filter]["divergence_strands"] = divergence_strands
-            out[filter]['divergence_rel_diff_threshold'] = rel_threshold
+            out[filter]["convergence_dist"] = convergence_dist
+            out[filter]["strands_w_max_convergence_dist"] = convergence_strands
+            out[filter]['convergence_rel_diff_threshold'] = rel_threshold
 
-            out[filter]['n_cis_pairs_below_divergence_dist'] = {
-                strands:dist_freqs_by_strands[strands][:divergence_bin_idx+1].sum() for strands in all_strands
+            out[filter]['n_cis_pairs_below_convergence_dist'] = {
+                strands:dist_freqs_by_strands[strands][:convergence_bin_idx+1].sum() for strands in all_strands
             }
 
-            out[filter]['n_cis_pairs_below_divergence_dist_all_strands'] = sum(
-                list(out[filter]['n_cis_pairs_below_divergence_dist'].values()))
+            out[filter]['n_cis_pairs_below_convergence_dist_all_strands'] = sum(
+                list(out[filter]['n_cis_pairs_below_convergence_dist'].values()))
 
-            out[filter]['n_cis_pairs_above_divergence_dist'] = {
-                strands:dist_freqs_by_strands[strands][divergence_bin_idx+1:].sum() for strands in all_strands
+            n_cis_pairs_above_convergence_dist = {
+                strands:dist_freqs_by_strands[strands][convergence_bin_idx+1:].sum() for strands in all_strands
             }
 
-            out[filter]['n_cis_pairs_above_divergence_dist_all_strands'] = sum(
-                list(out[filter]['n_cis_pairs_above_divergence_dist'].values()))
+            out[filter]['n_cis_pairs_above_convergence_dist_all_strands'] = sum(
+                list(n_cis_pairs_above_convergence_dist.values()))
 
             norms = dict(
                 cis=self._stat[filter]['cis'],
-                total_mapped = self._stat[filter]['total_mapped']
+                total_mapped=self._stat[filter]['total_mapped']
             )
+
             if 'total_nodups' in self._stat[filter]:
                 norms['total_nodups'] = self._stat[filter]['total_nodups']
 
             for key, norm_factor in norms.items():
-                out[filter][f'frac_{key}_in_cis_below_divergence_dist'] = {
+                out[filter][f'frac_{key}_in_cis_below_convergence_dist'] = {
                     strands: n_cis_pairs / norm_factor
-                    for strands, n_cis_pairs in out[filter]['n_cis_pairs_below_divergence_dist'].items()
+                    for strands, n_cis_pairs in out[filter]['n_cis_pairs_below_convergence_dist'].items()
                 }
 
-                out[filter][f'frac_{key}_in_cis_below_divergence_dist_all_strands'] = sum(
-                    list(out[filter][f'frac_{key}_in_cis_below_divergence_dist'].values()))
+                out[filter][f'frac_{key}_in_cis_below_convergence_dist_all_strands'] = sum(
+                    list(out[filter][f'frac_{key}_in_cis_below_convergence_dist'].values()))
 
-
-                out[filter][f'frac_{key}_in_cis_above_divergence_dist'] = {
-                    strands: n_cis_pairs / norm_factor
-                    for strands, n_cis_pairs in out[filter]['n_cis_pairs_above_divergence_dist'].items()
-                }
-
-                out[filter][f'frac_{key}_in_cis_above_divergence_dist_all_strands'] = sum(
-                    list(out[filter][f'frac_{key}_in_cis_above_divergence_dist'].values()))
+                out[filter][f'frac_{key}_in_cis_above_convergence_dist_all_strands'] = (
+                    sum(list(n_cis_pairs_above_convergence_dist.values())) / norm_factor )
 
         return out
 
@@ -479,7 +481,7 @@ def 
calculate_summaries(self): complexity estimate) based on accumulated counts. Results are saved into self._stat["filter_name"]['summary"] """ - divergence_stats = self.find_dist_freq_divergence_distance( + convergence_stats = self.find_dist_freq_convergence_distance( self.DIST_FREQ_REL_DIFF_THRESHOLD) for filter_name in self.filters.keys(): @@ -498,7 +500,7 @@ def calculate_summaries(self): else 0 ) - self._stat[filter_name]["summary"]["dist_freq_divergence"] = divergence_stats[filter_name] + self._stat[filter_name]["summary"]["dist_freq_convergence"] = convergence_stats[filter_name] self._stat[filter_name]["summary"]["frac_dups"] = ( (self._stat[filter_name]["total_dups"] / self._stat[filter_name]["total_mapped"]) @@ -535,7 +537,7 @@ def calculate_summaries(self): @classmethod - def from_file(cls, file_handle): + def from_file(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT): """create instance of PairCounter from file Parameters ---------- @@ -547,7 +549,7 @@ def from_file(cls, file_handle): """ # fill in from file - file_handle: default_filter = "no_filter" - stat_from_file = cls() + stat_from_file = cls(n_dist_bins_decade=n_dist_bins_decade) raw_stat = {} for l in file_handle: key_val_pair = l.strip().split(cls._SEP) @@ -585,7 +587,7 @@ def from_file(cls, file_handle): return stat_from_file @classmethod - def from_yaml(cls, file_handle): + def from_yaml(cls, file_handl, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT): """create instance of PairCounter from file Parameters ---------- @@ -598,6 +600,7 @@ def from_yaml(cls, file_handle): # fill in from file - file_handle: stat = yaml.safe_load(file_handle) stat_from_file = cls( + n_dist_bins_decade=n_dist_bins_decade, filters={key: val.get("filter_expression", "") for key, val in stat.items()} ) @@ -929,6 +932,12 @@ def flatten(self, filter="no_filter"): for i in range(len(self._dist_bins)): for dirs, freqs in v.items(): dist = self._dist_bins[i] + + # in some previous versions of stats, last bin was not reported, so we need to skip it now: + if (i == len(self._dist_bins) - 1) and dist not in freqs: + flat_stat[formatted_key] = 0 + continue + # last bin is treated differently: "100000+" vs "1200-3000": if i < len(self._dist_bins) - 1: dist_next = self._dist_bins[i + 1] @@ -942,7 +951,10 @@ def flatten(self, filter="no_filter"): else: raise ValueError("There is a mismatch between dist_freq bins in the instance") # store key,value pair: - flat_stat[formatted_key] = freqs[dist] + try: + flat_stat[formatted_key] = freqs[dist] + except: + raise ValueError(f"Error in {k} {dirs} {dist} {dist_next} {freqs}: source and destination bins do not match") elif (k in ["pair_types", "dedup", "chromsizes", 'summary']) and v: # 'pair_types' and 'dedup' are simple dicts inside, @@ -1050,9 +1062,9 @@ def do_merge(output, files_to_merge, **kwargs): ) # use a factory method to instanciate PairCounter if kwargs.get("yaml", False): - stat = PairCounter.from_yaml(f) + stat = PairCounter.from_yaml(f, n_dist_bins_decade=kwargs.get('n_dist_bins_decade', PairCounter.N_DIST_BINS_DECADE_DEFAULT)) else: - stat = PairCounter.from_file(f) + stat = PairCounter.from_file(f, n_dist_bins_decade=kwargs.get('n_dist_bins_decade', PairCounter.N_DIST_BINS_DECADE_DEFAULT)) stats.append(stat) f.close() From c0f053cc4ded96c29e500cc531cd80e295642f6d Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Mon, 25 Mar 2024 14:10:57 +0100 Subject: [PATCH 10/17] stats: bugfix --- pairtools/lib/stats.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pairtools/lib/stats.py b/pairtools/lib/stats.py
index 8f2ff69..24ceb2d 100644
--- a/pairtools/lib/stats.py
+++ b/pairtools/lib/stats.py
@@ -587,7 +587,7 @@ def from_file(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT):
         return stat_from_file
 
     @classmethod
-    def from_yaml(cls, file_handl, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT):
+    def from_yaml(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT):
         """create instance of PairCounter from file
         Parameters
         ----------

From 1f653b5b6882bb768422d014f646128faa7cd8d0 Mon Sep 17 00:00:00 2001
From: Anton Goloborodko
Date: Mon, 25 Mar 2024 16:50:04 +0100
Subject: [PATCH 11/17] stats doc: bugfix

---
 doc/stats.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/stats.rst b/doc/stats.rst
index 6c55714..4895767 100644
--- a/doc/stats.rst
+++ b/doc/stats.rst
@@ -14,7 +14,7 @@ output file.
 - **Global statistics** include:
 
     - number of pairs (total, unmapped, single-side mapped, etc.),
-    - total number of different pair types (UU, NN, NU, and others, see ` Pair types in pairtools docs `_),
+    - total number of different pair types (UU, NN, NU, and others, see `Pair types in pairtools docs `_),
    - number of contacts between all chromosome pairs
 
 - **Summary statistics** include:

From ffbde6b015ce7bc5a997dbbae2570446d608520c Mon Sep 17 00:00:00 2001
From: Anton Goloborodko
Date: Mon, 25 Mar 2024 16:58:18 +0100
Subject: [PATCH 12/17] stats.py: fix recently introduced bugs

---
 pairtools/lib/stats.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/pairtools/lib/stats.py b/pairtools/lib/stats.py
index 24ceb2d..e008314 100644
--- a/pairtools/lib/stats.py
+++ b/pairtools/lib/stats.py
@@ -614,6 +614,7 @@ def from_yaml(cls, file_handle, n_dist_bins_decade=N_DIST_BINS_DECADE_DEFAULT):
         stat_from_file._stat = stat
         return stat_from_file
 
+
     def add_pair(
         self,
         chrom1,
@@ -933,11 +934,6 @@ def flatten(self, filter="no_filter"):
             for dirs, freqs in v.items():
                 dist = self._dist_bins[i]
 
-                # in some previous versions of stats, last bin was not reported, so we need to skip it now:
-                if (i == len(self._dist_bins) - 1) and dist not in freqs:
-                    flat_stat[formatted_key] = 0
-                    continue
-
                 # last bin is treated differently: "100000+" vs "1200-3000":
                 if i < len(self._dist_bins) - 1:
                     dist_next = self._dist_bins[i + 1]
@@ -950,11 +946,16 @@ def flatten(self, filter="no_filter"):
                 ).format(k, dist, dirs)
             else:
                 raise ValueError("There is a mismatch between dist_freq bins in the instance")
+
             # store key,value pair:
-            flat_stat[formatted_key] = freqs[dist]
+            try:
+                flat_stat[formatted_key] = freqs[dist]
+            except KeyError:
+                # in some previous versions of stats, last bin was not reported, so we need to skip it now:
+                if (dist not in freqs) and (i == len(self._dist_bins) - 1):
+                    flat_stat[formatted_key] = 0
+                else:
+                    raise ValueError(f"Error in {k} {dirs} {dist} {dist_next} {freqs}: source and destination bins do not match")

From fc282e464d6230e2fb3a91b616b3c72f275dee7e Mon Sep 17 00:00:00 2001
From: Anton Goloborodko
Date: Tue, 26 Mar 2024 11:39:06 +0100
Subject: [PATCH 13/17] update phase docs

---
 doc/examples/pairtools_phase_walkthrough.ipynb | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/doc/examples/pairtools_phase_walkthrough.ipynb b/doc/examples/pairtools_phase_walkthrough.ipynb
index 279de7c..9905d5a 100644
--- a/doc/examples/pairtools_phase_walkthrough.ipynb
+++ b/doc/examples/pairtools_phase_walkthrough.ipynb
@@ -7,8 +7,6 @@
    "source": [
     "# Pairtools phase walkthrough\n",
     "\n",
-    "## Pairtools Phase Walkthrough\n",
-    "\n",
     "Welcome to the pairtools phase walkthrough! This notebook will guide you through the process of resolving contacts between homologous chromosomes using haplotype-resolved Hi-C analysis.\n",
     "\n",
     "## What is haplotype-resolved Hi-C?\n",
@@ -111,7 +109,7 @@
    "id": "5ab026af-fe25-4a70-82ef-52af6fb25371",
    "metadata": {},
    "source": [
-    "## Create the reference genome\n",
+    "## Create the homolog-resolved genome\n",
     "\n",
     "To phase input reads, we need to map the data to the concatenated genome with two haplotypes. \n",
     "Below, we will generate such a genome in several steps. You will need a reference genome, and one or two lists of mutations to introduce to the reference.\n",
@@ -588,7 +586,8 @@
    "id": "a8a9fd19",
    "metadata": {},
    "source": [
-    "# Map data with bwa mem to diploid genome\n",
+    "## Map data with bwa mem to diploid genome\n",
+    "\n",
     "In homolog-resolved Hi-C experiments, reads are first aligned against the reference genome and then parsed with pairtools, similar to the standard pairtools-based Hi-C pipeline. However, an additional challenge arises in distinguishing between un-phaseable reads (reads that map equally well to two homologous locations on two homologous chromosomes) and multimappers (reads that map to repeats in the genome).\n",
     "\n",
     "To differentiate between these cases, we can examine the top three candidate alignments for each read:\n",

From c0857498b0be8cac17ecaa69745ec9c1e39aece2 Mon Sep 17 00:00:00 2001
From: Anton Goloborodko
Date: Thu, 28 Mar 2024 12:20:46 +0100
Subject: [PATCH 14/17] Update protocols_pipelines.rst

---
 doc/protocols_pipelines.rst | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/doc/protocols_pipelines.rst b/doc/protocols_pipelines.rst
index 4851a3f..f5d9920 100644
--- a/doc/protocols_pipelines.rst
+++ b/doc/protocols_pipelines.rst
@@ -9,7 +9,7 @@ Typical Hi-C Workflow
 ----------------------
 A typical pairtools workflow for processing standard Hi-C data is outlined below.
-Please, note that this is a shorter version; you can find a more detailed and reproducible example in chapter :ref:`examples/pairtools_walkthrough`.
+Please note that this is a shorter version. For a detailed, reproducible example, see the Jupyter notebook "Pairtools Walkthrough".
 
 1. Align sequences to the reference genome with ``bwa mem``:
@@ -113,6 +113,9 @@ Technical tips
 Each pairtool has the CLI flags --nproc-in and --nproc-out to control the number of cores dedicated to input decompression and output compression. Additionally, `pairtools sort` parallelizes sorting with `--nproc`.
-Example Workflows
------------------
+Advanced Workflows
+------------------
+For more advanced workflows, please check the following projects:
+- [Distiller-NF](https://github.com/open2c/distiller-nf) is a feature-rich Open2C Hi-C processing pipeline for the Nextflow workflow manager.
+- [Distiller-SM](https://github.com/open2c/distiller-sm) is a similarly feature-rich and optimized pipeline implemented in Snakemake.
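The pipelines above process reads in chunks and merge the resulting per-chunk
.stats files at the end. A minimal sketch of that merge step with the
PairCounter API from this series follows; the file names are hypothetical,
n_dist_bins_decade is assumed to match the value used when the stats were
written, and summing a list of counters relies on PairCounter.__add__, as
noted in stats.py:

    from pairtools.lib.stats import PairCounter

    counters = []
    for path in ["chunk1.stats", "chunk2.stats"]:  # hypothetical inputs
        with open(path) as f:
            # n_dist_bins_decade must match the value used to write the stats
            counters.append(PairCounter.from_file(f, n_dist_bins_decade=8))

    merged = sum(counters)  # __add__ merges counts and recalculates summaries
    with open("merged.stats", "w") as out:
        merged.save(out)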
From db383f96809b00ff13cd7c152c7fd84f1ee364b1 Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Thu, 28 Mar 2024 12:20:56 +0100 Subject: [PATCH 15/17] Update protocols_pipelines.rst --- doc/protocols_pipelines.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/protocols_pipelines.rst b/doc/protocols_pipelines.rst index f5d9920..bd7c89b 100644 --- a/doc/protocols_pipelines.rst +++ b/doc/protocols_pipelines.rst @@ -116,6 +116,7 @@ Technical tips Advanced Workflows ------------------ For more advanced workflows, please check the following projects: + - [Distiller-NF](https://github.com/open2c/distiller-nf) is a feature-rich Open2C Hi-C processing pipeline for the Nextflow workflow manager. - [Distiller-SM](https://github.com/open2c/distiller-sm) is a similarly feature-rich and optimized pipeline implemented in Snakemake. From f1133c912f087861659f79a657ef78f947bdcc1c Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Thu, 28 Mar 2024 12:22:05 +0100 Subject: [PATCH 16/17] Update protocols_pipelines.rst --- doc/protocols_pipelines.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/protocols_pipelines.rst b/doc/protocols_pipelines.rst index bd7c89b..28f49f4 100644 --- a/doc/protocols_pipelines.rst +++ b/doc/protocols_pipelines.rst @@ -117,6 +117,6 @@ Advanced Workflows ------------------ For more advanced workflows, please check the following projects: -- [Distiller-NF](https://github.com/open2c/distiller-nf) is a feature-rich Open2C Hi-C processing pipeline for the Nextflow workflow manager. -- [Distiller-SM](https://github.com/open2c/distiller-sm) is a similarly feature-rich and optimized pipeline implemented in Snakemake. +- `Distiller-nf `_ is a feature-rich Open2C Hi-C processing pipeline for the Nextflow workflow manager. +- `Distiller-sm `_ is a similarly feature-rich and optimized pipeline implemented in Snakemake. 
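The convergence criterion exercised by the tests in the next patch can be
reproduced by hand with a few lines of numpy. The sketch below strips
find_dist_freq_convergence_distance down to its core steps; the toy counts
are the same as in the "f1" fixture of the tests, and this is an
illustration only, not the library API:

    import numpy as np

    dist_bins = np.array([1, 1000, 10000, 100000, 1000000])  # left bin edges
    dist_freq = {
        "++": np.array([80, 80, 91, 95]),
        "--": np.array([100, 100, 100, 100]),
        "-+": np.array([100, 100, 100, 100]),
        "+-": np.array([120, 120, 109, 105]),
    }
    rel_threshold = 0.1

    # average pair frequency across the four strand combinations, per bin
    avg = np.mean(np.vstack(list(dist_freq.values())), axis=0)

    # the last bin where any strand combination deviates from the average
    # by more than rel_threshold
    idx_max = 0
    for freqs in dist_freq.values():
        exceeds = np.nan_to_num(np.abs(freqs - avg) / avg) > rel_threshold
        if np.any(exceeds):
            idx_max = max(idx_max, int(np.max(np.nonzero(exceeds))))

    # strand frequencies converge to their average above this distance
    print(dist_bins[idx_max + 1])  # 10000, matching the test assertion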
From 00ef8e6f38a7e1a61a6371956e78dcce2d3936fe Mon Sep 17 00:00:00 2001 From: Anton Goloborodko Date: Sun, 7 Apr 2024 16:01:59 +0200 Subject: [PATCH 17/17] test convergence point --- tests/test_stats.py | 59 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/test_stats.py b/tests/test_stats.py index 1b037dd..8ead642 100644 --- a/tests/test_stats.py +++ b/tests/test_stats.py @@ -5,6 +5,9 @@ import numpy as np import yaml +import pytest + + testdir = os.path.dirname(os.path.realpath(__file__)) @@ -131,3 +134,59 @@ def test_merge_stats(): print(e.output) print(sys.exc_info()) raise e + + +from pairtools.lib.stats import PairCounter + +@pytest.fixture +def pair_counter(): + counter = PairCounter(filters={"f1": "filter1", "f2": "filter2"}) + counter._dist_bins = np.array([1, 1000, 10000, 100000, 1000000]) + # Populate the counter with some sample data + counter._stat["f1"]["dist_freq"] = { + "++": {1: 80, 1000: 80, 10000: 91, 100000: 95}, + "--": {1: 100, 1000: 100, 10000: 100, 100000: 100}, + "-+": {1: 100, 1000: 100, 10000: 100, 100000: 100}, + "+-": {1: 120, 1000: 120, 10000: 109, 100000: 105}, + } + + counter._stat["f2"]["dist_freq"] = { + "++": {1: 200, 1000: 180, 10000: 160, 100000: 140}, + "--": {1: 220, 1000: 190, 10000: 170, 100000: 150}, + "-+": {1: 210, 1000: 185, 10000: 165, 100000: 145}, + "+-": {1: 230, 1000: 195, 10000: 175, 100000: 155}, + } + + return counter + + +def test_find_dist_freq_convergence_distance(pair_counter): + result = pair_counter.find_dist_freq_convergence_distance(0.1) + + assert "f1" in result + assert "f2" in result + + f1_result = result["f1"] + assert "convergence_dist" in f1_result + assert "strands_w_max_convergence_dist" in f1_result + assert "convergence_rel_diff_threshold" in f1_result + assert "n_cis_pairs_below_convergence_dist" in f1_result + assert "n_cis_pairs_below_convergence_dist_all_strands" in f1_result + assert "n_cis_pairs_above_convergence_dist_all_strands" in f1_result + assert "frac_cis_in_cis_below_convergence_dist" in f1_result + assert "frac_cis_in_cis_below_convergence_dist_all_strands" in f1_result + assert "frac_cis_in_cis_above_convergence_dist_all_strands" in f1_result + assert "frac_total_mapped_in_cis_below_convergence_dist" in f1_result + assert "frac_total_mapped_in_cis_below_convergence_dist_all_strands" in f1_result + assert "frac_total_mapped_in_cis_above_convergence_dist_all_strands" in f1_result + + assert f1_result["convergence_rel_diff_threshold"] == 0.1 + assert f1_result["convergence_dist"] == 10000 + assert f1_result["strands_w_max_convergence_dist"] == "++" + + + # f2_result = result["f2"] + # assert "convergence_dist" in f2_result + # assert "strands_w_max_convergence_dist" in f2_result + # assert "convergence_rel_diff_threshold" in f2_result + # Add more assertions for f2_result as needed \ No newline at end of file
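For reference, a short usage sketch of the nested-dict helpers introduced in
patch 02; the values are made up, and it assumes this branch of pairtools is
importable:

    from pairtools.lib.stats import (
        flat_dict_to_nested,
        nested_dict_to_flat,
        swap_levels_nested_dict,
    )

    flat = {"summary/frac_dups": 0.1, "summary/frac_cis": 0.6, "pair_types/UU": 100}
    nested = flat_dict_to_nested(flat, sep="/")
    # {'summary': {'frac_dups': 0.1, 'frac_cis': 0.6}, 'pair_types': {'UU': 100}}

    # flattening round-trips, with either string or tuple keys:
    assert nested_dict_to_flat(nested, sep="/") == flat
    assert nested_dict_to_flat(nested, tuple_keys=True) == {
        ("summary", "frac_dups"): 0.1,
        ("summary", "frac_cis"): 0.6,
        ("pair_types", "UU"): 100,
    }

    # swapping levels reorders the hierarchy, e.g. strands-within-bins
    # becomes bins-within-strands:
    assert swap_levels_nested_dict({"++": {1: 5}, "--": {1: 7}}, 0, 1) == {
        1: {"++": 5, "--": 7}
    }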