diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py index 40273f22..737e790c 100644 --- a/cooltools/api/expected.py +++ b/cooltools/api/expected.py @@ -302,7 +302,9 @@ def make_diag_tables(clr, regions, regions2=None, clr_weight_name="weight"): regions2 = bioframe.make_viewframe( regions2, check_bounds=clr.chromsizes ).to_numpy() - except ValueError: # If there are non-unique entries in regions1/2, possible only for asymmetric expected: + except ( + ValueError + ): # If there are non-unique entries in regions1/2, possible only for asymmetric expected: regions = pd.concat( [ bioframe.make_viewframe([region], check_bounds=clr.chromsizes) @@ -1017,25 +1019,43 @@ def expected_cis( # additional smoothing and aggregating options would add columns only, not replace them if smooth: + if clr_weight_name is None: + # result["count.avg"] = result["count.sum"] / result["n_valid"] + cols = { + "dist": "dist", + "n_pixels": _NUM_VALID, + "n_contacts": "count.sum", + "contact_freq": "count.avg", + "smooth_suffix": ".smoothed", + } + else: + cols = expected_smoothing.DEFAULT_CVD_COLS + result_smooth = expected_smoothing.agg_smooth_cvd( - result, - sigma_log10=smooth_sigma, + result, sigma_log10=smooth_sigma, cols=cols ) # add smoothed columns to the result (only balanced for now) result = result.merge( - result_smooth[["balanced.avg.smoothed", _DIST]], + result_smooth[[cols["contact_freq"] + cols["smooth_suffix"], _DIST]], on=[_REGION1, _REGION2, _DIST], how="left", ) if aggregate_smoothed: result_smooth_agg = expected_smoothing.agg_smooth_cvd( - result, - groupby=None, - sigma_log10=smooth_sigma, - ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg"}) + result, groupby=None, sigma_log10=smooth_sigma, cols=cols + ).rename( + columns={ + cols["contact_freq"] + + cols["smooth_suffix"]: cols["contact_freq"] + + cols["smooth_suffix"] + + ".agg" + } + ) # add smoothed columns to the result result = result.merge( - result_smooth_agg[["balanced.avg.smoothed.agg", _DIST]], + result_smooth_agg[ + [cols["contact_freq"] + cols["smooth_suffix"] + ".agg", _DIST] + ], on=[ _DIST, ], diff --git a/tests/test_expected.py b/tests/test_expected.py index 82964585..feb94865 100644 --- a/tests/test_expected.py +++ b/tests/test_expected.py @@ -476,6 +476,54 @@ def test_expected_smooth_cli(request, tmpdir): assert _delta_smooth_agg < 0.02 +def test_expected_smooth_nobalance_cli(request, tmpdir): + # CLI compute-expected for chrom-wide cis-data + in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") + out_cis_expected = op.join(tmpdir, "cis_unb.exp.tsv") + runner = CliRunner() + result = runner.invoke( + cli, + [ + "expected-cis", + "--smooth", + "--aggregate-smoothed", + "--clr-weight-name", + "", + "-o", + out_cis_expected, + in_cool, + ], + ) + assert result.exit_code == 0 + clr = cooler.Cooler(in_cool) + cis_expected = pd.read_table(out_cis_expected, sep="\t") + grouped = cis_expected.groupby(["region1", "region2"]) + # full chromosomes in this example: + for (chrom1, chrom2), group in grouped: + assert chrom1 == chrom2 + # work only on "large" crhomosomes, skip chrM and such + if chrom1 not in ["chrM", "chrY", "chrX"]: + # extract dense matrix and get desired expected: + matrix = clr.matrix(balance=False).fetch(chrom1) + desired_expected = np.where( + group["dist"] < ignore_diags, + np.nan, # fill nan for ignored diags + _diagsum_symm_dense(matrix), + ) + # do overall tolerance instead of element by element comparison + # because of non-matching NaNs and "noiseness" of the non-smoothed + # expected + _delta_smooth = np.nanmax( + np.abs(group["count.avg.smoothed"].to_numpy() - desired_expected) + ) + _delta_smooth_agg = np.nanmax( + np.abs(group["count.avg.smoothed.agg"].to_numpy() - desired_expected) + ) + # some made up tolerances, that work for this example + assert _delta_smooth < 2000 + assert _delta_smooth_agg < 4000 + + def test_trans_expected_view_cli(request, tmpdir): # CLI compute expected for cis-data with arbitrary view # which cannot overlap. But it is symmetrical cis-case.