From 82124d6e353b13de2e7ba858202bde40832e2237 Mon Sep 17 00:00:00 2001 From: Phlya Date: Fri, 27 Oct 2023 14:56:31 +0200 Subject: [PATCH 1/4] first try --- cooltools/api/expected.py | 38 ++++++++++++++++++++++------- tests/test_expected.py | 50 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 79 insertions(+), 9 deletions(-) diff --git a/cooltools/api/expected.py b/cooltools/api/expected.py index 40273f22..737e790c 100644 --- a/cooltools/api/expected.py +++ b/cooltools/api/expected.py @@ -302,7 +302,9 @@ def make_diag_tables(clr, regions, regions2=None, clr_weight_name="weight"): regions2 = bioframe.make_viewframe( regions2, check_bounds=clr.chromsizes ).to_numpy() - except ValueError: # If there are non-unique entries in regions1/2, possible only for asymmetric expected: + except ( + ValueError + ): # If there are non-unique entries in regions1/2, possible only for asymmetric expected: regions = pd.concat( [ bioframe.make_viewframe([region], check_bounds=clr.chromsizes) @@ -1017,25 +1019,43 @@ def expected_cis( # additional smoothing and aggregating options would add columns only, not replace them if smooth: + if clr_weight_name is None: + # result["count.avg"] = result["count.sum"] / result["n_valid"] + cols = { + "dist": "dist", + "n_pixels": _NUM_VALID, + "n_contacts": "count.sum", + "contact_freq": "count.avg", + "smooth_suffix": ".smoothed", + } + else: + cols = expected_smoothing.DEFAULT_CVD_COLS + result_smooth = expected_smoothing.agg_smooth_cvd( - result, - sigma_log10=smooth_sigma, + result, sigma_log10=smooth_sigma, cols=cols ) # add smoothed columns to the result (only balanced for now) result = result.merge( - result_smooth[["balanced.avg.smoothed", _DIST]], + result_smooth[[cols["contact_freq"] + cols["smooth_suffix"], _DIST]], on=[_REGION1, _REGION2, _DIST], how="left", ) if aggregate_smoothed: result_smooth_agg = expected_smoothing.agg_smooth_cvd( - result, - groupby=None, - sigma_log10=smooth_sigma, - ).rename(columns={"balanced.avg.smoothed": "balanced.avg.smoothed.agg"}) + result, groupby=None, sigma_log10=smooth_sigma, cols=cols + ).rename( + columns={ + cols["contact_freq"] + + cols["smooth_suffix"]: cols["contact_freq"] + + cols["smooth_suffix"] + + ".agg" + } + ) # add smoothed columns to the result result = result.merge( - result_smooth_agg[["balanced.avg.smoothed.agg", _DIST]], + result_smooth_agg[ + [cols["contact_freq"] + cols["smooth_suffix"] + ".agg", _DIST] + ], on=[ _DIST, ], diff --git a/tests/test_expected.py b/tests/test_expected.py index 82964585..efe7fd54 100644 --- a/tests/test_expected.py +++ b/tests/test_expected.py @@ -476,6 +476,56 @@ def test_expected_smooth_cli(request, tmpdir): assert _delta_smooth_agg < 0.02 +def test_expected_smooth_nobalance_cli(request, tmpdir): + # CLI compute-expected for chrom-wide cis-data + clr_weight_name = None + in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") + out_cis_expected = op.join(tmpdir, "cis.exp.tsv") + runner = CliRunner() + result = runner.invoke( + cli, + [ + "expected-cis", + "--smooth", + "--aggregate-smoothed", + "--clr-weight-name", + clr_weight_name, + "-o", + out_cis_expected, + in_cool, + ], + ) + assert result.exit_code == 0 + clr = cooler.Cooler(in_cool) + cis_expected = pd.read_table(out_cis_expected, sep="\t") + print(cis_expected) + grouped = cis_expected.groupby(["region1", "region2"]) + # full chromosomes in this example: + for (chrom1, chrom2), group in grouped: + assert chrom1 == chrom2 + # work only on "large" crhomosomes, skip chrM and such + if chrom1 not in ["chrM", "chrY", "chrX"]: + # extract dense matrix and get desired expected: + matrix = clr.matrix(balance=clr_weight_name).fetch(chrom1) + desired_expected = np.where( + group["dist"] < ignore_diags, + np.nan, # fill nan for ignored diags + _diagsum_symm_dense(matrix), + ) + # do overlall tolerance instead of element by element comparison + # because of non-matching NaNs and "noiseness" of the non-smoothed + # expected + _delta_smooth = np.nanmax( + np.abs(group["count.avg.smoothed"].to_numpy() - desired_expected) + ) + _delta_smooth_agg = np.nanmax( + np.abs(group["count.avg.smoothed.agg"].to_numpy() - desired_expected) + ) + # some made up tolerances, that work for this example + assert _delta_smooth < 0.01 + assert _delta_smooth_agg < 0.02 + + def test_trans_expected_view_cli(request, tmpdir): # CLI compute expected for cis-data with arbitrary view # which cannot overlap. But it is symmetrical cis-case. From 3560515d7e38c7e5cec6d406962735463b951854 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 31 Oct 2023 15:38:11 +0100 Subject: [PATCH 2/4] test fix --- tests/test_expected.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/test_expected.py b/tests/test_expected.py index efe7fd54..cfb17189 100644 --- a/tests/test_expected.py +++ b/tests/test_expected.py @@ -488,8 +488,7 @@ def test_expected_smooth_nobalance_cli(request, tmpdir): "expected-cis", "--smooth", "--aggregate-smoothed", - "--clr-weight-name", - clr_weight_name, + "--clr-weight-name ''", "-o", out_cis_expected, in_cool, @@ -498,7 +497,6 @@ def test_expected_smooth_nobalance_cli(request, tmpdir): assert result.exit_code == 0 clr = cooler.Cooler(in_cool) cis_expected = pd.read_table(out_cis_expected, sep="\t") - print(cis_expected) grouped = cis_expected.groupby(["region1", "region2"]) # full chromosomes in this example: for (chrom1, chrom2), group in grouped: From f9e017f2bf766cfa98bae421aa379dc04a065d35 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 7 Nov 2023 14:03:32 +0100 Subject: [PATCH 3/4] Fix test --- tests/test_expected.py | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/tests/test_expected.py b/tests/test_expected.py index cfb17189..6c57d607 100644 --- a/tests/test_expected.py +++ b/tests/test_expected.py @@ -478,9 +478,8 @@ def test_expected_smooth_cli(request, tmpdir): def test_expected_smooth_nobalance_cli(request, tmpdir): # CLI compute-expected for chrom-wide cis-data - clr_weight_name = None in_cool = op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool") - out_cis_expected = op.join(tmpdir, "cis.exp.tsv") + out_cis_expected = op.join(tmpdir, "cis_unb.exp.tsv") runner = CliRunner() result = runner.invoke( cli, @@ -488,7 +487,8 @@ def test_expected_smooth_nobalance_cli(request, tmpdir): "expected-cis", "--smooth", "--aggregate-smoothed", - "--clr-weight-name ''", + "--clr-weight-name", + "", "-o", out_cis_expected, in_cool, @@ -504,24 +504,34 @@ def test_expected_smooth_nobalance_cli(request, tmpdir): # work only on "large" crhomosomes, skip chrM and such if chrom1 not in ["chrM", "chrY", "chrX"]: # extract dense matrix and get desired expected: - matrix = clr.matrix(balance=clr_weight_name).fetch(chrom1) + matrix = clr.matrix(balance=False).fetch(chrom1) desired_expected = np.where( group["dist"] < ignore_diags, np.nan, # fill nan for ignored diags _diagsum_symm_dense(matrix), ) - # do overlall tolerance instead of element by element comparison + # do overall tolerance instead of element by element comparison # because of non-matching NaNs and "noiseness" of the non-smoothed # expected _delta_smooth = np.nanmax( - np.abs(group["count.avg.smoothed"].to_numpy() - desired_expected) + np.abs( + np.nan_to_num( + group["count.avg.smoothed"].to_numpy() - desired_expected, + posinf=0, + ), + ), ) _delta_smooth_agg = np.nanmax( - np.abs(group["count.avg.smoothed.agg"].to_numpy() - desired_expected) + np.abs( + np.nan_to_num( + group["count.avg.smoothed.agg"].to_numpy() - desired_expected, + posinf=0, + ), + ), ) # some made up tolerances, that work for this example - assert _delta_smooth < 0.01 - assert _delta_smooth_agg < 0.02 + assert _delta_smooth < 2000 + assert _delta_smooth_agg < 4000 def test_trans_expected_view_cli(request, tmpdir): From 5ee55815a2a17bc7af09f38862202c84b2d18a06 Mon Sep 17 00:00:00 2001 From: Phlya Date: Tue, 7 Nov 2023 14:06:56 +0100 Subject: [PATCH 4/4] Simplify test --- tests/test_expected.py | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/tests/test_expected.py b/tests/test_expected.py index 6c57d607..feb94865 100644 --- a/tests/test_expected.py +++ b/tests/test_expected.py @@ -514,20 +514,10 @@ def test_expected_smooth_nobalance_cli(request, tmpdir): # because of non-matching NaNs and "noiseness" of the non-smoothed # expected _delta_smooth = np.nanmax( - np.abs( - np.nan_to_num( - group["count.avg.smoothed"].to_numpy() - desired_expected, - posinf=0, - ), - ), + np.abs(group["count.avg.smoothed"].to_numpy() - desired_expected) ) _delta_smooth_agg = np.nanmax( - np.abs( - np.nan_to_num( - group["count.avg.smoothed.agg"].to_numpy() - desired_expected, - posinf=0, - ), - ), + np.abs(group["count.avg.smoothed.agg"].to_numpy() - desired_expected) ) # some made up tolerances, that work for this example assert _delta_smooth < 2000