From 8856b02053e3a10f27dc2e43aaf595653a53bc36 Mon Sep 17 00:00:00 2001 From: David Date: Thu, 15 Aug 2024 21:42:22 +0200 Subject: [PATCH] Transform x for power polynomials to [-1,1]; change fake uncertainties; disable swapping bins in case abcd-x axis is not integrated (for mt plotting in CI) --- scripts/combine/setupCombine.py | 32 +++++++------------------------- wremnants/histselections.py | 17 +++++++++++------ wremnants/regression.py | 21 ++++++++++++++------- 3 files changed, 32 insertions(+), 38 deletions(-) diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index 3dd478015..2c883f276 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -130,7 +130,7 @@ def make_parser(parser=None): parser.add_argument("--pseudoDataIdxs", type=str, nargs="+", default=[None], help="Variation indices to use as pseudodata for each of the histograms") parser.add_argument("--pseudoDataFile", type=str, help="Input file for pseudodata (if it should be read from a different file)", default=None) parser.add_argument("--pseudoDataProcsRegexp", type=str, default=".*", help="Regular expression for processes taken from pseudodata file (all other processes are automatically got from the nominal file). 
Data is excluded automatically as usual") - parser.add_argument("--pseudoDataFakes", type=str, nargs="+", choices=["truthMC", "closure", "simple", "extrapolate", "extended1D", "extended2D", "dataClosure", "mcClosure", "simple-binned", "extended1D-fakerate"], + parser.add_argument("--pseudoDataFakes", type=str, nargs="+", choices=["truthMC", "closure", "simple", "extrapolate", "extended1D", "extended2D", "dataClosure", "mcClosure", "simple-binned", "extended1D-binned", "extended1D-fakerate"], help="Pseudodata for fakes are using QCD MC (closure), or different estimation methods (simple, extended1D, extended2D)") parser.add_argument("--addTauToSignal", action='store_true', help="Events from the same process but from tau final states are added to the signal") parser.add_argument("--noPDFandQCDtheorySystOnSignal", action='store_true', help="Removes PDF and theory uncertainties on signal processes") @@ -764,30 +764,12 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): action=fakeselector.get_hist, systAxes=[f"_{x}" for x in syst_axes if x in args.fakerateAxes]+["_param", "downUpVar"]) if args.fakeSmoothingMode in ["hybrid", "full"]: - if False: - # these uncertainties are covered by the binByBin stat uncertainties - # for the moment - subgroup = f"{cardTool.getFakeName()}Smoothing" - cardTool.addSystematic(**info, - rename=subgroup, - splitGroup = {subgroup: f".*", "experiment": ".*"}, - systNamePrepend=subgroup, - actionArgs=dict(variations_smoothing=True), - ) - - subgroup = f"{cardTool.getFakeName()}SmoothingSyst" - cardTool.addSystematic(name=inputBaseName, - group="Fake", - processes=cardTool.getFakeName(), - noConstraint=False, - mirror=True, - scale=1., - applySelection=False, - action = fakeselector.get_smoothing_syst, - rename=subgroup, - splitGroup = {subgroup: f".*", "experiment": ".*"}, - systNamePrepend=subgroup, - systAxes = ["var"], + subgroup = f"{cardTool.getFakeName()}Smoothing" + cardTool.addSystematic(**info, + rename=subgroup, + 
splitGroup = {subgroup: f".*", "experiment": ".*"}, + systNamePrepend=subgroup, + actionArgs=dict(variations_smoothing=True), ) if args.fakeSmoothingMode in ["fakerate", "hybrid"]: diff --git a/wremnants/histselections.py b/wremnants/histselections.py index aab5e5b6d..ab4bbe29b 100644 --- a/wremnants/histselections.py +++ b/wremnants/histselections.py @@ -390,7 +390,9 @@ def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothi h = self.transfer_variances(h, set_nominal=is_nominal) y_frf, y_frf_var = self.compute_fakeratefactor(h, smoothing=True, syst_variations=variations_frf) - if self.swap_regions: + if not self.integrate_x and self.swap_regions: + logger.warning(f"Regions for fakerate estimation can only be swapped if abcd-x axis is integrated") + if self.swap_regions and self.integrate_x: if type(self) == FakeSelectorSimpleABCD: # replace C region with B region hC = self.get_hist_failX_passY(h) @@ -426,9 +428,8 @@ def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothi elif variations_frf: dvar = cval[..., np.newaxis,np.newaxis] * y_frf_var[...,:,:] elif self.smoothing_mode in ["hybrid"]: - # keep statistical uncertainty from c region since we'll propagate that - # as a systematic - dvar = y_frf**2 * cvar_binned + # no bin by bin statistical uncertainty, all regions covered by smoothing + dvar = np.zeros_like(cvar) else: # only take bin by bin uncertainty from c region dvar = y_frf**2 * cvar @@ -472,7 +473,9 @@ def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow # select sideband regions ha = self.get_hist_failX_failY(hNew) - if self.swap_regions: + if not self.integrate_x and self.swap_regions: + logger.warning(f"Regions for fakerate estimation can only be swapped if abcd-x axis is integrated") + if self.swap_regions and self.integrate_x: # replace B region with C region hb = self.get_hist_passX_failY(hNew) else: hb = self.get_hist_failX_passY(hNew) @@ -817,7 +820,9 @@ def compute_fakeratefactor(self, h, 
smoothing=False, syst_variations=False, flow hb = hNew[{self.name_x: self.sel_dx, self.name_y: self.sel_y}] hbx = hNew[{self.name_x: self.sel_d2x, self.name_y: self.sel_y}] - if self.swap_regions: + if not self.integrate_x and self.swap_regions: + logger.warning(f"Regions for fakerate estimation can only be swapped if abcd-x axis is integrated") + if self.swap_regions and self.integrate_x: # replace Ax region with C region hax = self.get_hist_passX_failY(hNew) else: diff --git a/wremnants/regression.py b/wremnants/regression.py index 9ac2d2f4e..c3c5c0cd9 100644 --- a/wremnants/regression.py +++ b/wremnants/regression.py @@ -20,11 +20,11 @@ def compute_chi2(y, y_pred, w=None, nparams=1): ndf = y.shape[-1] - nparams ndf_total = y.size - chi2.size*nparams - logger.info(f"Total chi2/ndf = {chi2_total}/{ndf_total} = {chi2_total/ndf_total} (p = {stats.chi2.sf(chi2_total, ndf_total)})") - logger.info(f"Min chi2/ndf = {chi2.min()}/{ndf} (p = {stats.chi2.sf(chi2.min(), ndf)})") - logger.info(f"Max chi2/ndf = {chi2.max()}/{ndf} (p = {stats.chi2.sf(chi2.max(), ndf)})") - logger.info(f"Mean chi2 = {chi2.mean()}") - logger.info(f"Std chi2 = {chi2.std()}") + logger.debug(f"Total chi2/ndf = {chi2_total}/{ndf_total} = {chi2_total/ndf_total} (p = {stats.chi2.sf(chi2_total, ndf_total)})") + logger.debug(f"Min chi2/ndf = {chi2.min()}/{ndf} (p = {stats.chi2.sf(chi2.min(), ndf)})") + logger.debug(f"Max chi2/ndf = {chi2.max()}/{ndf} (p = {stats.chi2.sf(chi2.max(), ndf)})") + logger.debug(f"Mean chi2 = {chi2.mean()}") + logger.debug(f"Std chi2 = {chi2.std()}") return chi2, ndf @@ -225,6 +225,8 @@ def transform_bernstein(x, min_x, max_x, cap_x=False): raise RuntimeError(f"All values need to be within [0,1] but {np.sum(x < 0)} values smaller 0 ({x[x < 0]}) and {np.sum(x > 1)} larger 1 ({x[x > 1]}) found after transformation with xmin={xmin} and xmax={xmax}") return x +def transform_power(x, *args, **kwargs): + return transform_bernstein(x, *args, **kwargs) * 2 - 1 class Regressor(object): 
def __init__( @@ -259,8 +261,10 @@ def __init__( self.external_cov = None def transform_x(self, x): - # if self.polynomial in ["bernstein", "monotonic"]: - x = transform_bernstein(x, self.min_x, self.max_x, self.cap_x) + if self.polynomial in ["bernstein", "monotonic"]: + x = transform_bernstein(x, self.min_x, self.max_x, self.cap_x) + else: + x = transform_power(x, self.min_x, self.max_x, self.cap_x) return x def solve(self, x, y, w, chi2_info=True): @@ -358,6 +362,9 @@ def transform_x(self, x1, x2): if self.polynomial in ["bernstein", "monotonic"]: x1 = transform_bernstein(x1, self.min_x[0], self.max_x[0], self.cap_x[0]) x2 = transform_bernstein(x2, self.min_x[1], self.max_x[1], self.cap_x[1]) + else: + x1 = transform_power(x1, self.min_x[0], self.max_x[0], self.cap_x[0]) + x2 = transform_power(x2, self.min_x[1], self.max_x[1], self.cap_x[1]) return x1, x2 def solve(self, x1, x2, y, w, flatten=False):