From d194033cc758d53cdb2aa391d0a8ee91cf26bb45 Mon Sep 17 00:00:00 2001 From: David Date: Mon, 12 Aug 2024 15:06:39 +0200 Subject: [PATCH 01/19] Remove QCD MC nonclosure uncertainty and add explicit parameter variation instead --- scripts/combine/setupCombine.py | 30 ++++++++-------- scripts/plotting/controlPlotsHDF5.py | 21 +++++++++--- scripts/plotting/postfitPlots.py | 5 ++- wremnants/CardTool.py | 40 +++++++++++----------- wremnants/HDF5Writer.py | 2 +- wremnants/histselections.py | 51 ++++++++++++++-------------- 6 files changed, 80 insertions(+), 69 deletions(-) diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index c5100694d..e537c1ec0 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -744,17 +744,9 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): cardTool.addLnNSystematic("lumi", processes=['MCnoQCD'], size=1.017 if lowPU else 1.012, group="luminosity", splitGroup = {"experiment": ".*"},) if cardTool.getFakeName() != "QCD" and cardTool.getFakeName() in datagroups.groups.keys() and not xnorm and (args.fakeSmoothingMode != "binned" or (args.fakeEstimation in ["extrapolate"] and "mt" in fitvar)): - - # add stat uncertainty from QCD MC nonclosure to smoothing parameter covariance - hist_fake = datagroups.results["QCDmuEnrichPt15PostVFP"]["output"]["unweighted"].get() fakeselector = cardTool.datagroups.groups[cardTool.getFakeName()].histselector - _0, _1, params, cov, _chi2, _ndf = fakeselector.calculate_fullABCD_smoothed(hist_fake, auxiliary_info=True) - _0, _1, params_d, cov_d, _chi2_d, _ndf_d = fakeselector.calculate_fullABCD_smoothed(hist_fake, auxiliary_info=True, signal_region=True) - - fakeselector.external_cov = cov + cov_d - syst_axes = ["eta", "charge"] if (args.fakeSmoothingMode != "binned" or args.fakeEstimation not in ["extrapolate"]) else ["eta", "pt", "charge"] info=dict( name=inputBaseName, @@ -783,14 +775,21 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): ) if args.fakeSmoothingMode == "full": - # add nominal nonclosure of QCD MC as single systematic + # add systematic of explicit parameter variation def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): - # apply QCD MC nonclosure by adding parameters (assumes log space, e.g. in full smoothing) - fakeselector.external_params = params_d - params + # apply varyation by adding parameter value (assumes log space, e.g. 
in full smoothing) + fakeselector.external_params = np.zeros(4) + fakeselector.external_params[1] = 0.5 hvar = fakeselector.get_hist(h, *args, **kwargs) # reset external parameters fakeselector.external_params = None + hnom = fakeselector.get_hist(h, *args, **kwargs) + + # normalize variation histogram to have the same integral as nominal + scale = hnom.sum(flow=True).value / hvar.sum(flow=True).value + hvar = hh.scaleHist(hvar, scale) + if len(axesToDecorrNames) == 0: # inclusive hvar = hist.Hist( @@ -799,13 +798,12 @@ def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): data = hvar.values(flow=True)[...,np.newaxis] ) else: - hnom = fakeselector.get_hist(h, *args, **kwargs) hvar = syst_tools.decorrelateByAxes(hvar, hnom, axesToDecorrNames) return hvar - for axesToDecorrNames in [[], ["eta"]]: - subgroup = f"{cardTool.getFakeName()}QCDMCNonclosure" + for axesToDecorrNames in [[], ]:#["eta"]]: + subgroup = f"{cardTool.getFakeName()}Param{param}" cardTool.addSystematic( name=inputBaseName, group="Fake", @@ -820,7 +818,7 @@ def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): action=fake_nonclosure, actionArgs=dict(axesToDecorrNames=axesToDecorrNames), systAxes=["var"] if len(axesToDecorrNames)==0 else [f"{n}_decorr" for n in axesToDecorrNames], - ) + ) if not args.noEfficiencyUnc: @@ -895,7 +893,7 @@ def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): ) if wmass or wlike_vetoValidation: useGlobalOrTrackerVeto = input_tools.args_from_metadata(cardTool, "useGlobalOrTrackerVeto") - useRefinedVeto = input_tools.args_from_metadata(cardTool, "useRefinedVeto") + useRefinedVeto = False# input_tools.args_from_metadata(cardTool, "useRefinedVeto") allEffTnP_veto = ["effStatTnP_veto_sf", "effSystTnP_veto"] for name in allEffTnP_veto: if "Syst" in name: diff --git a/scripts/plotting/controlPlotsHDF5.py b/scripts/plotting/controlPlotsHDF5.py index d504cc919..424a99dd6 100644 --- a/scripts/plotting/controlPlotsHDF5.py +++ b/scripts/plotting/controlPlotsHDF5.py @@ -35,6 +35,7 @@ help="Select specific bins of the selectionAxis e.g. 
'0 1' to select the first bin of the first axis and second bin of the second axis") parser.add_argument("--hists", type=str, nargs='*', default=None, help="List of hists to plot; dash separated for unrolled hists") +parser.add_argument("--normToData", action='store_true', help="Normalize MC to data") # variations parser.add_argument("--varName", type=str, nargs='*', default=[], help="Name of variation hist") @@ -123,6 +124,13 @@ def make_plot(hists_proc, hist_data, hists_syst_up, hists_syst_dn, axes_names, else: hists_pred = [hh.sumHists(h_stack)] + if args.normToData: + scale = h_data.values().sum()/hists_pred[0].values().sum() + h_stack = [hh.scaleHist(h, scale) for h in h_stack] + hists_pred[0] = hh.scaleHist(hists_pred[0], scale) + hists_syst_up = [hh.scaleHist(h, scale) for h in hists_syst_up] + hists_syst_dn = [hh.scaleHist(h, scale) for h in hists_syst_dn] + # loop over all processes if plots for each process is requested, or inclusive otherwise for i, h_pred in enumerate(hists_pred): @@ -252,11 +260,14 @@ def make_plot(hists_proc, hist_data, hists_syst_up, hists_syst_dn, axes_names, if selections is not None: for i, (key, idx) in enumerate(selections.items()): lo, hi = selection_edges[i] - label = styles.xlabels[key].replace(" (GeV)","") - if lo != None: - label = f"{lo} < {label}" - if hi != None: - label = f"{label} < {hi}" + if key=="charge": + label = f"charge = {'-1' if i==0 else +1}" + else: + label = styles.xlabels[key].replace(" (GeV)","") + if lo != None: + label = f"{lo} < {label}" + if hi != None: + label = f"{label} < {hi}" ax1.text(0.05, 0.96-i*0.08, label, horizontalalignment='left', verticalalignment='top', transform=ax1.transAxes, fontsize=20*args.scaleleg*scale) diff --git a/scripts/plotting/postfitPlots.py b/scripts/plotting/postfitPlots.py index 7c29635f1..1743ecb72 100644 --- a/scripts/plotting/postfitPlots.py +++ b/scripts/plotting/postfitPlots.py @@ -105,7 +105,10 @@ def make_plot(h_data, h_inclusive, h_stack, axes, colors=None, labels=None, suff h_inclusive = hh.scaleHist(h_inclusive, scale) axis_name = "_".join([a for a in axes_names]) - xlabel=f"{'-'.join([styles.xlabels.get(s,s).replace('(GeV)','') for s in axes_names])} bin" + if len(axes_names) == 1: + xlabel=styles.xlabels.get(axes_names[0]) + else: + xlabel=f"{'-'.join([styles.xlabels.get(s,s).replace('(GeV)','') for s in axes_names])} bin" if ratio: fig, ax1, ax2 = plot_tools.figureWithRatio(h_data, xlabel, ylabel, args.ylim, f"{args.dataName}/Pred.", args.rrange) else: diff --git a/wremnants/CardTool.py b/wremnants/CardTool.py index 8d7c3eceb..abc78075c 100644 --- a/wremnants/CardTool.py +++ b/wremnants/CardTool.py @@ -828,11 +828,9 @@ def loadPseudodata(self, forceNonzero=False): continue if pseudoData in ["dataClosure", "mcClosure"]: - # build the pseudodata from the nominal model - hdata = hh.sumHists([self.datagroups.getDatagroups()[x].hists[self.nominalName] for x in self.datagroups.groups.keys() if x != self.getDataName()]) - hdatas.append(hdata) + # build the pseudodata by adding the nonclosure - # build the model by adding the nonclosure + # build the pseudodata by adding the nonclosure # first load the nonclosure if pseudoData == "dataClosure": datagroups.loadHistsForDatagroups( @@ -847,31 +845,31 @@ def loadPseudodata(self, forceNonzero=False): elif pseudoData == "mcClosure": hist_fake = datagroups.results["QCDmuEnrichPt15PostVFP"]["output"]["unweighted"].get() - fakeselector = sel.FakeSelector1DExtendedABCD( - hist_fake, - fakerate_axes=self.datagroups.fakerate_axes, - 
smoothing_mode="full", - smoothing_order_fakerate=3, - integrate_x=True, - ) + fakeselector = self.datagroups.getDatagroups()[self.getFakeName()].histselector - _0, _1, params, cov, _chi2, _ndf = fakeselector.calculate_fullABCD_smoothed(hist_fake, auxiliary_info=True) - hist_fake = hh.scaleHist(hist_fake, 1./0.85) _0, _1, params_d, cov_d, _chi2_d, _ndf_d = fakeselector.calculate_fullABCD_smoothed(hist_fake, auxiliary_info=True, signal_region=True) + hist_fake = hh.scaleHist(hist_fake, fakeselector.global_scalefactor) + _0, _1, params, cov, _chi2, _ndf = fakeselector.calculate_fullABCD_smoothed(hist_fake, auxiliary_info=True) - cov = cov + cov_d - params = params_d - params - - self.datagroups.getDatagroups()[self.getFakeName()].histselector.external_params = params - self.datagroups.getDatagroups()[self.getFakeName()].histselector.external_cov = cov - + # add the nonclosure by adding the difference of the parameters + fakeselector.external_params = params_d - params + # load the pseudodata including the nonclosure self.datagroups.loadHistsForDatagroups( - baseName=self.nominalName, syst=self.nominalName, - procsToRead=[self.getFakeName()], + baseName=self.nominalName, syst=self.nominalName, label=pseudoData, + procsToRead=[x for x in self.datagroups.groups.keys() if x != self.getDataName()], scaleToNewLumi=self.lumiScale, lumiScaleVarianceLinearly=self.lumiScaleVarianceLinearly, forceNonzero=forceNonzero, sumFakesPartial=not self.simultaneousABCD) + # adding the pseudodata + hdata = hh.sumHists([self.datagroups.getDatagroups()[x].hists[pseudoData] for x in self.datagroups.groups.keys() if x != self.getDataName()]) + hdatas.append(hdata) + + # remove the parameter offset again + fakeselector.external_params = None + # add the covariance matrix from the nonclosure to the model + fakeselector.external_cov = cov + cov_d + continue elif pseudoData.split("-")[0] in ["simple", "extended1D", "extended2D"]: diff --git a/wremnants/HDF5Writer.py b/wremnants/HDF5Writer.py index 744667b83..3904e4fbf 100644 --- a/wremnants/HDF5Writer.py +++ b/wremnants/HDF5Writer.py @@ -250,7 +250,7 @@ def write(self, logger.debug(f"Delete pseudodata histogram {pseudo}") del dg.groups[proc].hists[pseudo] - # nominal predictions (after pseudodata because some pseudodata changes the nominal model) + # nominal predictions (after pseudodata because some pseudodata changes the nominal model and/or its uncertainties) for proc in procs_chan: logger.debug(f"Now in channel {chan} at process {proc}") diff --git a/wremnants/histselections.py b/wremnants/histselections.py index 532094dc2..12066e1ff 100644 --- a/wremnants/histselections.py +++ b/wremnants/histselections.py @@ -672,31 +672,32 @@ def smoothen(self, h, x, y, y_var, syst_variations=False, auxiliary_info=False, params = np.sum(params*w_region[*[np.newaxis]*(params.ndim-2), slice(None), np.newaxis], axis=-2) cov = np.sum(cov*w_region[*[np.newaxis]*(params.ndim-2), slice(None), np.newaxis, np.newaxis]**2, axis=-3) - # performing a nnls to enforce monotonicity for the signal region (using generalized least squares) - Y = params - W = np.linalg.inv(cov.reshape(-1,*cov.shape[-2:])) - W = W.reshape((*cov.shape[:-2],*W.shape[-2:])) - WY = np.einsum('...ij,...j->...i', W, Y) - # the design matrix X is just a 1xn unity matrix and can thus be ignored - XTWY = WY - XTWX = W - - orig_shape = XTWY.shape - nBins = np.prod(orig_shape[:-1]) - XTWY_flat = XTWY.reshape(nBins, XTWY.shape[-1]) - XTWX_flat = XTWX.reshape(nBins, XTWX.shape[-2], XTWX.shape[-1]) - params = [nnls(xtwx, xtwy)[0] 
for xtwx, xtwy in zip(XTWX_flat, XTWY_flat)] - params = np.reshape(params, orig_shape) - - # allow the integration constaint to be negative - if np.sum(params[...,0]==0) > 0: - mask = params[...,0]==0 - mask_flat = mask.flatten() - w_flip = np.ones(XTWY.shape[-1]) - w_flip[0] = -1 - params_negative = [nnls(xtx, xty)[0] for xtx, xty in zip(XTWX_flat[mask_flat], XTWY_flat[mask_flat] * w_flip)] - params[mask] = np.array(params_negative) * w_flip - logger.info(f"Found {mask.sum()} parameters that are excluded in nnls and negative") + if self.polynomial == "monotonic": + # performing a nnls to enforce monotonicity for the signal region (using generalized least squares) + Y = params + W = np.linalg.inv(cov.reshape(-1,*cov.shape[-2:])) + W = W.reshape((*cov.shape[:-2],*W.shape[-2:])) + WY = np.einsum('...ij,...j->...i', W, Y) + # the design matrix X is just a 1xn unity matrix and can thus be ignored + XTWY = WY + XTWX = W + + orig_shape = XTWY.shape + nBins = np.prod(orig_shape[:-1]) + XTWY_flat = XTWY.reshape(nBins, XTWY.shape[-1]) + XTWX_flat = XTWX.reshape(nBins, XTWX.shape[-2], XTWX.shape[-1]) + params = [nnls(xtwx, xtwy)[0] for xtwx, xtwy in zip(XTWX_flat, XTWY_flat)] + params = np.reshape(params, orig_shape) + + # allow the integration constaint to be negative + if np.sum(params[...,0]==0) > 0: + mask = params[...,0]==0 + mask_flat = mask.flatten() + w_flip = np.ones(XTWY.shape[-1]) + w_flip[0] = -1 + params_negative = [nnls(xtx, xty)[0] for xtx, xty in zip(XTWX_flat[mask_flat], XTWY_flat[mask_flat] * w_flip)] + params[mask] = np.array(params_negative) * w_flip + logger.info(f"Found {mask.sum()} parameters that are excluded in nnls and negative") if self.external_cov is not None and syst_variations: From f691be4cce8ac160ab2c90c11466b9b582011d2c Mon Sep 17 00:00:00 2001 From: David Date: Mon, 12 Aug 2024 16:14:18 +0200 Subject: [PATCH 02/19] Small fix --- scripts/combine/setupCombine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index e537c1ec0..75fe1837d 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -893,7 +893,7 @@ def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): ) if wmass or wlike_vetoValidation: useGlobalOrTrackerVeto = input_tools.args_from_metadata(cardTool, "useGlobalOrTrackerVeto") - useRefinedVeto = False# input_tools.args_from_metadata(cardTool, "useRefinedVeto") + useRefinedVeto = input_tools.args_from_metadata(cardTool, "useRefinedVeto") allEffTnP_veto = ["effStatTnP_veto_sf", "effSystTnP_veto"] for name in allEffTnP_veto: if "Syst" in name: From d4f029b2fd156204332e914ab2d8efb89148fcff Mon Sep 17 00:00:00 2001 From: David Date: Mon, 12 Aug 2024 16:24:08 +0200 Subject: [PATCH 03/19] Fix type --- scripts/combine/setupCombine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index 8ef3c511d..b7cc0f5b6 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -803,7 +803,7 @@ def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): return hvar for axesToDecorrNames in [[], ]:#["eta"]]: - subgroup = f"{cardTool.getFakeName()}Param{param}" + subgroup = f"{cardTool.getFakeName()}Param1" cardTool.addSystematic( name=inputBaseName, group="Fake", From 061112ba3bf1a66ef73883f4311a516dd1558bbb Mon Sep 17 00:00:00 2001 From: David Date: Mon, 12 Aug 2024 20:43:13 +0200 Subject: [PATCH 04/19] Normalize parameter variation for each eta-charge 
separately --- scripts/combine/setupCombine.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index b7cc0f5b6..746aa5b77 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -787,8 +787,8 @@ def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): hnom = fakeselector.get_hist(h, *args, **kwargs) # normalize variation histogram to have the same integral as nominal - scale = hnom.sum(flow=True).value / hvar.sum(flow=True).value - hvar = hh.scaleHist(hvar, scale) + hScale = hh.divideHists(hnom[{"pt":hist.sum}], hvar[{"pt":hist.sum}]) + hvar = hh.multiplyHists(hvar, hScale) if len(axesToDecorrNames) == 0: # inclusive From ee9ed8c5290dc023f76aa8ff1964e71896f6d0d5 Mon Sep 17 00:00:00 2001 From: David Date: Tue, 13 Aug 2024 21:10:56 +0200 Subject: [PATCH 05/19] Refactor histselector regression related functionality in Regressor class; support smoothing of application region for fakerate smoothing --- scripts/combine/setupCombine.py | 26 +- wremnants/datasets/datagroups.py | 2 + wremnants/histselections.py | 928 +++++++++---------------------- wremnants/regression.py | 342 ++++++++++++ 4 files changed, 636 insertions(+), 662 deletions(-) create mode 100644 wremnants/regression.py diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index 746aa5b77..5d2f6417a 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -293,7 +293,7 @@ def setup(args, inputFile, inputBaseName, inputLumiScale, fitvar, genvar=None, x datagroups.fakerate_axes=args.fakerateAxes datagroups.set_histselectors( datagroups.getNames(), inputBaseName, mode=args.fakeEstimation, - smoothing_mode=args.fakeSmoothingMode, smoothingOrderFakerate=args.fakeSmoothingOrder, + smoothing_mode=args.fakeSmoothingMode, smoothingOrderSpectrum=args.fakeSmoothingOrder, mcCorr=args.fakeMCCorr, integrate_x="mt" not in fitvar, simultaneousABCD=simultaneousABCD, forceGlobalScaleFakes=args.forceGlobalScaleFakes, @@ -743,7 +743,12 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): cardTool.addLnNSystematic("CMS_background", processes=["Other"], size=1.15, group="CMS_background", splitGroup = {"experiment": ".*"},) cardTool.addLnNSystematic("lumi", processes=['MCnoQCD'], size=1.017 if lowPU else 1.012, group="luminosity", splitGroup = {"experiment": ".*"},) - if cardTool.getFakeName() != "QCD" and cardTool.getFakeName() in datagroups.groups.keys() and not xnorm and (args.fakeSmoothingMode != "binned" or (args.fakeEstimation in ["extrapolate"] and "mt" in fitvar)): + if ( + (cardTool.getFakeName() != "QCD" or args.qcdProcessName=="QCD") + and cardTool.getFakeName() in datagroups.groups.keys() + and not xnorm + and (args.fakeSmoothingMode != "binned" or (args.fakeEstimation in ["extrapolate"] and "mt" in fitvar)) + ): fakeselector = cardTool.datagroups.groups[cardTool.getFakeName()].histselector @@ -758,13 +763,22 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): applySelection=False, # don't apply selection, all regions will be needed for the action action=fakeselector.get_hist, systAxes=[f"_{x}" for x in syst_axes if x in args.fakerateAxes]+["_param", "downUpVar"]) - subgroup = f"{cardTool.getFakeName()}Rate" + subgroup = f"{cardTool.getFakeName()}Smoothing" cardTool.addSystematic(**info, rename=subgroup, splitGroup = {subgroup: f".*", "experiment": ".*"}, systNamePrepend=subgroup, actionArgs=dict(variations_smoothing=True), ) + if 
args.fakeSmoothingMode == "fakerate": + subgroup = f"{cardTool.getFakeName()}Rate" + cardTool.addSystematic(**info, + rename=subgroup, + splitGroup = {subgroup: f".*", "experiment": ".*"}, + systNamePrepend=subgroup, + actionArgs=dict(variations_frf=True), + ) + if args.fakeEstimation in ["extended2D",] and args.fakeSmoothingMode != "full": subgroup = f"{cardTool.getFakeName()}Shape" cardTool.addSystematic(**info, @@ -778,11 +792,11 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): # add systematic of explicit parameter variation def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): # apply varyation by adding parameter value (assumes log space, e.g. in full smoothing) - fakeselector.external_params = np.zeros(4) - fakeselector.external_params[1] = 0.5 + fakeselector.spectrum_regressor.external_params = np.zeros(4) + fakeselector.spectrum_regressor.external_params[1] = 0.5 hvar = fakeselector.get_hist(h, *args, **kwargs) # reset external parameters - fakeselector.external_params = None + fakeselector.spectrum_regressor.external_params = None hnom = fakeselector.get_hist(h, *args, **kwargs) diff --git a/wremnants/datasets/datagroups.py b/wremnants/datasets/datagroups.py index 31fe52d5b..62aa56840 100644 --- a/wremnants/datasets/datagroups.py +++ b/wremnants/datasets/datagroups.py @@ -203,6 +203,7 @@ def set_histselectors( mode="extended1D", smoothing_mode="full", smoothingOrderFakerate=2, + smoothingOrderSpectrum=3, integrate_shapecorrection_x=True, # integrate the abcd x-axis or not, only relevant for extended2D simultaneousABCD=False, forceGlobalScaleFakes=None, @@ -252,6 +253,7 @@ def set_histselectors( fakerate_axes=self.fakerate_axes, smoothing_mode=smoothing_mode, smoothing_order_fakerate=smoothingOrderFakerate, + smoothing_order_spectrum=smoothingOrderSpectrum, **auxiliary_info, **kwargs ) if mode in ["simple", "extended1D", "extended2D"] and forceGlobalScaleFakes is None and (len(mcCorr)==0 or mcCorr[0] not in ["none", None]): diff --git a/wremnants/histselections.py b/wremnants/histselections.py index 12066e1ff..f25831528 100644 --- a/wremnants/histselections.py +++ b/wremnants/histselections.py @@ -2,12 +2,10 @@ import numpy as np from utilities import boostHistHelpers as hh from utilities import common, logging -from utilities.fnnls import fnnls -import scipy -from scipy.optimize import nnls -from scipy.special import comb +from scipy import interpolate +from wremnants.regression import Regressor, Regressor2D logger = logging.child_logger(__name__) @@ -21,6 +19,10 @@ "dxy":[0,0.01,0.02,0.03] } +# default abcd_variables to look for +abcd_variables = (("mt", "passMT"), ("relIso", "passIso"), ("iso", "passIso"), ("relJetLeptonDiff", "passIso"), ("dxy", "passDxy")) + + def get_selection_edges(axis_name, upper_bound=False): # returns edges from pass to fail regions [x0, x1, x2, x3] e.g. 
x=[x0,x1], dx=[x1,x2], d2x=[x2,x3] if axis_name in abcd_thresholds: @@ -35,187 +37,6 @@ def get_selection_edges(axis_name, upper_bound=False): raise RuntimeError(f"Can not find threshold for abcd axis {axis_name}") -# default abcd_variables to look for -abcd_variables = (("mt", "passMT"), ("relIso", "passIso"), ("iso", "passIso"), ("relJetLeptonDiff", "passIso"), ("dxy", "passDxy")) - -def get_parameter_eigenvectors(params, cov, sign=1, force_positive=False): - # diagonalize and get eigenvalues and eigenvectors - e, v = np.linalg.eigh(cov) # The column eigenvectors[:, i] is the normalized eigenvector corresponding to the eigenvalue eigenvalues[i], - vT = np.transpose(v, axes=(*np.arange(v.ndim-2), v.ndim-1, v.ndim-2)) # transpose v to have row eigenvectors[i, :] for easier computations - mag = np.sqrt(e)[...,np.newaxis] * vT * sign - # duplicate params vector to have duplicated rows - params_brd = np.broadcast_to(params[...,np.newaxis], mag.shape) - paramsT = np.transpose(params_brd, axes=(*np.arange(params_brd.ndim-2), params_brd.ndim-1, params_brd.ndim-2)) - mask = (paramsT + mag < 0).any(axis=-1) - if force_positive and mask.sum() > 0: - logger.info(f"Force {mask.sum()} eigenvectors to be positive") - # scaling the magnitude of the eigenvector to avoid negative coefficients - min_idxs = paramsT + mag == np.min(paramsT + mag, axis=-1)[...,np.newaxis] # find indices with minimum value of each eigenvector - min_paramsT = paramsT[min_idxs] - min_mag = mag[min_idxs] - factors = np.reshape(-1*min_paramsT/min_mag, mask.shape) - factors = np.where(mask, factors, np.ones_like(mask)) - factors = np.broadcast_to(factors[...,np.newaxis], mag.shape) - mag = factors * mag - if np.sum(mag.sum(axis=-1) == 0): - logger.warning(f"Found {np.sum(mag.sum(axis=-1) == 0)} eigenvector shifts where all coefficients are 0") - params_var = paramsT + mag - if np.sum(params_var.sum(axis=-1) == 0): - logger.warning(f"Found {np.sum(params_var.sum(axis=-1) == 0)} variations where all coefficients are 0") - return params_var - -def make_eigenvector_predictons(params, cov, func, x1, x2=None, force_positive=False): - # return alternate values i.e. 
nominal+/-variation - params_up = get_parameter_eigenvectors(params, cov, sign=1, force_positive=force_positive) - y_pred_up = func(x1, params_up) if x2 is None else func(x1, x2, params_up) - y_pred_up = np.moveaxis(y_pred_up, params.ndim-1, -1) # put parameter variations last - params_dn = get_parameter_eigenvectors(params, cov, sign=-1, force_positive=force_positive) - y_pred_dn = func(x1, params_dn) if x2 is None else func(x1, x2, params_dn) - y_pred_dn = np.moveaxis(y_pred_dn, params.ndim-1, -1) # put parameter variations last - return np.stack((y_pred_up, y_pred_dn), axis=-1) - -def poly(pol, order, order2=None): - if order2 is None: - if pol=="power": - return lambda x, n, p=1: p * x**n - elif pol=="bernstein": - return lambda x, n, p=1, o=order: p * comb(o, n) * x**n * (1 - x)**(o - n) - elif pol=="monotonic": - # integral of bernstein (see https://en.wikipedia.org/wiki/Bernstein_polynomial#Properties) - # for n>o return one additional parameter for the constant term from the integration - return lambda x, n, p=1, o=order: p*x**0 if n==0 else -1*p * 1/o * np.array( - [comb(o, k) * x**k * (1 - x)**(o - k) for k in range(n, o+1)] - ).sum(axis=0) - else: - return lambda x, x2, n, m, p=1, o1=order, o2=order2: p * poly(pol, o1)(x, n) * poly(pol, o2)(x2, m) - -def get_parameter_matrices(x, y, w, order, pol="power"): - if x.shape != y.shape: - x = np.broadcast_to(x, y.shape) - stackX=[] # parameter matrix X - stackXTY=[] # and X.T @ Y - f = poly(pol, order) - for n in range(order+1): - p = f(x, n) - stackX.append( w * p ) - stackXTY.append((w**2 * p * y).sum(axis=(-1))) - X = np.stack(stackX, axis=-1) - XTY = np.stack(stackXTY, axis=-1) - return X, XTY - -def get_parameter_matrices_from2D(x, x2, y, w, order, order2=None, pol="power", flatten=False): - x, x2 = np.broadcast_arrays(x[np.newaxis,...], x2[..., np.newaxis]) - x = np.broadcast_to(x, y.shape) - x2 = np.broadcast_to(x2, y.shape) - if order2 is None: - order2 = [0,]*(order+1) - elif type(order2) == int: - order2 = [order2,]*(order+1) - elif type(order2) == list: - if len(order2) < order+1: - order2.append([0,]*(len(order2)-order+1)) - else: - raise RuntimeError(f"Input 'order2' requires type 'None', 'int' or 'list'") - stackX=[] # parameter matrix X - stackXTY=[] # and X.T @ Y - for n in range(order+1): - f = poly(pol, order, order2[n]) - for m in range(order2[n]+1): - p = f(x, x2, n, m) - stackX.append(w * p) - stackXTY.append((w**2 * p * y).sum(axis=(-2, -1))) - X = np.stack(stackX, axis=-1) - XTY = np.stack(stackXTY, axis=-1) - if flatten: - # flatten the 2D array into 1D - newshape = (*y.shape[:-2],np.product(y.shape[-2:])) - y = y.reshape(newshape) - X = X.reshape(*newshape, X.shape[-1]) - - return X, y, XTY - -def get_regression_function(order1, order2=None, pol="power"): - if order2 is None: - def fsum(x, ps, o=order1): - f = poly(pol, o) - if hasattr(ps, "ndim") and ps.ndim > 1: - return sum([f(x, n, ps[...,n,np.newaxis]) for n in range(o+1)]) - else: - return sum([f(x, n, ps[n]) for n in range(o+1)]) - else: - def fsum(x1, x2, ps, o1=order1, o2=order2): - idx=0 - psum = 0 - x1, x2 = np.broadcast_arrays(x1[np.newaxis,...], x2[..., np.newaxis]) - if hasattr(ps, "ndim") and ps.ndim > 1: - x1 = np.broadcast_to(x1, [*ps.shape[:-1], *x1.shape]) - x2 = np.broadcast_to(x2, [*ps.shape[:-1], *x2.shape]) - for n in range(o1+1): - f = poly(pol, o1, o2[n]) - for m in range(o2[n]+1): - psum += f(x1, x2, n, m, ps[...,idx,np.newaxis,np.newaxis]) - idx += 1 - else: - for n in range(o1+1): - f = poly(pol, o1, o2[n]) - for m in 
range(o2[n]+1): - psum += f(x1, x2, n, m, ps[idx]) - idx += 1 - return psum - return fsum - -def solve_leastsquare(X, XTY): - # compute the transpose of X for the mt and parameter axes - XT = np.transpose(X, axes=(*np.arange(X.ndim-2), X.ndim-1, X.ndim-2)) - XTX = XT @ X - # compute the inverse of the matrix in each bin (reshape to make last two axes contiguous, reshape back after inversion), - # this term is also the covariance matrix for the parameters - XTXinv = np.linalg.inv(XTX.reshape(-1,*XTX.shape[-2:])) - XTXinv = XTXinv.reshape((*XT.shape[:-2],*XTXinv.shape[-2:])) - params = np.einsum('...ij,...j->...i', XTXinv, XTY) - return params, XTXinv - -def solve_nonnegative_leastsquare(X, XTY, exclude_idx=None): - # exclude_idx to exclude the non negative constrained for one parameter by evaluating the nnls twice and flipping the sign - XT = np.transpose(X, axes=(*np.arange(X.ndim-2), X.ndim-1, X.ndim-2)) - XTX = XT @ X - XTXinv = np.linalg.inv(XTX.reshape(-1,*XTX.shape[-2:])) - XTXinv = XTXinv.reshape((*XT.shape[:-2],*XTXinv.shape[-2:])) - orig_shape = XTY.shape - nBins = np.prod(orig_shape[:-1]) - XTY_flat = XTY.reshape(nBins, XTY.shape[-1]) - XTX_flat = XTX.reshape(nBins, XTX.shape[-2], XTX.shape[-1]) - # params = [fnnls(xtx, xty) for xtx, xty in zip(XTX_flat, XTY_flat)] # use fast nnls (for some reason slower, even though it should be faster ...) - params = [nnls(xtx, xty)[0] for xtx, xty in zip(XTX_flat, XTY_flat)] # use scipy implementation of nnls - params = np.reshape(params, orig_shape) - if exclude_idx is not None and np.sum(params[...,exclude_idx]==0): - mask = params[...,exclude_idx]==0 - mask_flat = mask.flatten() - w_flip = np.ones(XTY.shape[-1]) - w_flip[exclude_idx] = -1 - params_negative = [nnls(xtx, xty)[0] for xtx, xty in zip(XTX_flat[mask_flat], XTY_flat[mask_flat] * w_flip)] - params[mask] = np.array(params_negative) * w_flip - logger.info(f"Found {mask.sum()} parameters that are excluded in nnls and negative") - return params, XTXinv - -def compute_chi2(y, y_pred, w=None, nparams=1): - # goodness of fit from parameter matrix 'X', values 'y' and parateters 'params' - residuals = (y - y_pred) - if w is not None: - residuals *= w - chi2 = np.sum((residuals**2), axis=-1) # per fit chi2 - chi2_total = np.sum((residuals**2)) # chi2 of all fits together - - # Degrees of freedom calculation - ndf = y.shape[-1] - nparams - ndf_total = y.size - chi2.size*nparams - - from scipy import stats - - logger.debug(f"Total chi2/ndf = {chi2_total}/{ndf_total} = {chi2_total/ndf_total} (p = {stats.chi2.sf(chi2_total, ndf_total)})") - logger.debug(f"Min chi2 = {chi2.min()}; max chi2 = {chi2.max()}") - return chi2, ndf - def extend_edges(traits, x): # extend array for underflow/overflow with distance from difference of two closest values if traits.underflow: @@ -228,6 +49,7 @@ def extend_edges(traits, x): logger.debug(f"Extend array by overflow bin using {new_x}") return x + def get_rebinning(edges, axis_name): if axis_name == "mt": target_edges = common.get_binning_fakes_mt(high_mt_bins=True) @@ -257,6 +79,7 @@ def get_rebinning(edges, axis_name): logger.debug(f"Rebin axis {axis_name} to {rebin}") return rebin + def divide_arrays(num, den, cutoff=1, replace=1): r = num/den # criteria = abs(den) <= cutoff @@ -266,24 +89,26 @@ def divide_arrays(num, den, cutoff=1, replace=1): r[criteria] = replace # if denumerator is close to 0 set ratio to 1 to avoid large negative/positive values return r + def spline_smooth_nominal(binvals, edges, edges_out, axis): - # interpolation of CDF - cdf = 
np.cumsum(binvals, axis=axis) - padding = binvals.ndim*[(0,0)] - padding[axis] = (1,0) - cdf = np.pad(cdf, padding) + # interpolation of CDF + cdf = np.cumsum(binvals, axis=axis) + padding = binvals.ndim*[(0,0)] + padding[axis] = (1,0) + cdf = np.pad(cdf, padding) + + x = edges + y = cdf + xout = edges_out - x = edges - y = cdf - xout = edges_out + spline = scipy.interpolate.make_interp_spline(x, y, axis=axis, bc_type=None) + yout = spline(xout) - spline = scipy.interpolate.make_interp_spline(x, y, axis=axis, bc_type=None) - yout = spline(xout) + binvalsout = np.diff(yout, axis=axis) + binvalsout = np.maximum(binvalsout, 0.) - binvalsout = np.diff(yout, axis=axis) - binvalsout = np.maximum(binvalsout, 0.) + return binvalsout - return binvalsout def spline_smooth(binvals, edges, edges_out, axis, binvars=None, syst_variations=False): ynom = spline_smooth_nominal(binvals, edges, edges_out, axis=axis) @@ -311,6 +136,7 @@ def spline_smooth(binvals, edges, edges_out, axis, binvars=None, syst_variations return ynom, yvars + class HistselectorABCD(object): def __init__(self, h, name_x=None, name_y=None, fakerate_axes=["eta","pt","charge"], @@ -319,7 +145,7 @@ def __init__(self, h, name_x=None, name_y=None, upper_bound_y=None, # using an upper bound on the abcd y-axis (e.g. isolation) integrate_x=True, # integrate the abcd x-axis in final histogram (allows simplified procedure e.g. for extrapolation method) ): - self.upper_bound_y=upper_bound_y + self.upper_bound_y = upper_bound_y self.integrate_x = integrate_x self.name_x = name_x @@ -420,31 +246,50 @@ def get_hist(self, h, is_nominal=False): class FakeSelectorSimpleABCD(HistselectorABCD): # simple ABCD method def __init__(self, h, *args, - smoothing_mode="full", # 'binned', 'fakerate', or 'full' + smoothing_mode="full", smoothing_order_fakerate=2, + smoothen_application_region=True, # if application region should be smoothed in case of fakerate smoothing + smoothing_order_spectrum=3, throw_toys=None, #"normal", # None, 'normal' or 'poisson' global_scalefactor=1, # apply global correction factor on prediction **kwargs ): """ - :polynomial: choices: ["power","bernstein"] + :smoothing_mode: choices: ['binned', 'fakerate', 'full'] """ super().__init__(h, *args, **kwargs) # nominal histogram to be used to transfer variances for systematic variations self.h_nominal = None self.global_scalefactor = global_scalefactor + self.smoothing_mode = smoothing_mode - # select appropriate polynomial depending on type of smoothing - if smoothing_mode == "fakerate": - self.polynomial = "bernstein" - elif smoothing_mode == "full": - self.polynomial = "monotonic" + self.smoothen_application_region=smoothen_application_region + + # select appropriate regressor objects depending on type of smoothing + if self.smoothing_mode == "fakerate": + self.fakerate_regressor = Regressor( + "bernstein", + smoothing_order_fakerate, + min_x=self.smoothing_axis_min, + max_x=self.smoothing_axis_max, + ) else: - self.polynomial = "power" + self.fakerate_regressor = None + + if self.smoothing_mode in ["full", "fakerate"]: + self.spectrum_regressor = Regressor( + "monotonic", + # "power", + smoothing_order_spectrum, + min_x=self.smoothing_axis_min, + max_x=self.smoothing_axis_max, + ) + else: + self.spectrum_regressor = None # rebinning doesn't make sense for binned estimation - if smoothing_mode in ["binned"]: + if self.smoothing_mode in ["binned"]: self.rebin_smoothing_axis = None if hasattr(self, "fakerate_integration_axes"): @@ -453,31 +298,11 @@ def __init__(self, h, *args, 
self.throw_toys = throw_toys - # fakerate factor - self.smoothing_mode = smoothing_mode - self.smoothing_order_fakerate = smoothing_order_fakerate - if self.smoothing_mode != "binned": - logger.info(f"Fakerate smoothing order is {self.smoothing_order_fakerate}") - - # solve with non negative least squares - if self.polynomial in ["bernstein",]: - self.solve = solve_nonnegative_leastsquare - elif self.polynomial in ["monotonic",]: - # exclude first parameter (integration constant) from non negative constraint - self.solve = lambda x,y: solve_nonnegative_leastsquare(x,y,0) - else: - self.solve = solve_leastsquare - - # set smooth functions - self.f_smoothing = None - if self.smoothing_mode != "binned": - self.f_smoothing = get_regression_function(self.smoothing_order_fakerate, pol=self.polynomial) - # histogram with nonclosure corrections self.hCorr = None - self.external_params = None - self.external_cov = None + # swap the A and C regions for better numerical behaviour (only implemented for fakerate smoothing) + self.swap_regions = True def set_correction(self, hQCD, axes_names=False, mirror_axes=["eta"], flow=True): # hQCD is QCD MC histogram before selection (should contain variances) @@ -547,19 +372,47 @@ def transfer_variances(self, h, set_nominal=False): raise RuntimeError(f"Failed to transfer variances") return h - def get_hist(self, h, is_nominal=False, variations_smoothing=False, flow=True): + def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothing=False, flow=True): idx_x = h.axes.name.index(self.name_x) if self.smoothing_mode == "fakerate": h = self.transfer_variances(h, set_nominal=is_nominal) - y_frf, y_frf_var = self.compute_fakeratefactor(h, smoothing=True, syst_variations=variations_smoothing) - c, cvar = self.get_yields_applicationregion(h) - d = c * y_frf + y_frf, y_frf_var = self.compute_fakeratefactor(h, smoothing=True, syst_variations=variations_frf) + + if self.swap_regions: + if type(self) == FakeSelectorSimpleABCD: + # replace C region with B region + hC = self.get_hist_failX_passY(h) + elif type(self) == FakeSelector1DExtendedABCD: + # replace C region with Ax region + hC = h[{self.name_x: self.sel_d2x, self.name_y: self.sel_dy}] + else: + hC = self.get_hist_passX_failY(h) + + cval = hC.values(flow=flow) + cvar = hC.variances(flow=flow) + if self.smoothen_application_region: + cval, cvar = self.smoothen_spectrum( + hC, + cval, + cvar, + syst_variations=variations_smoothing, + use_spline=False, + flow=flow, + ) + + d = cval * y_frf if variations_smoothing: - dvar = c[..., np.newaxis,np.newaxis] * y_frf_var[...,:,:] + dvar = cvar[...,:,:] * y_frf[..., np.newaxis,np.newaxis] + elif variations_frf: + dvar = cval[..., np.newaxis,np.newaxis] * y_frf_var[...,:,:] + elif self.smoothen_application_region: + # everything is smoothed, no additional bin by bin statistical uncertainty + dvar = np.zeros_like(d) else: # only take bin by bin uncertainty from c region dvar = y_frf**2 * cvar + elif self.smoothing_mode == "full": h = self.transfer_variances(h, set_nominal=is_nominal) d, dvar = self.calculate_fullABCD_smoothed(h, flow=flow, syst_variations=variations_smoothing) @@ -574,7 +427,7 @@ def get_hist(self, h, is_nominal=False, variations_smoothing=False, flow=True): *h[{self.name_x: self.sel_x if not self.integrate_x else hist.sum, self.name_y: self.sel_y}].axes, storage=hist.storage.Double() if variations_smoothing else h.storage_type()) hSignal.values(flow=flow)[...] 
= d - if variations_smoothing and self.smoothing_mode != "binned": + if (variations_smoothing or variations_frf) and self.smoothing_mode != "binned": hSignal = self.get_syst_hist(hSignal, d, dvar, flow=flow) elif hSignal.storage_type == hist.storage.Weight: hSignal.variances(flow=flow)[...] = dvar @@ -592,14 +445,18 @@ def get_yields_applicationregion(self, h, flow=True): return c, cvar return c, None - def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow=True, auxiliary_info=False): + def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow=True): # rebin in smoothing axis to have stable ratios sel = {n: hist.sum for n in self.fakerate_integration_axes} hNew = hh.rebinHist(h[sel], self.smoothing_axis_name, self.rebin_smoothing_axis) if self.rebin_smoothing_axis is not None else h[sel] # select sideband regions ha = self.get_hist_failX_failY(hNew) - hb = self.get_hist_failX_passY(hNew) + if self.swap_regions: + # replace B region with C region + hb = self.get_hist_passX_failY(hNew) + else: + hb = self.get_hist_failX_passY(hNew) a = ha.values(flow=flow) b = hb.values(flow=flow) @@ -618,7 +475,15 @@ def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow if smoothing: x = self.get_bin_centers_smoothing(hNew, flow=True) # the bins where the smoothing is performed (can be different to the bins in h) - y, y_var = self.smoothen(h, x, y, y_var, syst_variations=syst_variations, auxiliary_info=auxiliary_info, flow=flow) + y, y_var = self.smoothen( + h, + x, + y, + y_var, + regressor=self.fakerate_regressor, + syst_variations=syst_variations, + flow=flow + ) # broadcast abcd-x axis and application axes slices=[slice(None) if n in ha.axes.name else np.newaxis for n in h[{self.name_x: self.sel_x}].axes.name if n != self.name_y] @@ -627,7 +492,7 @@ def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow return y, y_var - def smoothen(self, h, x, y, y_var, syst_variations=False, auxiliary_info=False, signal_region=False, flow=True): + def smoothen(self, h, x, y, y_var, regressor, syst_variations=False, reduce=False, flow=True): if h.storage_type == hist.storage.Weight: # transform with weights w = 1/np.sqrt(y_var) @@ -642,18 +507,10 @@ def smoothen(self, h, x, y, y_var, syst_variations=False, auxiliary_info=False, y = np.moveaxis(y, idx_ax_smoothing, -1) w = np.moveaxis(w, idx_ax_smoothing, -1) - # smooth frf (e.g. 
in pT) - X, XTY = get_parameter_matrices(x, y, w, self.smoothing_order_fakerate, pol=self.polynomial) - params, cov = self.solve(X, XTY) - - if self.smoothing_mode == "full" and not signal_region: - if auxiliary_info: - # compute chi2 from individual sideband regions - y_pred = self.f_smoothing(x, params) - y_pred = y_pred.reshape(y.shape) # flatten - w = w.reshape(y.shape) - chi2, ndf = compute_chi2(y, y_pred, w, nparams=params.shape[-1]) + # smoothen + regressor.solve(x, y, w) + if reduce: # add up parameters from smoothing of individual sideband regions if type(self) == FakeSelectorSimpleABCD: # exp(-a + b + c) @@ -669,48 +526,18 @@ def smoothen(self, h, x, y, y_var, syst_variations=False, auxiliary_info=False, w_region = np.array([-1, 2, -1, 2, -4, 2, -1, 2], dtype=int) # linear parameter combination - params = np.sum(params*w_region[*[np.newaxis]*(params.ndim-2), slice(None), np.newaxis], axis=-2) - cov = np.sum(cov*w_region[*[np.newaxis]*(params.ndim-2), slice(None), np.newaxis, np.newaxis]**2, axis=-3) - - if self.polynomial == "monotonic": - # performing a nnls to enforce monotonicity for the signal region (using generalized least squares) - Y = params - W = np.linalg.inv(cov.reshape(-1,*cov.shape[-2:])) - W = W.reshape((*cov.shape[:-2],*W.shape[-2:])) - WY = np.einsum('...ij,...j->...i', W, Y) - # the design matrix X is just a 1xn unity matrix and can thus be ignored - XTWY = WY - XTWX = W - - orig_shape = XTWY.shape - nBins = np.prod(orig_shape[:-1]) - XTWY_flat = XTWY.reshape(nBins, XTWY.shape[-1]) - XTWX_flat = XTWX.reshape(nBins, XTWX.shape[-2], XTWX.shape[-1]) - params = [nnls(xtwx, xtwy)[0] for xtwx, xtwy in zip(XTWX_flat, XTWY_flat)] - params = np.reshape(params, orig_shape) - - # allow the integration constaint to be negative - if np.sum(params[...,0]==0) > 0: - mask = params[...,0]==0 - mask_flat = mask.flatten() - w_flip = np.ones(XTWY.shape[-1]) - w_flip[0] = -1 - params_negative = [nnls(xtx, xty)[0] for xtx, xty in zip(XTWX_flat[mask_flat], XTWY_flat[mask_flat] * w_flip)] - params[mask] = np.array(params_negative) * w_flip - logger.info(f"Found {mask.sum()} parameters that are excluded in nnls and negative") - - - if self.external_cov is not None and syst_variations: - cov = cov + self.external_cov[..., *[np.newaxis for n in range(cov.ndim - self.external_cov.ndim)],:,:] - if self.external_params is not None: - params = params + self.external_params[..., *[np.newaxis for n in range(params.ndim - self.external_params.ndim)],:] + regressor.params = np.sum(regressor.params*w_region[*[np.newaxis]*(regressor.params.ndim-2), slice(None), np.newaxis], axis=-2) + regressor.cov = np.sum(regressor.cov*w_region[*[np.newaxis]*(regressor.params.ndim-2), slice(None), np.newaxis, np.newaxis]**2, axis=-3) + + if regressor.polynomial == "monotonic": + regressor.force_positive(exclude_idx=0) # evaluate in range of original histogram x_smooth_orig = self.get_bin_centers_smoothing(h, flow=True) - y_smooth_orig = self.f_smoothing(x_smooth_orig, params) + y_smooth_orig = regressor.evaluate(x_smooth_orig) if syst_variations: - y_smooth_var_orig = self.make_eigenvector_predictons_frf(params, cov, x_smooth_orig) + y_smooth_var_orig = regressor.get_eigenvector_predictions(x_smooth_orig) else: y_smooth_var_orig = None @@ -725,16 +552,79 @@ def smoothen(self, h, x, y, y_var, syst_variations=False, auxiliary_info=False, if y_smooth_var_orig is not None and np.sum(y_smooth_var_orig<0) > 0: logger.warning(f"Found {np.sum(y_smooth_var_orig<0)} bins with negative values from smoothing variations") - 
if auxiliary_info: - if self.smoothing_mode != "full" or signal_region: - y_pred = self.f_smoothing(x, params) - y_pred = y_pred.reshape(y.shape) # flatten - w = w.reshape(y.shape) - chi2, ndf = compute_chi2(y, y_pred, w, nparams=params.shape[-1]) + return y_smooth_orig, y_smooth_var_orig + + def smoothen_spectrum(self, h, sval, svar, syst_variations=False, use_spline=False, reduce=False, flow=True): + smoothidx = [n for n in h.axes.name if n not in [self.name_x, self.name_y]].index(self.smoothing_axis_name) + smoothing_axis = h.axes[self.smoothing_axis_name] + nax = sval.ndim + + # underflow and overflow are left unchanged along the smoothing axis + # so we need to exclude them if they have been otherwise included + if flow: + smoothstart = 1 if smoothing_axis.traits.underflow else 0 + smoothstop = -1 if smoothing_axis.traits.overflow else None + smoothslice = slice(smoothstart, smoothstop) + else: + smoothslice = slice(None) + + sel = nax*[slice(None)] + sel[smoothidx] = smoothslice + + sval = sval[*sel] + svar = svar[*sel] + + if use_spline: + sval, svar = spline_smooth(sval, edges = smoothing_axis.edges, edges_out = h.axes[self.smoothing_axis_name].edges, axis=smoothidx, binvars=svar, syst_variations=syst_variations) + else: + xwidth = h.axes[self.smoothing_axis_name].widths + + xwidthtgt = xwidth[*smoothidx*[None], :, *(nax - smoothidx - 2 + (0 if reduce else 1))*[None]] + xwidth = xwidth[*smoothidx*[None], :, *(nax - smoothidx - 1)*[None]] + + sval *= 1./xwidth + svar *= 1./xwidth**2 - return y_smooth_orig, y_smooth_var_orig, params, cov, chi2, ndf + goodbin = (sval > 0.) & (svar > 0.) + if goodbin.size-np.sum(goodbin) > 0: + logger.warning(f"Found {goodbin.size-np.sum(goodbin)} of {goodbin.size} bins with 0 or negative bin content, those will be set to 0 and a large error") + + logd = np.where(goodbin, np.log(sval), 0.) + logdvar = np.where(goodbin, svar/sval**2, np.inf) + x = self.get_bin_centers_smoothing(h, flow=flow) # the bins where the smoothing is performed (can be different to the bins in h) + + sval, svar = self.smoothen( + h, + x, + logd, + logdvar, + regressor=self.spectrum_regressor, + syst_variations=syst_variations, + reduce=reduce, + ) + + sval = np.exp(sval)*xwidthtgt + sval = np.where(np.isfinite(sval), sval, 0.) + if syst_variations: + svar = np.exp(svar)*xwidthtgt[..., None, None]**2 + svar = np.where((sval[..., None, None] > 0.) 
& np.isfinite(svar), svar, sval[..., None, None]) + + # get output shape from original hist axes, but as for result histogram + hOut = h[{self.name_x:self.sel_x if not self.integrate_x else hist.sum}] if self.name_x in h.axes.name else h + out = np.zeros([a.extent if flow else a.shape for a in hOut.axes if a.name != self.name_y], dtype=sval.dtype) + # leave the underflow and overflow unchanged if present + out[*sel[:-1]] = sval + if syst_variations: + outvar = np.zeros_like(out) + outvar = outvar[..., None, None]*np.ones((*outvar.shape, *svar.shape[-2:]), dtype=outvar.dtype) + # leave the underflow and overflow unchanged if present + outvar[*sel[:-1], :, :] = svar else: - return y_smooth_orig, y_smooth_var_orig + # with full smoothing all of the statistical uncertainty is included in the + # explicit variations, so the remaining binned uncertainty is zero + outvar = np.zeros_like(out) + + return out, outvar def get_syst_hist(self, hNominal, values, alternate, flow=True): # return systematic histogram with nominal and nominal+variation on diagonal elements for systematic axes of parameters and up/down variations @@ -756,39 +646,22 @@ def get_syst_hist(self, hNominal, values, alternate, flow=True): hNominal = hh.addHists(hNominal, hsyst) return hNominal - def make_eigenvector_predictons_frf(self, params, cov, x1, x2=None): - return make_eigenvector_predictons(params, cov, func=self.f_smoothing, x1=x1, x2=x2, force_positive=False) - def get_bin_centers_smoothing(self, h, flow=True): - return self.get_bin_centers(h, self.smoothing_axis_name, xmin=self.smoothing_axis_min, xmax=self.smoothing_axis_max, flow=flow) + return self.get_bin_centers(h, self.smoothing_axis_name, flow=flow) def get_bin_edges_smoothing(self, h, flow=True): - return self.get_bin_edges(h, self.smoothing_axis_name, xmin=self.smoothing_axis_min, xmax=self.smoothing_axis_max, flow=flow) + return self.get_bin_edges(h, self.smoothing_axis_name, flow=flow) - def get_bin_edges(self, h, axis_name, flow=True, **kwargs): + def get_bin_edges(self, h, axis_name, flow=True): x = h.axes[axis_name].edges if flow: x = extend_edges(h.axes[axis_name].traits, x) - return self.get_bin_boundaries(x, **kwargs) + return x - def get_bin_centers(self, h, axis_name, flow=True, **kwargs): + def get_bin_centers(self, h, axis_name, flow=True): x = h.axes[axis_name].centers if flow: x = extend_edges(h.axes[axis_name].traits, x) - return self.get_bin_boundaries(x, **kwargs) - - def get_bin_boundaries(self, x, xmin=None, xmax=None, cap=False): - # get bin boundaries for interpolation/smoothing with transformation - if self.polynomial in ["bernstein", "monotonic"]: - # transform bernstein polinomials to [0,1] - x = (x - xmin) / (xmax - xmin) - if np.sum(x < 0) or np.sum(x > 1): - if cap: - logger.info("Values outside [0,1] found, those will be capped to [0,1]") - x[x < 0] = 0 - x[x > 1] = 1 - else: - raise RuntimeError(f"All values need to be within [0,1] but {np.sum(x < 0)} values smaller 0 ({x[x < 0]}) and {np.sum(x > 1)} larger 1 ({x[x > 1]}) found after transformation with xmin={xmin} and xmax={xmax}") return x def calculate_fullABCD(self, h, flow=True): @@ -806,7 +679,7 @@ def calculate_fullABCD(self, h, flow=True): return d, dvar - def calculate_fullABCD_smoothed(self, h, syst_variations=False, use_spline=False, auxiliary_info=False, signal_region=False, flow=True): + def calculate_fullABCD_smoothed(self, h, syst_variations=False, use_spline=False, signal_region=False, flow=True): if type(self) in [FakeSelectorSimpleABCD, FakeSelector1DExtendedABCD]: 
# sum up high abcd-y axis bins @@ -839,273 +712,15 @@ def calculate_fullABCD_smoothed(self, h, syst_variations=False, use_spline=False sval = sval.reshape((*sval.shape[:-2], sval.shape[-2]*sval.shape[-1]))[...,:-1] svar = svar.reshape((*svar.shape[:-2], svar.shape[-2]*svar.shape[-1]))[...,:-1] - smoothidx = [n for n in h.axes.name if n not in [self.name_x, self.name_y]].index(self.smoothing_axis_name) - smoothing_axis = h.axes[self.smoothing_axis_name] - nax = sval.ndim - - # underflow and overflow are left unchanged along the smoothing axis - # so we need to exclude them if they have been otherwise included - if flow: - smoothstart = 1 if smoothing_axis.traits.underflow else 0 - smoothstop = -1 if smoothing_axis.traits.overflow else None - smoothslice = slice(smoothstart, smoothstop) - else: - smoothslice = slice(None) - - sel = nax*[slice(None)] - sel[smoothidx] = smoothslice - - sval = sval[*sel] - svar = svar[*sel] - - if use_spline: - sval, svar = spline_smooth(sval, edges = smoothing_axis.edges, edges_out = h.axes[self.smoothing_axis_name].edges, axis=smoothidx, binvars=svar, syst_variations=syst_variations) - else: - xwidth = h.axes[self.smoothing_axis_name].widths - - xwidthtgt = xwidth[*smoothidx*[None], :, *(nax - smoothidx - 2 + signal_region)*[None]] - xwidth = xwidth[*smoothidx*[None], :, *(nax - smoothidx - 1)*[None]] - - sval *= 1./xwidth - svar *= 1./xwidth**2 - - goodbin = (sval > 0.) & (svar > 0.) - if goodbin.size-np.sum(goodbin) > 0: - logger.warning(f"Found {goodbin.size-np.sum(goodbin)} of {goodbin.size} bins with 0 or negative bin content, those will be set to 0 and a large error") - - logd = np.where(goodbin, np.log(sval), 0.) - logdvar = np.where(goodbin, svar/sval**2, np.inf) - x = self.get_bin_centers_smoothing(h, flow=True) # the bins where the smoothing is performed (can be different to the bins in h) - - if auxiliary_info: - logd, logdvar, params, cov, chi2, ndf = self.smoothen( - h, x, logd, logdvar, syst_variations=syst_variations, signal_region=signal_region, auxiliary_info=True) - else: - logd, logdvar = self.smoothen(h, x, logd, logdvar, syst_variations=syst_variations) - - sval = np.exp(logd)*xwidthtgt - sval = np.where(np.isfinite(sval), sval, 0.) - if syst_variations: - svar = np.exp(logdvar)*xwidthtgt[..., None, None]**2 - svar = np.where((sval[..., None, None] > 0.) & np.isfinite(svar), svar, sval[..., None, None]) - - # get output shape from original hist axes, but as for result histogram - d = np.zeros([a.extent if flow else a.shape for a in h[{self.name_x:self.sel_x if not self.integrate_x else hist.sum}].axes if a.name != self.name_y], dtype=sval.dtype) - # leave the underflow and overflow unchanged if present - d[*sel[:-1]] = sval - if syst_variations: - dvar = np.zeros_like(d) - dvar = dvar[..., None, None]*np.ones((*dvar.shape, *svar.shape[-2:]), dtype=dvar.dtype) - # leave the underflow and overflow unchanged if present - dvar[*sel[:-1], :, :] = svar - else: - # with full smoothing all of the statistical uncertainty is included in the - # explicit variations, so the remaining binned uncertainty is zero - dvar = np.zeros_like(d) - - if auxiliary_info: - return d, dvar, params, cov, chi2, ndf - else: - return d, dvar - -class FakeSelectorSimultaneousABCD(FakeSelectorSimpleABCD): - # simple ABCD method simultaneous fitting all regions - def __init__(self, h, *args, **kwargs): - super().__init__(h, *args, **kwargs) - - def get_hist(self, h): - if h.storage_type == hist.storage.Weight: - # setting errors to 0 - h.view(flow=True)[...] 
= np.stack((h.values(flow=True), np.zeros_like(h.values(flow=True))), axis=-1) - - if self.name_x not in h.axes.name or self.name_y not in h.axes.name: - raise RuntimeError(f'{self.name_x} and {self.name_y} expected to be found in histogram, but only have axes {h.axes.name}') - - # axes in the correct ordering - axes = [ax for ax in h.axes.name if ax not in [self.name_x, self.name_y]] - axes += [self.name_x, self.name_y] - - if set(h.axes.name) != set(axes): - logger.warning(f"Axes in histogram '{h.axes.name}' are not the same as required '{axes}' or in a different order than expected, try to project") - h = h.project(*axes) - - # set the expected values in the signal region - slices = [self.sel_x if n==self.name_x else self.sel_y if n==self.name_y else slice(None) for n in h.axes.name] - h.values(flow=True)[*slices] = super().get_hist(h).values(flow=True) - - if self.global_scalefactor != 1: - h = hh.scaleHist(h, self.global_scalefactor) - - return h - -class FakeSelectorExtrapolateABCD(FakeSelectorSimpleABCD): - # extrapolate the fakerate in the abcd x axis by finding an analytic description in the dx region - def __init__(self, h, *args, - extrapolation_order=1, - rebin_x="automatic", # can be a list of bin edges, "automatic", or None - **kwargs - ): - super().__init__(h, *args, **kwargs) - self.set_selections_x() - - self.extrapolation_order = extrapolation_order - - if rebin_x == "automatic": - edges = h.axes[self.name_x].edges - self.rebin_x = get_rebinning(edges, self.name_x) - else: - self.rebin_x = rebin_x - - if self.polynomial == "bernstein": - logger.warning(f"It is not recommended to use {self.polynomial} polynomials for extrapolation.") - - if self.smoothing_mode != "binned": - raise NotImplementedError("Smooting fakerate is not implemented") - - self.f_smoothing = get_regression_function(self.extrapolation_order, pol=self.polynomial) - - # set slices object for selection of sideband regions - def set_selections_x(self): - x0, x1, x2, x3 = get_selection_edges(self.name_x) - s = hist.tag.Slicer() - self.sel_x = s[x0:x1:] if x0 is not None and x1.imag > x0.imag else s[x1:x0:] - self.sel_dx = s[x1:x3:] if x3 is None or x3.imag > x1.imag else s[x3:x1:] - - def get_hist(self, h, is_nominal=False, variations_smoothing=False, flow=True): - h = self.transfer_variances(h, set_nominal=is_nominal) - c, cvar = self.get_yields_applicationregion(h) - if self.integrate_x: - # can convert syst variations into bin by bin stat; do for the fist call (nominal histogram) - logger.debug("Integrate abcd x-axis, treat uncertainty as bin by bin stat") - y_frf, y_frf_variation = self.compute_fakeratefactor(h, syst_variations=is_nominal) - d = c * y_frf - # sum up abcd-x axis (sum up variations to treat uncertainty fully correlated across abcd-x axis bins) - idx_x = h[{self.name_y: self.sel_dy}].axes.name.index(self.name_x) - d = d.sum(axis=idx_x) - if is_nominal: - d_variation = c[..., np.newaxis,np.newaxis] * y_frf_variation - d_variation = d_variation.sum(axis=idx_x) - dvar = np.sum([(d_variation[...,iparam,0] - d)**2 for iparam in range(d_variation.shape[-2])], axis=0) # sum of squares of up variations - dvar += (y_frf**2 * cvar).sum(axis=idx_x) # add uncorrelated bin by bin uncertainty from application region - else: - dvar = None - else: - y_frf, y_frf_var = self.compute_fakeratefactor(h, syst_variations=variations_smoothing) - if variations_smoothing: - dvar = c[..., np.newaxis,np.newaxis] * y_frf_var - else: - # only take bin by bin uncertainty from c region - dvar = y_frf**2 * cvar - d = c * 
y_frf - - # set histogram in signal region - axes = h[{self.name_x: self.sel_x if not self.integrate_x else hist.sum, self.name_y: self.sel_y}].axes - hSignal = hist.Hist(*axes, storage=hist.storage.Double() if variations_smoothing else h.storage_type()) - hSignal.values(flow=flow)[...] = d - if variations_smoothing: - hSignal = self.get_syst_hist(hSignal, d, dvar, flow=flow) - elif dvar is not None and hSignal.storage_type == hist.storage.Weight: - hSignal.variances(flow=flow)[...] = dvar - - if self.global_scalefactor != 1: - hSignal = hh.scaleHist(hSignal, self.global_scalefactor) - - return hSignal - - def compute_fakeratefactor(self, h, syst_variations=False, flow=True, auxiliary_info=False): - sel = {n: hist.sum for n in self.fakerate_integration_axes} - # rebin in regression axis to have stable ratios - hNew = hh.rebinHist(h[sel], self.name_x, self.rebin_x) if self.rebin_x is not None else h[sel] - - # the underflow and overflow bins are kept by a simple selection, remove them - hNew = hh.disableFlow(hNew, self.name_x) - - # select sideband regions - ha = hNew[{self.name_x: self.sel_dx, self.name_y: self.sel_dy}] - hb = hNew[{self.name_x: self.sel_dx, self.name_y: self.sel_y}] - - a = ha.values(flow=flow) - b = hb.values(flow=flow) - - # fakerate factor - y = divide_arrays(b,a,cutoff=1) - if h.storage_type == hist.storage.Weight: - # full variances - avar = ha.variances(flow=flow) - bvar = hb.variances(flow=flow) - y_var = bvar/a**2 + b**2*avar/a**4 - y_var[a <= 1] = 1e10 - - # the bins where the regression is performed (can be different to the bin in h) - x = self.get_bin_centers(ha, self.name_x, flow=False) - if len(x)-1 < self.extrapolation_order: - raise RuntimeError(f"The extrapolation order is {self.extrapolation_order} but it can not be higher than the number of bins in {self.name_x} of {len(x)} -1") - # if len(x)-1 == self.extrapolation_order: - # logger.info("Extrapolation can be done fully analytical") - # slope = (y[...,1] - y[...,0]) / (x[1] - x[0]) - # offset = y[...,0] - slope * x[0] - - # x_extrapolation = self.get_bin_centers(h[{self.name_x: self.sel_x}], self.name_x, flow=flow) - # y = slope[...,np.newaxis] * x_extrapolation + offset[...,np.newaxis] - - y, y_var = self.extrapolate_fakerate(h[{**sel, self.name_x: self.sel_x}], x, y, y_var, syst_variations=syst_variations, auxiliary_info=auxiliary_info, flow=flow) - - # broadcast abcd-x axis and application axes - slices=[slice(None) if n in ha.axes.name else np.newaxis for n in h[{self.name_x: self.sel_x}].axes.name if n != self.name_y and (not self.integrate_x or n != self.name_x)] - y = y[*slices] - y_var = y_var[*slices] if y_var is not None else None - - return y, y_var - - def extrapolate_fakerate(self, h, x, y, y_var, syst_variations=False, auxiliary_info=False, flow=True): - if h.storage_type == hist.storage.Weight: - # transform with weights - w = 1/np.sqrt(y_var) - else: - logger.warning("Extrapolate ABCD on histogram without uncertainties, make an unweighted linear squared solution.") - w = np.ones_like(y) - - # move regression axis to last - axes = [n for n in h.axes.name if n not in [self.name_y,] ] - idx_ax_regress = axes.index(self.name_x) - if idx_ax_regress != len(axes)-1: - y = np.moveaxis(y, idx_ax_regress, -1) - w = np.moveaxis(w, idx_ax_regress, -1) - - # regression frf (e.g. 
in mT) - X, XTY = get_parameter_matrices(x, y, w, self.extrapolation_order, pol=self.polynomial) - - params, cov = self.solve(X, XTY) - - # evaluate in range of application region of original histogram - x_extrapolation = self.get_bin_centers(h, self.name_x, flow=flow) - y_extrapolation = self.f_smoothing(x_extrapolation, params) - - if syst_variations: - y_extrapolation_var = self.make_eigenvector_predictons_frf(params, cov, x_extrapolation) - else: - y_extrapolation_var = None - - if idx_ax_regress != len(axes)-1: - # move extrapolation axis to original positon again - y_extrapolation = np.moveaxis(y_extrapolation, -1, idx_ax_regress) - y_extrapolation_var = np.moveaxis(y_extrapolation_var, -3, idx_ax_regress) if syst_variations else None - - # check for negative rates - if np.sum(y_extrapolation<0) > 0: - logger.warning(f"Found {np.sum(y_extrapolation<0)} bins with negative fake rate factors") - if y_extrapolation_var is not None and np.sum(y_extrapolation_var<0) > 0: - logger.warning(f"Found {np.sum(y_extrapolation_var<0)} bins with negative fake rate factor variations") - - if auxiliary_info: - y_pred = self.f_smoothing(x, params) - # flatten - y_pred = y_pred.reshape(y.shape) - w = w.reshape(y.shape) - chi2, ndf = compute_chi2(y, y_pred, w, nparams=params.shape[-1]) - return y_extrapolation, y_extrapolation_var, params, cov, chi2, ndf - else: - return y_extrapolation, y_extrapolation_var + return self.smoothen_spectrum( + h, + sval, + svar, + syst_variations=syst_variations, + use_spline=use_spline, + reduce=not signal_region, + flow=flow, + ) class FakeSelector1DExtendedABCD(FakeSelectorSimpleABCD): # extended ABCD method with 5 control regions as desribed in https://arxiv.org/abs/1906.10831 equation 16 @@ -1162,17 +777,22 @@ def calculate_fullABCD(self, h, flow=True, syst_variations=False): return d, dvar - def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow=True, auxiliary_info=False): + def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow=True): # rebin in smoothing axis to have stable ratios sel = {n: hist.sum for n in self.fakerate_integration_axes} hNew = hh.rebinHist(h[sel], self.smoothing_axis_name, self.rebin_smoothing_axis) if self.rebin_smoothing_axis is not None else h[sel] # select sideband regions ha = hNew[{self.name_x: self.sel_dx, self.name_y: self.sel_dy}] - hax = hNew[{self.name_x: self.sel_d2x, self.name_y: self.sel_dy}] hb = hNew[{self.name_x: self.sel_dx, self.name_y: self.sel_y}] hbx = hNew[{self.name_x: self.sel_d2x, self.name_y: self.sel_y}] + if self.swap_regions: + # replace Ax region with C region + hax = self.get_hist_passX_failY(hNew) + else: + hax = hNew[{self.name_x: self.sel_d2x, self.name_y: self.sel_dy}] + a = ha.values(flow=flow) ax = hax.values(flow=flow) b = hb.values(flow=flow) @@ -1236,10 +856,17 @@ def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow logger.info("Done with toys") - if smoothing: x = self.get_bin_centers_smoothing(hNew, flow=True) # the bins where the smoothing is performed (can be different to the bin in h) - y, y_var = self.smoothen(h, x, y, y_var, syst_variations=syst_variations, auxiliary_info=auxiliary_info, flow=flow) + y, y_var = self.smoothen( + h, + x, + y, + y_var, + regressor=self.fakerate_regressor, + syst_variations=syst_variations, + flow=flow, + ) # broadcast abcd-x axis and application axes slices=[slice(None) if n in ha.axes.name else np.newaxis for n in h[{self.name_x: self.sel_x}].axes.name if n != self.name_y] @@ -1264,7 
+891,6 @@ def __init__(self, h, *args, self.set_selections_x(integrate_x=False) self.interpolate_x = interpolate_x - self.interpolation_order = interpolation_order if rebin_x == "automatic": edges = h.axes[self.name_x].edges @@ -1272,33 +898,53 @@ def __init__(self, h, *args, else: self.rebin_x = rebin_x - # min and max (in application region) for transformation of bernstain polynomials into interval [0,1] - self.axis_x_min = h[{self.name_x: self.sel_x}].axes[self.name_x].edges[0] - if self.name_x == "mt": - # mt does not have an upper bound, cap at 100 - edges = self.rebin_x if self.rebin_x is not None else h.axes[self.name_x].edges - self.axis_x_max = extend_edges(h.axes[self.name_x].traits, edges)[-1] - elif self.name_x in ["iso", "relIso", "relJetLeptonDiff", "dxy"]: - # iso and dxy have a finite lower and upper bound in the application region - self.axis_x_max = abcd_thresholds[self.name_x][1] - else: - self.axis_x_max = self.axis_x.edges[-1] - # shape correction, can be interpolated in the abcd x-axis in 1D, in the x-axis and smoothing axis in 2D, or in the smoothing axis integrating out the abcd x-axis in 1D self.integrate_shapecorrection_x = integrate_shapecorrection_x # if the shape correction factor for the abcd x-axis should be inclusive or differential if self.integrate_shapecorrection_x and self.interpolate_x: raise RuntimeError("Can not integrate and interpolate x at the same time") self.smooth_shapecorrection = smooth_shapecorrection - self.smoothing_order_shapecorrection = smoothing_order_shapecorrection - # set smooth functions - self.f_scf = None - if self.interpolate_x and self.smooth_shapecorrection: - self.f_scf = get_regression_function(self.interpolation_order, self.smoothing_order_shapecorrection, pol=self.polynomial) - elif self.interpolate_x: - self.f_scf = get_regression_function(self.interpolation_order, pol=self.polynomial) - elif self.smooth_shapecorrection: - self.f_scf = get_regression_function(self.smoothing_order_shapecorrection, pol=self.polynomial) + if not self.integrate_shapecorrection_x: + # initialiaze shape correction regressor (can be 1D or 2D) + mins_x = [] + maxs_x = [] + orders = [] + if self.interpolate_x: + # min and max (in application region) for transformation of bernstain polynomials into interval [0,1] + axis_x_min = h[{self.name_x: self.sel_x}].axes[self.name_x].edges[0] + if self.name_x == "mt": + # mt does not have an upper bound, cap at 100 + edges = self.rebin_x if self.rebin_x is not None else h.axes[self.name_x].edges + axis_x_max = extend_edges(h.axes[self.name_x].traits, edges)[-1] + elif self.name_x in ["iso", "relIso", "relJetLeptonDiff", "dxy"]: + # iso and dxy have a finite lower and upper bound in the application region + axis_x_max = abcd_thresholds[self.name_x][1] + else: + axis_x_max = self.axis_x.edges[-1] + mins_x.append(axis_x_min) + maxs_x.append(axis_x_max) + orders.append(interpolation_order) + if self.smooth_shapecorrection: + mins_x.append(self.smoothing_axis_min) + maxs_x.append(self.smoothing_axis_max) + orders.append(smoothing_order_shapecorrection) + + if len(order)>1: + self.shapecorrection_regressor = Regressor2D( + "bernstein", + orders, + min_x=mins_x, + max_x=maxs_x, + ) + elif len(order) == 1: + self.shapecorrection_regressor = Regressor( + "bernstein", + orders[0], + min_x=mins_x[0], + max_x=maxs_x[0], + ) + else: + self.shapecorrection_regressor = None # set slices object for selection of sideband regions def set_selections_y(self): @@ -1360,7 +1006,7 @@ def get_hist(self, h, is_nominal=False, 
variations_scf=False, variations_smoothi return hSignal - def compute_shapecorrection(self, h, smoothing=False, syst_variations=False, apply=False, flow=True, auxiliary_info=False): + def compute_shapecorrection(self, h, smoothing=False, syst_variations=False, apply=False, flow=True): # if apply=True, shape correction is multiplied to application region for correct statistical uncertainty, only allowed if not smoothing if apply and smoothing and (self.interpolate_x or self.smooth_shapecorrection): raise NotImplementedError(f"Direct application of shapecorrection only supported when no smoothing is performed") @@ -1460,17 +1106,13 @@ def compute_shapecorrection(self, h, smoothing=False, syst_variations=False, app w = np.moveaxis(w, (idx_ax_smoothing, idx_ax_interpol), (-2, -1)) x_smoothing = self.get_bin_centers_smoothing(hNew, flow=True) - - X, y, XTY = get_parameter_matrices_from2D(x_interpol, x_smoothing, y, w, - self.interpolation_order, self.smoothing_order_shapecorrection, pol=self.polynomial, flatten=True) - - params, cov = self.solve(X, XTY) + shapecorrection_regressor.solve(x_interpol, x_smoothing, y, w, flatten=True) x_smooth_orig = self.get_bin_centers_smoothing(h, flow=True) - y_smooth_orig = self.f_scf(x_interpol_orig, x_smooth_orig, params) + y_smooth_orig = shapecorrection_regressor.evaluate(x_interpol_orig, x_smooth_orig) if syst_variations: - y_smooth_var_orig = self.make_eigenvector_predictons_scf(params, cov, x_interpol_orig, x_smooth_orig) + y_smooth_var_orig = shapecorrection_regressor.get_eigenvector_predictions(x_interpol_orig, x_smooth_orig) else: y_smooth_var_orig = None @@ -1479,27 +1121,21 @@ def compute_shapecorrection(self, h, smoothing=False, syst_variations=False, app y_smooth_orig = np.moveaxis(y_smooth_orig, (-2, -1), (idx_ax_smoothing, idx_ax_interpol)) y_smooth_var_orig = np.moveaxis(y_smooth_var_orig, (-4, -3), (idx_ax_smoothing, idx_ax_interpol)) if syst_variations else None - if auxiliary_info: - y_pred = self.f_scf(x_interpol, x_smoothing, params) else: # interpolate scf in mT in 1D if idx_ax_interpol != len(axes)-1: y = np.moveaxis(y, idx_ax_interpol, -1) w = np.moveaxis(w, idx_ax_interpol, -1) - X, XTY = get_parameter_matrices(x_interpol, y, w, self.smoothing_order_fakerate, pol=self.polynomial) - params, cov = self.solve(X, XTY) + shapecorrection_regressor.solve(x_interpol, y, w) - y_smooth_orig = self.f_scf(x_interpol_orig, params) + y_smooth_orig = shapecorrection_regressor.evaluate(x_interpol_orig) if syst_variations: - y_smooth_var_orig = self.make_eigenvector_predictons_scf(params, cov, x_interpol_orig) + y_smooth_var_orig = shapecorrection_regressor.get_eigenvector_predictions(x_interpol_orig) else: y_smooth_var_orig = None - if auxiliary_info: - y_pred = self.f_scf(x_interpol, params) - # move interpolation axis to original positon again if idx_ax_interpol != len(axes)-1: y_smooth_orig = np.moveaxis(y_smooth_orig, -1, idx_ax_interpol) @@ -1514,12 +1150,6 @@ def compute_shapecorrection(self, h, smoothing=False, syst_variations=False, app y, y_var = y_smooth_orig, y_smooth_var_orig - if auxiliary_info: - # flatten - y_pred = y_pred.reshape(y.shape) - w = w.reshape(y.shape) - chi2, ndf = compute_chi2(y, y_pred, w, nparams=params.shape[-1]) - elif smoothing and self.smooth_shapecorrection: # don't interpolate in mT, but smooth in pT in 1D @@ -1532,15 +1162,14 @@ def compute_shapecorrection(self, h, smoothing=False, syst_variations=False, app # smooth scf (e.g. 
in pT) x_smoothing = self.get_bin_centers_smoothing(hNew, flow=True) - X, XTY = get_parameter_matrices(x_smoothing, y, w, self.smoothing_order_shapecorrection, pol=self.polynomial) - params, cov = self.solve(X, XTY) + shapecorrection_regressor.solve(x_smoothing, y, w) # evaluate in range of original histogram x_smooth_orig = self.get_bin_centers_smoothing(h, flow=True) - y_smooth_orig = self.f_scf(x_smooth_orig, params) + y_smooth_orig = shapecorrection_regressor.evaluate(x_smooth_orig) if syst_variations: - y_smooth_var_orig = self.make_eigenvector_predictons_scf(params, cov, x_smooth_orig) + y_smooth_var_orig = shapecorrection_regressor.get_eigenvector_predictions(x_smooth_orig) else: y_smooth_var_orig = None @@ -1557,13 +1186,6 @@ def compute_shapecorrection(self, h, smoothing=False, syst_variations=False, app y, y_var = y_smooth_orig, y_smooth_var_orig - if auxiliary_info: - y_pred = self.f_scf(x_smoothing, params) - # flatten - y_pred = y_pred.reshape(y.shape) - w = w.reshape(y.shape) - chi2, ndf = compute_chi2(y, y_pred, w, nparams=params.shape[-1]) - # broadcast abcd-x axis and application axes if self.integrate_shapecorrection_x: slices=[np.newaxis if n==self.name_x or n not in hc.axes.name else slice(None) for n in h.axes.name if n not in [self.name_y]] @@ -1573,13 +1195,10 @@ def compute_shapecorrection(self, h, smoothing=False, syst_variations=False, app y = y[*slices] y_var = y_var[*slices] if y_var is not None else None - if auxiliary_info: - return y, y_var, params, cov, chi2, ndf - else: - return y, y_var + return y, y_var - def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow=True, auxiliary_info=False): + def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow=True): # rebin in smoothing axis to have stable ratios sel = {n: hist.sum for n in self.fakerate_integration_axes} hNew = hh.rebinHist(h[sel], self.smoothing_axis_name, self.rebin_smoothing_axis) if self.rebin_smoothing_axis is not None else h[sel] @@ -1680,7 +1299,7 @@ def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow if smoothing: x = self.get_bin_centers_smoothing(hNew, flow=True) # the bins where the smoothing is performed (can be different to the bin in h) - y, y_var = self.smoothen(h, x, y, y_var, syst_variations=syst_variations, auxiliary_info=auxiliary_info) + y, y_var = self.smoothen(h, x, y, y_var, regressor=self.fakerate_regressor, syst_variations=syst_variations) # broadcast abcd-x axis and application axes slices=[slice(None) if n in ha.axes.name else np.newaxis for n in h[{self.name_x: self.sel_x}].axes.name if n != self.name_y] @@ -1689,11 +1308,8 @@ def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow return y, y_var - def make_eigenvector_predictons_scf(self, params, cov, x1, x2=None): - return make_eigenvector_predictons(params, cov, func=self.f_scf, x1=x1, x2=x2, force_positive=False)#self.polynomial=="bernstein") - def get_bin_centers_interpolation(self, h, flow=True, cap=False): - return self.get_bin_centers(h, self.name_x, xmin=self.axis_x_min, xmax=self.axis_x_max, flow=flow, cap=cap) + return self.get_bin_centers(h, self.name_x, flow=flow) def calculate_fullABCD(self, h, flow=True): if len(self.fakerate_integration_axes) > 0: diff --git a/wremnants/regression.py b/wremnants/regression.py new file mode 100644 index 000000000..bccd84a0c --- /dev/null +++ b/wremnants/regression.py @@ -0,0 +1,342 @@ +import numpy as np +from scipy import stats +from scipy.optimize import nnls +from 
scipy.special import comb + +from utilities import logging + +logger = logging.child_logger(__name__) + + +def compute_chi2(y, y_pred, w=None, nparams=1): + # goodness of fit from parameter matrix 'X', values 'y' and parateters 'params' + residuals = (y - y_pred) + if w is not None: + residuals *= w + chi2 = np.sum((residuals**2), axis=-1) # per fit chi2 + chi2_total = np.sum((residuals**2)) # chi2 of all fits together + + # Degrees of freedom calculation + ndf = y.shape[-1] - nparams + ndf_total = y.size - chi2.size*nparams + + logger.info(f"Total chi2/ndf = {chi2_total}/{ndf_total} = {chi2_total/ndf_total} (p = {stats.chi2.sf(chi2_total, ndf_total)})") + logger.info(f"Min chi2 = {chi2.min()} (p = {stats.chi2.sf(chi2.min(), ndf)})") + logger.info(f"Max chi2 = {chi2.max()} (p = {stats.chi2.sf(chi2.max(), ndf)})") + logger.info(f"Mean chi2 = {chi2.mean()}") + logger.info(f"Std chi2 = {chi2.std()}") + return chi2, ndf + + +def get_parameter_eigenvectors(params, cov, sign=1, force_positive=False): + # diagonalize and get eigenvalues and eigenvectors + e, v = np.linalg.eigh(cov) # The column eigenvectors[:, i] is the normalized eigenvector corresponding to the eigenvalue eigenvalues[i], + vT = np.transpose(v, axes=(*np.arange(v.ndim-2), v.ndim-1, v.ndim-2)) # transpose v to have row eigenvectors[i, :] for easier computations + mag = np.sqrt(e)[...,np.newaxis] * vT * sign + # duplicate params vector to have duplicated rows + params_brd = np.broadcast_to(params[...,np.newaxis], mag.shape) + paramsT = np.transpose(params_brd, axes=(*np.arange(params_brd.ndim-2), params_brd.ndim-1, params_brd.ndim-2)) + mask = (paramsT + mag < 0).any(axis=-1) + if force_positive and mask.sum() > 0: + logger.info(f"Force {mask.sum()} eigenvectors to be positive") + # scaling the magnitude of the eigenvector to avoid negative coefficients + min_idxs = paramsT + mag == np.min(paramsT + mag, axis=-1)[...,np.newaxis] # find indices with minimum value of each eigenvector + min_paramsT = paramsT[min_idxs] + min_mag = mag[min_idxs] + factors = np.reshape(-1*min_paramsT/min_mag, mask.shape) + factors = np.where(mask, factors, np.ones_like(mask)) + factors = np.broadcast_to(factors[...,np.newaxis], mag.shape) + mag = factors * mag + if np.sum(mag.sum(axis=-1) == 0): + logger.warning(f"Found {np.sum(mag.sum(axis=-1) == 0)} eigenvector shifts where all coefficients are 0") + params_var = paramsT + mag + if np.sum(params_var.sum(axis=-1) == 0): + logger.warning(f"Found {np.sum(params_var.sum(axis=-1) == 0)} variations where all coefficients are 0") + return params_var + + +def make_eigenvector_predictons(params, cov, func, x1, x2=None, force_positive=False): + # return alternate values i.e. 
nominal+/-variation + params_up = get_parameter_eigenvectors(params, cov, sign=1, force_positive=force_positive) + y_pred_up = func(x1, params_up) if x2 is None else func(x1, x2, params_up) + y_pred_up = np.moveaxis(y_pred_up, params.ndim-1, -1) # put parameter variations last + params_dn = get_parameter_eigenvectors(params, cov, sign=-1, force_positive=force_positive) + y_pred_dn = func(x1, params_dn) if x2 is None else func(x1, x2, params_dn) + y_pred_dn = np.moveaxis(y_pred_dn, params.ndim-1, -1) # put parameter variations last + return np.stack((y_pred_up, y_pred_dn), axis=-1) + + +def poly(pol, order, order2=None): + if order2 is None: + if pol=="power": + return lambda x, n, p=1: p * x**n + elif pol=="bernstein": + return lambda x, n, p=1, o=order: p * comb(o, n) * x**n * (1 - x)**(o - n) + elif pol=="monotonic": + # integral of bernstein (see https://en.wikipedia.org/wiki/Bernstein_polynomial#Properties) + # for n>o return one additional parameter for the constant term from the integration + return lambda x, n, p=1, o=order: p*x**0 if n==0 else -1*p * 1/o * np.array( + [comb(o, k) * x**k * (1 - x)**(o - k) for k in range(n, o+1)] + ).sum(axis=0) + else: + return lambda x, x2, n, m, p=1, o1=order, o2=order2: p * poly(pol, o1)(x, n) * poly(pol, o2)(x2, m) + + +def get_parameter_matrices(x, y, w, order, pol="power"): + if x.shape != y.shape: + x = np.broadcast_to(x, y.shape) + stackX=[] # parameter matrix X + stackXTY=[] # and X.T @ Y + f = poly(pol, order) + for n in range(order+1): + p = f(x, n) + stackX.append( w * p ) + stackXTY.append((w**2 * p * y).sum(axis=(-1))) + X = np.stack(stackX, axis=-1) + XTY = np.stack(stackXTY, axis=-1) + return X, XTY + + +def get_parameter_matrices_from2D(x, x2, y, w, order, order2=None, pol="power", flatten=False): + x, x2 = np.broadcast_arrays(x[np.newaxis,...], x2[..., np.newaxis]) + x = np.broadcast_to(x, y.shape) + x2 = np.broadcast_to(x2, y.shape) + if order2 is None: + order2 = [0,]*(order+1) + elif type(order2) == int: + order2 = [order2,]*(order+1) + elif type(order2) == list: + if len(order2) < order+1: + order2.append([0,]*(len(order2)-order+1)) + else: + raise RuntimeError(f"Input 'order2' requires type 'None', 'int' or 'list'") + stackX=[] # parameter matrix X + stackXTY=[] # and X.T @ Y + for n in range(order+1): + f = poly(pol, order, order2[n]) + for m in range(order2[n]+1): + p = f(x, x2, n, m) + stackX.append(w * p) + stackXTY.append((w**2 * p * y).sum(axis=(-2, -1))) + X = np.stack(stackX, axis=-1) + XTY = np.stack(stackXTY, axis=-1) + if flatten: + # flatten the 2D array into 1D + newshape = (*y.shape[:-2],np.product(y.shape[-2:])) + y = y.reshape(newshape) + X = X.reshape(*newshape, X.shape[-1]) + + return X, y, XTY + + +def get_regression_function(orders, pol="power"): + if not isinstance(orders, list): + orders = [orders] + + if len(orders) == 1: + def fsum(x, ps, o=orders[0]): + f = poly(pol, o) + if hasattr(ps, "ndim") and ps.ndim > 1: + return sum([f(x, n, ps[...,n,np.newaxis]) for n in range(o+1)]) + else: + return sum([f(x, n, ps[n]) for n in range(o+1)]) + else: + def fsum(x1, x2, ps, o1=orders[0], o2=orders[1]): + idx=0 + psum = 0 + x1, x2 = np.broadcast_arrays(x1[np.newaxis,...], x2[..., np.newaxis]) + if hasattr(ps, "ndim") and ps.ndim > 1: + x1 = np.broadcast_to(x1, [*ps.shape[:-1], *x1.shape]) + x2 = np.broadcast_to(x2, [*ps.shape[:-1], *x2.shape]) + for n in range(o1+1): + f = poly(pol, o1, o2[n]) + for m in range(o2[n]+1): + psum += f(x1, x2, n, m, ps[...,idx,np.newaxis,np.newaxis]) + idx += 1 + else: + for n in 
range(o1+1): + f = poly(pol, o1, o2[n]) + for m in range(o2[n]+1): + psum += f(x1, x2, n, m, ps[idx]) + idx += 1 + return psum + return fsum + + +def solve_leastsquare(X, XTY): + # compute the transpose of X for the mt and parameter axes + XT = np.transpose(X, axes=(*np.arange(X.ndim-2), X.ndim-1, X.ndim-2)) + XTX = XT @ X + # compute the inverse of the matrix in each bin (reshape to make last two axes contiguous, reshape back after inversion), + # this term is also the covariance matrix for the parameters + XTXinv = np.linalg.inv(XTX.reshape(-1,*XTX.shape[-2:])) + XTXinv = XTXinv.reshape((*XT.shape[:-2],*XTXinv.shape[-2:])) + params = np.einsum('...ij,...j->...i', XTXinv, XTY) + return params, XTXinv + + +def solve_nonnegative_leastsquare(X, XTY, exclude_idx=None): + # exclude_idx to exclude the non negative constrained for one parameter by evaluating the nnls twice and flipping the sign + XT = np.transpose(X, axes=(*np.arange(X.ndim-2), X.ndim-1, X.ndim-2)) + XTX = XT @ X + XTXinv = np.linalg.inv(XTX.reshape(-1,*XTX.shape[-2:])) + XTXinv = XTXinv.reshape((*XT.shape[:-2],*XTXinv.shape[-2:])) + orig_shape = XTY.shape + nBins = np.prod(orig_shape[:-1]) + XTY_flat = XTY.reshape(nBins, XTY.shape[-1]) + XTX_flat = XTX.reshape(nBins, XTX.shape[-2], XTX.shape[-1]) + # params = [fnnls(xtx, xty) for xtx, xty in zip(XTX_flat, XTY_flat)] # use fast nnls (for some reason slower, even though it should be faster ...) + params = [nnls(xtx, xty)[0] for xtx, xty in zip(XTX_flat, XTY_flat)] # use scipy implementation of nnls + params = np.reshape(params, orig_shape) + if exclude_idx is not None and np.sum(params[...,exclude_idx]==0): + mask = params[...,exclude_idx]==0 + mask_flat = mask.flatten() + w_flip = np.ones(XTY.shape[-1]) + w_flip[exclude_idx] = -1 + params_negative = [nnls(xtx, xty)[0] for xtx, xty in zip(XTX_flat[mask_flat], XTY_flat[mask_flat] * w_flip)] + params[mask] = np.array(params_negative) * w_flip + logger.info(f"Found {mask.sum()} parameters that are excluded in nnls and negative") + return params, XTXinv + + +def get_solver(polynomial): + # solve with non negative least squares + if polynomial in ["bernstein",]: + return solve_nonnegative_leastsquare + elif polynomial in ["monotonic",]: + # exclude first parameter (integration constant) from non negative constraint + return lambda x,y: solve_nonnegative_leastsquare(x,y,0) + else: + return solve_leastsquare + + +def transform_bernstein(x, min_x, max_x, cap_x=False): + # get x axes values for interpolation/smoothing with transformation + # transform bernstein polinomials to [0,1] + x = (x - min_x) / (max_x - min_x) + if np.sum(x < 0) or np.sum(x > 1): + if cap_x: + logger.info("Values outside [0,1] found, those will be capped to [0,1]") + x[x < 0] = 0 + x[x > 1] = 1 + else: + raise RuntimeError(f"All values need to be within [0,1] but {np.sum(x < 0)} values smaller 0 ({x[x < 0]}) and {np.sum(x > 1)} larger 1 ({x[x > 1]}) found after transformation with xmin={xmin} and xmax={xmax}") + return x + + +class Regressor(object): + def __init__( + self, + polynomial, + order=None, + order2=None, + cap_x=False, + min_x=0, + max_x=1, + ): + self.polynomial = polynomial + self.order = order + + self.solver = get_solver(polynomial) + + self.evaluator = get_regression_function(order, pol=polynomial) + + self.cap_x = cap_x + self.min_x = min_x + self.max_x = max_x + + self.params = None + self.cov = None + + # external parameters and covariance matrix added for evaluation + self.external_params = None + self.external_cov = None + + def 
transform_x(self, x): + # if self.polynomial in ["bernstein", "monotonic"]: + x = transform_bernstein(x, self.min_x, self.max_x, self.cap_x) + return x + + def solve(self, x, y, w, chi2_info=True): + x = self.transform_x(x) + X, XTY = get_parameter_matrices(x, y, w, self.order, pol=self.polynomial) + self.params, self.cov = self.solver(X, XTY) + + if chi2_info: + ypred = self.evaluator(x, self.params) + compute_chi2(y, ypred, w, self.params.shape[-1]) + + def evaluate(self, x): + x = self.transform_x(x) + params = self.params + if self.external_params is not None: + params += self.external_params[..., *[np.newaxis for n in range(self.params.ndim - self.external_params.ndim)],:] + return self.evaluator(x, params) + + def get_eigenvector_predictions(self, x1, x2=None): + x1 = self.transform_x(x1) + if x2 is not None: + x2 = self.transform_x(x2) + cov = self.cov + params = self.params + if self.external_cov: + cov += self.external_cov[..., *[np.newaxis for n in range(self.cov.ndim - self.external_cov.ndim)],:,:] + if self.external_params is not None: + params += self.external_params[..., *[np.newaxis for n in range(self.params.ndim - self.external_params.ndim)],:] + return make_eigenvector_predictons(params, cov, func=self.evaluator, x1=x1, x2=x2, force_positive=False) + + def force_positive(self, exclude_idx=None): + # performing a nnls to enforce monotonicity for the signal region (using generalized least squares) + Y = self.params + W = np.linalg.inv(self.cov.reshape(-1,*self.cov.shape[-2:])) + W = W.reshape((*self.cov.shape[:-2],*W.shape[-2:])) + WY = np.einsum('...ij,...j->...i', W, Y) + # the design matrix X is just a 1xn unity matrix and can thus be ignored + XTWY = WY + XTWX = W + + orig_shape = XTWY.shape + nBins = np.prod(orig_shape[:-1]) + XTWY_flat = XTWY.reshape(nBins, XTWY.shape[-1]) + XTWX_flat = XTWX.reshape(nBins, XTWX.shape[-2], XTWX.shape[-1]) + self.params = [nnls(xtwx, xtwy)[0] for xtwx, xtwy in zip(XTWX_flat, XTWY_flat)] + self.params = np.reshape(self.params, orig_shape) + + # allow the integration constaint to be negative + if exclude_idx is not None and np.sum(self.params[...,exclude_idx]==0) > 0: + mask = self.params[...,exclude_idx]==0 + mask_flat = mask.flatten() + w_flip = np.ones(XTWY.shape[-1]) + w_flip[exclude_idx] = -1 + self.params_negative = [nnls(xtx, xty)[0] for xtx, xty in zip(XTWX_flat[mask_flat], XTWY_flat[mask_flat] * w_flip)] + self.params[mask] = np.array(self.params_negative) * w_flip + logger.info(f"Found {mask.sum()} parameters that are excluded in nnls and negative") + + +class Regressor2D(Regressor): + def __init__( + self, + *args, + **kwargs + ): + super().__init__(*args, **kwargs) + + if not isinstance(self.cap_x, list): + self.order = [self.order] * 2 + if not isinstance(self.cap_x, list): + self.cap_x = [self.cap_x] * 2 + if not isinstance(self.min_x, list): + self.min_x = [self.min_x] * 2 + if not isinstance(self.max_x, list): + self.max_x = [self.max_x] * 2 + + def transform_x(self, x1, x2): + if self.polynomial in ["bernstein", "monotonic"]: + x1 = transform_bernstein(x1, self.min_x[0], self.max_x[0], self.cap_x[0]) + x2 = transform_bernstein(x2, self.min_x[1], self.max_x[1], self.cap_x[1]) + return x1, x2 + + def solve(self, x1, x2, y, w, flatten=False): + x1, x2 = self.transform_x(x1, x2) + X, XTY = get_parameter_matrices_from2D(x1, x2, y, w, *self.order, pol=self.polynomial, flatten=flatten) + self.params, self.cov = self.solver(X, XTY) From e4ddebc236bebe11de7bbec54e757144885ce7fc Mon Sep 17 00:00:00 2001 From: Josh Bendavid Date: 
Tue, 13 Aug 2024 16:32:40 -0400 Subject: [PATCH 06/19] don't hardcode polynomial order for construction of O(x) systematic uncertainty --- scripts/combine/setupCombine.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index 5d2f6417a..55e815592 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -788,11 +788,12 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): actionArgs=dict(variations_scf=True), ) - if args.fakeSmoothingMode == "full": + if args.fakeSmoothingMode == "full" and args.fakeSmoothingOrder > 0: # add systematic of explicit parameter variation + fakeSmoothingOrder = args.fakeSmoothingOrder def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): - # apply varyation by adding parameter value (assumes log space, e.g. in full smoothing) - fakeselector.spectrum_regressor.external_params = np.zeros(4) + # apply variation by adding parameter value (assumes log space, e.g. in full smoothing) + fakeselector.spectrum_regressor.external_params = np.zeros(fakeSmoothingOrder + 1) fakeselector.spectrum_regressor.external_params[1] = 0.5 hvar = fakeselector.get_hist(h, *args, **kwargs) # reset external parameters From 3ea39e58921534f5a57f0daa0cfb3dc2ed06bcd7 Mon Sep 17 00:00:00 2001 From: Josh Bendavid Date: Tue, 13 Aug 2024 16:33:50 -0400 Subject: [PATCH 07/19] apply monotonic constraint only for final estimate, and freeze any parameters which are at the boundary when constructing the uncertainties --- wremnants/histselections.py | 1 + wremnants/regression.py | 13 ++++++++++++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/wremnants/histselections.py b/wremnants/histselections.py index f25831528..fbe700947 100644 --- a/wremnants/histselections.py +++ b/wremnants/histselections.py @@ -284,6 +284,7 @@ def __init__(self, h, *args, smoothing_order_spectrum, min_x=self.smoothing_axis_min, max_x=self.smoothing_axis_max, + nnls = False, ) else: self.spectrum_regressor = None diff --git a/wremnants/regression.py b/wremnants/regression.py index bccd84a0c..677969bad 100644 --- a/wremnants/regression.py +++ b/wremnants/regression.py @@ -31,6 +31,8 @@ def compute_chi2(y, y_pred, w=None, nparams=1): def get_parameter_eigenvectors(params, cov, sign=1, force_positive=False): # diagonalize and get eigenvalues and eigenvectors e, v = np.linalg.eigh(cov) # The column eigenvectors[:, i] is the normalized eigenvector corresponding to the eigenvalue eigenvalues[i], + # protect against negative eigenvalues + e = np.maximum(e, 0.) 
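# [Editor's note - illustrative sketch, not part of this patch. The routine above builds
#  up/down parameter variations as params +/- sqrt(eigenvalue) * eigenvector of the fit
#  covariance; clamping negative eigenvalues to zero simply nulls the corresponding
#  variation. A minimal standalone example of the same construction for a single fit with
#  two parameters; all numbers are made up for illustration:]
import numpy as np
params = np.array([1.0, 0.5])
cov = np.array([[0.04, 0.01],
                [0.01, 0.09]])
evals, evecs = np.linalg.eigh(cov)      # evecs[:, i] is the eigenvector for evals[i]
evals = np.maximum(evals, 0.0)          # protect against small negative eigenvalues
shifts = np.sqrt(evals) * evecs         # scale column i by sqrt(evals[i])
params_up = params[:, None] + shifts    # one shifted parameter vector per eigenvector
params_dn = params[:, None] - shifts
# Evaluating the regression function with params_up / params_dn gives the per-eigenvector
# up/down templates that are later stacked into the systematic variations.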
vT = np.transpose(v, axes=(*np.arange(v.ndim-2), v.ndim-1, v.ndim-2)) # transpose v to have row eigenvectors[i, :] for easier computations mag = np.sqrt(e)[...,np.newaxis] * vT * sign # duplicate params vector to have duplicated rows @@ -233,11 +235,15 @@ def __init__( cap_x=False, min_x=0, max_x=1, + nnls=None, ): self.polynomial = polynomial self.order = order - self.solver = get_solver(polynomial) + if nnls is None: + self.solver = get_solver(polynomial) + else: + self.solver = solve_nonnegative_leastsquare if nnls else solve_leastsquare self.evaluator = get_regression_function(order, pol=polynomial) @@ -312,6 +318,11 @@ def force_positive(self, exclude_idx=None): self.params[mask] = np.array(self.params_negative) * w_flip logger.info(f"Found {mask.sum()} parameters that are excluded in nnls and negative") + # modify covariance matrix by fixing parameters at boundary + # this should lead to zero eigenvalues/null variations in + # the corresponding cases + self.cov = np.where(self.params[..., None]==0., 0., self.cov) + self.cov = np.where(self.params[..., None, :]==0., 0., self.cov) class Regressor2D(Regressor): def __init__( From ec3cba5bf44357069a210fd4e23a620d2bfcfbbc Mon Sep 17 00:00:00 2001 From: David Date: Tue, 13 Aug 2024 22:40:03 +0200 Subject: [PATCH 08/19] Disable application region smoothing for fakerate to be able to plot with integration axes --- scripts/plotting/makeDataMCStackPlot.py | 2 +- wremnants/histselections.py | 8 +++----- wremnants/regression.py | 13 +++++++++++++ 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/scripts/plotting/makeDataMCStackPlot.py b/scripts/plotting/makeDataMCStackPlot.py index 7ad86180b..2b162da13 100644 --- a/scripts/plotting/makeDataMCStackPlot.py +++ b/scripts/plotting/makeDataMCStackPlot.py @@ -146,7 +146,7 @@ def padArray(ref, matchLength): datasets, args.baseName, smoothing_mode=args.fakeSmoothingMode, - smoothingOrderFakerate=args.fakeSmoothingOrder, + smoothingOrderSpectrum=args.fakeSmoothingOrder, integrate_x=all("mt" not in x.split("-") for x in args.hists), mode=args.fakeEstimation, forceGlobalScaleFakes=args.forceGlobalScaleFakes, diff --git a/wremnants/histselections.py b/wremnants/histselections.py index f25831528..74e3080b5 100644 --- a/wremnants/histselections.py +++ b/wremnants/histselections.py @@ -248,7 +248,7 @@ class FakeSelectorSimpleABCD(HistselectorABCD): def __init__(self, h, *args, smoothing_mode="full", smoothing_order_fakerate=2, - smoothen_application_region=True, # if application region should be smoothed in case of fakerate smoothing + smoothen_application_region=False, # if application region should be smoothed in case of fakerate smoothing smoothing_order_spectrum=3, throw_toys=None, #"normal", # None, 'normal' or 'poisson' global_scalefactor=1, # apply global correction factor on prediction @@ -302,7 +302,7 @@ def __init__(self, h, *args, self.hCorr = None # swap the A and C regions for better numerical behaviour (only implemented for fakerate smoothing) - self.swap_regions = True + self.swap_regions = False def set_correction(self, hQCD, axes_names=False, mirror_axes=["eta"], flow=True): # hQCD is QCD MC histogram before selection (should contain variances) @@ -525,9 +525,7 @@ def smoothen(self, h, x, y, y_var, regressor, syst_variations=False, reduce=Fals # ['axy', 'ax', 'bx', 'ay', 'a', 'b', 'cy', 'c'] w_region = np.array([-1, 2, -1, 2, -4, 2, -1, 2], dtype=int) - # linear parameter combination - regressor.params = np.sum(regressor.params*w_region[*[np.newaxis]*(regressor.params.ndim-2), 
slice(None), np.newaxis], axis=-2) - regressor.cov = np.sum(regressor.cov*w_region[*[np.newaxis]*(regressor.params.ndim-2), slice(None), np.newaxis, np.newaxis]**2, axis=-3) + regressor.reduce_parameters(w_region) if regressor.polynomial == "monotonic": regressor.force_positive(exclude_idx=0) diff --git a/wremnants/regression.py b/wremnants/regression.py index bccd84a0c..c2a334877 100644 --- a/wremnants/regression.py +++ b/wremnants/regression.py @@ -285,6 +285,19 @@ def get_eigenvector_predictions(self, x1, x2=None): params += self.external_params[..., *[np.newaxis for n in range(self.params.ndim - self.external_params.ndim)],:] return make_eigenvector_predictons(params, cov, func=self.evaluator, x1=x1, x2=x2, force_positive=False) + def reduce_parameters(self, weight_vector=None, axis=-2): + # linear parameter combination using weight_vector + if weight_vector is None: + weight_vector = np.ones_like(self.params.shape[axis]) + if axis<0: + axis=self.params.ndim+axis + if axis==self.params.ndim-1: + raise RuntimeError("Last axis can not be reduced since it's the parameter axis") + slices = [np.newaxis if i!=axis else slice(None) for i in range(self.params.ndim)] + + self.params = np.sum(self.params*weight_vector[*slices], axis=axis) + self.cov = np.sum(self.cov*weight_vector[*slices, np.newaxis]**2, axis=axis) + def force_positive(self, exclude_idx=None): # performing a nnls to enforce monotonicity for the signal region (using generalized least squares) Y = self.params From 2a68a84efe64cf256efd02c9706abc2b63a34702 Mon Sep 17 00:00:00 2001 From: David Date: Wed, 14 Aug 2024 15:13:55 +0200 Subject: [PATCH 09/19] Add new smoothing option 'hybrid' to smooth fakerate and application region separately; fakerate smoothing remains as before with the binned application region --- scripts/combine/setupCombine.py | 19 ++++++++++--------- wremnants/histselections.py | 15 ++++++--------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index 55e815592..3015d1bba 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -82,7 +82,7 @@ def make_parser(parser=None): parser.add_argument("--noMCStat", action='store_true', help="Do not include MC stat uncertainty in covariance for theory fit (only when using --fitresult)") parser.add_argument("--fakerateAxes", nargs="+", help="Axes for the fakerate binning", default=["eta","pt","charge"]) parser.add_argument("--fakeEstimation", type=str, help="Set the mode for the fake estimation", default="extended1D", choices=["closure", "simple", "extrapolate", "extended1D", "extended2D"]) - parser.add_argument("--fakeSmoothingMode", type=str, default="full", choices=["binned", "fakerate", "full"], help="Smoothing mode for fake estimate.") + parser.add_argument("--fakeSmoothingMode", type=str, default="full", choices=["binned", "fakerate", "hybrid", "full"], help="Smoothing mode for fake estimate.") parser.add_argument("--forceGlobalScaleFakes", default=None, type=float, help="Scale the fakes by this factor (overriding any custom one implemented in datagroups.py in the fakeSelector).") parser.add_argument("--fakeMCCorr", type=str, default=[None], nargs="*", choices=["none", "pt", "eta", "mt"], help="axes to apply nonclosure correction from QCD MC. 
Leave empty for inclusive correction, use'none' for no correction") parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the fake rate or full prediction, depending on the smoothing mode") @@ -763,14 +763,15 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): applySelection=False, # don't apply selection, all regions will be needed for the action action=fakeselector.get_hist, systAxes=[f"_{x}" for x in syst_axes if x in args.fakerateAxes]+["_param", "downUpVar"]) - subgroup = f"{cardTool.getFakeName()}Smoothing" - cardTool.addSystematic(**info, - rename=subgroup, - splitGroup = {subgroup: f".*", "experiment": ".*"}, - systNamePrepend=subgroup, - actionArgs=dict(variations_smoothing=True), - ) - if args.fakeSmoothingMode == "fakerate": + if args.fakeSmoothingMode in ["hybrid", "full"]: + subgroup = f"{cardTool.getFakeName()}Smoothing" + cardTool.addSystematic(**info, + rename=subgroup, + splitGroup = {subgroup: f".*", "experiment": ".*"}, + systNamePrepend=subgroup, + actionArgs=dict(variations_smoothing=True), + ) + if args.fakeSmoothingMode in ["fakerate", "hybrid"]: subgroup = f"{cardTool.getFakeName()}Rate" cardTool.addSystematic(**info, rename=subgroup, diff --git a/wremnants/histselections.py b/wremnants/histselections.py index 0901475e7..7b2e21d3a 100644 --- a/wremnants/histselections.py +++ b/wremnants/histselections.py @@ -248,14 +248,13 @@ class FakeSelectorSimpleABCD(HistselectorABCD): def __init__(self, h, *args, smoothing_mode="full", smoothing_order_fakerate=2, - smoothen_application_region=False, # if application region should be smoothed in case of fakerate smoothing smoothing_order_spectrum=3, throw_toys=None, #"normal", # None, 'normal' or 'poisson' global_scalefactor=1, # apply global correction factor on prediction **kwargs ): """ - :smoothing_mode: choices: ['binned', 'fakerate', 'full'] + :smoothing_mode: choices: ['binned', 'fakerate', 'hybrid', 'full'] """ super().__init__(h, *args, **kwargs) @@ -264,10 +263,8 @@ def __init__(self, h, *args, self.global_scalefactor = global_scalefactor self.smoothing_mode = smoothing_mode - self.smoothen_application_region=smoothen_application_region - # select appropriate regressor objects depending on type of smoothing - if self.smoothing_mode == "fakerate": + if self.smoothing_mode in ["fakerate", "hybrid"]: self.fakerate_regressor = Regressor( "bernstein", smoothing_order_fakerate, @@ -277,7 +274,7 @@ def __init__(self, h, *args, else: self.fakerate_regressor = None - if self.smoothing_mode in ["full", "fakerate"]: + if self.smoothing_mode in ["fakerate", "hybrid", "full"]: self.spectrum_regressor = Regressor( "monotonic", # "power", @@ -375,7 +372,7 @@ def transfer_variances(self, h, set_nominal=False): def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothing=False, flow=True): idx_x = h.axes.name.index(self.name_x) - if self.smoothing_mode == "fakerate": + if self.smoothing_mode in ["fakerate", "hybrid"]: h = self.transfer_variances(h, set_nominal=is_nominal) y_frf, y_frf_var = self.compute_fakeratefactor(h, smoothing=True, syst_variations=variations_frf) @@ -391,7 +388,7 @@ def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothi cval = hC.values(flow=flow) cvar = hC.variances(flow=flow) - if self.smoothen_application_region: + if self.smoothing_mode in ["hybrid"]: cval, cvar = self.smoothen_spectrum( hC, cval, @@ -407,7 +404,7 @@ def get_hist(self, h, is_nominal=False, variations_frf=False, 
variations_smoothi dvar = cvar[...,:,:] * y_frf[..., np.newaxis,np.newaxis] elif variations_frf: dvar = cval[..., np.newaxis,np.newaxis] * y_frf_var[...,:,:] - elif self.smoothen_application_region: + elif self.smoothing_mode in ["hybrid"]: # everything is smoothed, no additional bin by bin statistical uncertainty dvar = np.zeros_like(d) else: From cfd3dc99c7fd29ceb8d0c3ffac4a4a5dd4d89d42 Mon Sep 17 00:00:00 2001 From: David Date: Wed, 14 Aug 2024 15:14:52 +0200 Subject: [PATCH 10/19] Small refinements for plotting scripts --- scripts/plotting/controlPlotsHDF5.py | 3 ++- scripts/plotting/makeDataMCStackPlot.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/plotting/controlPlotsHDF5.py b/scripts/plotting/controlPlotsHDF5.py index 424a99dd6..b429d377d 100644 --- a/scripts/plotting/controlPlotsHDF5.py +++ b/scripts/plotting/controlPlotsHDF5.py @@ -36,6 +36,7 @@ parser.add_argument("--hists", type=str, nargs='*', default=None, help="List of hists to plot; dash separated for unrolled hists") parser.add_argument("--normToData", action='store_true', help="Normalize MC to data") +parser.add_argument("--dataName", type=str, default="Data", help="Data name for plot labeling") # variations parser.add_argument("--varName", type=str, nargs='*', default=[], help="Name of variation hist") @@ -175,7 +176,7 @@ def make_plot(hists_proc, hist_data, hists_syst_up, hists_syst_dn, axes_names, yerr=True, histtype="errorbar", color="black", - label="Data", + label=args.dataName, binwnorm=binwnorm, ax=ax1, alpha=1., diff --git a/scripts/plotting/makeDataMCStackPlot.py b/scripts/plotting/makeDataMCStackPlot.py index 2b162da13..2a50de350 100644 --- a/scripts/plotting/makeDataMCStackPlot.py +++ b/scripts/plotting/makeDataMCStackPlot.py @@ -43,7 +43,7 @@ parser.add_argument("--presel", type=str, nargs="*", default=[], help="Specify custom selections on input histograms to integrate some axes, giving axis name and min,max (e.g. '--presel pt=ptmin,ptmax' ) or just axis name for bool axes") parser.add_argument("--normToData", action='store_true', help="Normalize MC to data") parser.add_argument("--fakeEstimation", type=str, help="Set the mode for the fake estimation", default="extended1D", choices=["simple", "extrapolate", "extended1D", "extended2D"]) -parser.add_argument("--fakeSmoothingMode", type=str, default="full", choices=["binned", "fakerate", "full"], help="Smoothing mode for fake estimate.") +parser.add_argument("--fakeSmoothingMode", type=str, default="full", choices=["binned", "fakerate", "hybrid", "full"], help="Smoothing mode for fake estimate.") parser.add_argument("--fakeMCCorr", type=str, default=[None], nargs="*", choices=["none", "pt", "eta", "mt"], help="axes to apply nonclosure correction from QCD MC. 
Leave empty for inclusive correction, use'none' for no correction") parser.add_argument("--forceGlobalScaleFakes", default=None, type=float, help="Scale the fakes by this factor (overriding any custom one implemented in datagroups.py in the fakeSelector).") parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the fake rate or full prediction, depending on the smoothing mode") From eb802555d4e8a80c16ac6b95e6767a1f15813d96 Mon Sep 17 00:00:00 2001 From: Josh Bendavid Date: Wed, 14 Aug 2024 21:32:54 -0400 Subject: [PATCH 11/19] update to slightly finer binning for fakerate smoothing --- utilities/common.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/utilities/common.py b/utilities/common.py index 66633067c..f774dd457 100644 --- a/utilities/common.py +++ b/utilities/common.py @@ -117,12 +117,15 @@ def get_binning_fakes_pt(min_pt, max_pt): edges = np.arange(min_pt,32,1) - edges = np.append(edges, [e for e in [33,36,40,46,56] if e Date: Wed, 14 Aug 2024 21:36:28 -0400 Subject: [PATCH 12/19] some updates to spline smoothing (not used), make hybrid smoothing the default and add systematic and statistical uncertainties corresponding toe fakerate smoothing with binned application region --- scripts/combine/setupCombine.py | 40 ++++++++++++++----- wremnants/histselections.py | 69 ++++++++++++++++++++++++--------- 2 files changed, 81 insertions(+), 28 deletions(-) diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index 3015d1bba..3346f5fe3 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -82,7 +82,7 @@ def make_parser(parser=None): parser.add_argument("--noMCStat", action='store_true', help="Do not include MC stat uncertainty in covariance for theory fit (only when using --fitresult)") parser.add_argument("--fakerateAxes", nargs="+", help="Axes for the fakerate binning", default=["eta","pt","charge"]) parser.add_argument("--fakeEstimation", type=str, help="Set the mode for the fake estimation", default="extended1D", choices=["closure", "simple", "extrapolate", "extended1D", "extended2D"]) - parser.add_argument("--fakeSmoothingMode", type=str, default="full", choices=["binned", "fakerate", "hybrid", "full"], help="Smoothing mode for fake estimate.") + parser.add_argument("--fakeSmoothingMode", type=str, default="hybrid", choices=["binned", "fakerate", "hybrid", "full"], help="Smoothing mode for fake estimate.") parser.add_argument("--forceGlobalScaleFakes", default=None, type=float, help="Scale the fakes by this factor (overriding any custom one implemented in datagroups.py in the fakeSelector).") parser.add_argument("--fakeMCCorr", type=str, default=[None], nargs="*", choices=["none", "pt", "eta", "mt"], help="axes to apply nonclosure correction from QCD MC. 
Leave empty for inclusive correction, use'none' for no correction") parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the fake rate or full prediction, depending on the smoothing mode") @@ -138,7 +138,7 @@ def make_parser(parser=None): parser.add_argument("--massConstraintMode", choices=["automatic", "constrained", "unconstrained"], default="automatic", help="Whether mass is constrained within PDG value and uncertainty or unconstrained in the fit") parser.add_argument("--decorMassWidth", action='store_true', help="remove width variations from mass variations") parser.add_argument("--muRmuFPolVar", action="store_true", help="Use polynomial variations (like in theoryAgnosticPolVar) instead of binned variations for muR and muF (of course in setupCombine these are still constrained nuisances)") - parser.add_argument("--binByBinStatScaleForMW", type=float, default=1.125, help="scaling of bin by bin statistical uncertainty for W mass analysis") + parser.add_argument("--binByBinStatScaleForMW", type=float, default=1.10, help="scaling of bin by bin statistical uncertainty for W mass analysis") parser.add_argument("--exponentialTransform", action='store_true', help="apply exponential transformation to yields (useful for gen-level fits to helicity cross sections for example)") parser = make_subparsers(parser) @@ -764,13 +764,32 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): action=fakeselector.get_hist, systAxes=[f"_{x}" for x in syst_axes if x in args.fakerateAxes]+["_param", "downUpVar"]) if args.fakeSmoothingMode in ["hybrid", "full"]: - subgroup = f"{cardTool.getFakeName()}Smoothing" - cardTool.addSystematic(**info, - rename=subgroup, - splitGroup = {subgroup: f".*", "experiment": ".*"}, - systNamePrepend=subgroup, - actionArgs=dict(variations_smoothing=True), + if False: + # these uncertainties are covered by the binByBin stat uncertainties + # for the moment + subgroup = f"{cardTool.getFakeName()}Smoothing" + cardTool.addSystematic(**info, + rename=subgroup, + splitGroup = {subgroup: f".*", "experiment": ".*"}, + systNamePrepend=subgroup, + actionArgs=dict(variations_smoothing=True), + ) + + subgroup = f"{cardTool.getFakeName()}SmoothingSyst" + cardTool.addSystematic(name=inputBaseName, + group="Fake", + processes=cardTool.getFakeName(), + noConstraint=False, + mirror=True, + scale=1., + applySelection=False, + action = fakeselector.get_smoothing_syst, + rename=subgroup, + splitGroup = {subgroup: f".*", "experiment": ".*"}, + systNamePrepend=subgroup, + systAxes = ["var"], ) + if args.fakeSmoothingMode in ["fakerate", "hybrid"]: subgroup = f"{cardTool.getFakeName()}Rate" cardTool.addSystematic(**info, @@ -789,7 +808,7 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): actionArgs=dict(variations_scf=True), ) - if args.fakeSmoothingMode == "full" and args.fakeSmoothingOrder > 0: + if args.fakeSmoothingMode in ["hybrid", "full"] and args.fakeSmoothingOrder > 0: # add systematic of explicit parameter variation fakeSmoothingOrder = args.fakeSmoothingOrder def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): @@ -907,7 +926,8 @@ def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): systNameReplace=[("effSystTnP", "effSyst"), ("etaDecorr0", "fullyCorr")], scale=scale, ) - if (wmass and not input_tools.args_from_metadata(cardTool, "noVetoSF")) or wlike_vetoValidation: + # if (wmass and not input_tools.args_from_metadata(cardTool, "noVetoSF")) or wlike_vetoValidation: + if (wmass and not 
False) or wlike_vetoValidation: useGlobalOrTrackerVeto = input_tools.args_from_metadata(cardTool, "useGlobalOrTrackerVeto") useRefinedVeto = input_tools.args_from_metadata(cardTool, "useRefinedVeto") allEffTnP_veto = ["effStatTnP_veto_sf", "effSystTnP_veto"] diff --git a/wremnants/histselections.py b/wremnants/histselections.py index 7b2e21d3a..ae6f862eb 100644 --- a/wremnants/histselections.py +++ b/wremnants/histselections.py @@ -101,7 +101,7 @@ def spline_smooth_nominal(binvals, edges, edges_out, axis): y = cdf xout = edges_out - spline = scipy.interpolate.make_interp_spline(x, y, axis=axis, bc_type=None) + spline = interpolate.make_interp_spline(x, y, axis=axis, bc_type=None) yout = spline(xout) binvalsout = np.diff(yout, axis=axis) @@ -276,12 +276,12 @@ def __init__(self, h, *args, if self.smoothing_mode in ["fakerate", "hybrid", "full"]: self.spectrum_regressor = Regressor( - "monotonic", + "monotonic", # "power", smoothing_order_spectrum, min_x=self.smoothing_axis_min, max_x=self.smoothing_axis_max, - nnls = False, + nnls = self.smoothing_mode not in ["full"], # constraint is handled elsewhere in the full smoothing case ) else: self.spectrum_regressor = None @@ -299,7 +299,7 @@ def __init__(self, h, *args, # histogram with nonclosure corrections self.hCorr = None - # swap the A and C regions for better numerical behaviour (only implemented for fakerate smoothing) + # swap the A and C regions for better numerical behaviour (only implemented for fakerate and hybrid smoothing) self.swap_regions = False def set_correction(self, hQCD, axes_names=False, mirror_axes=["eta"], flow=True): @@ -370,7 +370,20 @@ def transfer_variances(self, h, set_nominal=False): raise RuntimeError(f"Failed to transfer variances") return h - def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothing=False, flow=True): + def get_smoothing_syst(self, h): + #TODO this might not be safe in certain future parallelization schemes + smoothing_mode_old = self.smoothing_mode + self.smoothing_mode = "fakerate" + halt = self.get_hist(h) + self.smoothing_mode = smoothing_mode_old + + axis_var=hist.axis.Integer(0,1, underflow=False, overflow=False, name="var") + hout = hist.Hist(*halt.axes, axis_var) + hout[{"var" : 0}] = halt.values(flow=True) + + return hout + + def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothing=False, flow=True, use_spline=False): idx_x = h.axes.name.index(self.name_x) if self.smoothing_mode in ["fakerate", "hybrid"]: h = self.transfer_variances(h, set_nominal=is_nominal) @@ -386,15 +399,22 @@ def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothi else: hC = self.get_hist_passX_failY(h) - cval = hC.values(flow=flow) - cvar = hC.variances(flow=flow) + if use_spline and self.smoothing_mode in ["hybrid"]: + hCNew = hC[{self.smoothing_axis_name : hist.rebin(3)}] + else: + hCNew = hC + + cval = hCNew.values(flow=flow) + cvar = hCNew.variances(flow=flow) + cvar_binned = cvar if self.smoothing_mode in ["hybrid"]: cval, cvar = self.smoothen_spectrum( hC, + hCNew.axes[self.smoothing_axis_name].edges, cval, cvar, syst_variations=variations_smoothing, - use_spline=False, + use_spline=use_spline, flow=flow, ) @@ -405,15 +425,16 @@ def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothi elif variations_frf: dvar = cval[..., np.newaxis,np.newaxis] * y_frf_var[...,:,:] elif self.smoothing_mode in ["hybrid"]: - # everything is smoothed, no additional bin by bin statistical uncertainty - dvar = np.zeros_like(d) + # 
keep statistical uncertainty from c region since we'll propagate that + # as a systematic + dvar = y_frf**2 * cvar_binned else: # only take bin by bin uncertainty from c region dvar = y_frf**2 * cvar elif self.smoothing_mode == "full": h = self.transfer_variances(h, set_nominal=is_nominal) - d, dvar = self.calculate_fullABCD_smoothed(h, flow=flow, syst_variations=variations_smoothing) + d, dvar = self.calculate_fullABCD_smoothed(h, flow=flow, syst_variations=variations_smoothing, use_spline=True) elif self.smoothing_mode == "binned": # no smoothing of rates d, dvar = self.calculate_fullABCD(h, flow=flow) @@ -550,7 +571,7 @@ def smoothen(self, h, x, y, y_var, regressor, syst_variations=False, reduce=Fals return y_smooth_orig, y_smooth_var_orig - def smoothen_spectrum(self, h, sval, svar, syst_variations=False, use_spline=False, reduce=False, flow=True): + def smoothen_spectrum(self, h, edges, sval, svar, syst_variations=False, use_spline=False, reduce=False, flow=True): smoothidx = [n for n in h.axes.name if n not in [self.name_x, self.name_y]].index(self.smoothing_axis_name) smoothing_axis = h.axes[self.smoothing_axis_name] nax = sval.ndim @@ -571,7 +592,12 @@ def smoothen_spectrum(self, h, sval, svar, syst_variations=False, use_spline=Fal svar = svar[*sel] if use_spline: - sval, svar = spline_smooth(sval, edges = smoothing_axis.edges, edges_out = h.axes[self.smoothing_axis_name].edges, axis=smoothidx, binvars=svar, syst_variations=syst_variations) + if reduce: + raise NotImplementedError("splines with reduction over regions not implemented yet.") + + print("splines") + + sval, svar = spline_smooth(sval, edges = edges, edges_out = h.axes[self.smoothing_axis_name].edges, axis=smoothidx, binvars=svar, syst_variations=syst_variations) else: xwidth = h.axes[self.smoothing_axis_name].widths @@ -587,6 +613,7 @@ def smoothen_spectrum(self, h, sval, svar, syst_variations=False, use_spline=Fal logd = np.where(goodbin, np.log(sval), 0.) 
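# [Editor's note - illustrative only, not part of this patch. The spectrum is smoothed in
#  log space, so the bin variances are propagated with the delta method,
#  var(log y) ~ var(y) / y**2, which is what the next line computes. A quick toy check of
#  that approximation with Poisson-distributed counts; the mean is chosen arbitrarily:]
import numpy as np
rng = np.random.default_rng(1)
mu = 50.0
counts = rng.poisson(mu, size=100000).astype(float)
counts = counts[counts > 0]        # log() needs strictly positive counts
print(np.var(np.log(counts)))      # empirical variance of log(counts), close to 0.02
print(1.0 / mu)                    # delta-method prediction: var(y)/y**2 = 1/mu for Poisson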
logdvar = np.where(goodbin, svar/sval**2, np.inf) + x = self.get_bin_centers_smoothing(h, flow=flow) # the bins where the smoothing is performed (can be different to the bins in h) sval, svar = self.smoothen( @@ -686,12 +713,17 @@ def calculate_fullABCD_smoothed(self, h, syst_variations=False, use_spline=False # sum high mT bins h = hh.rebinHist(h, self.name_x, h.axes[self.name_x].edges[:3]) + if use_spline: + hNew = h[{self.smoothing_axis_name : hist.rebin(3)}] + else: + hNew = h + # get values and variances of all sideband regions (this assumes signal region is at high abcd-x and low abcd-y axis bins) - sval = h.values(flow=flow) - svar = h.variances(flow=flow) + sval = hNew.values(flow=flow) + svar = hNew.variances(flow=flow) # move abcd axes last - idx_x = h.axes.name.index(self.name_x) - idx_y = h.axes.name.index(self.name_y) + idx_x = hNew.axes.name.index(self.name_x) + idx_y = hNew.axes.name.index(self.name_y) sval = np.moveaxis(sval, [idx_x, idx_y], [-2, -1]) svar = np.moveaxis(svar, [idx_x, idx_y], [-2, -1]) @@ -710,6 +742,7 @@ def calculate_fullABCD_smoothed(self, h, syst_variations=False, use_spline=False return self.smoothen_spectrum( h, + hNew.axes[self.smoothing_axis_name].edges, sval, svar, syst_variations=syst_variations, @@ -981,7 +1014,7 @@ def get_hist(self, h, is_nominal=False, variations_scf=False, variations_smoothi dvar = dvar.sum(axis=idx_x) elif self.smoothing_mode == "full": h = self.transfer_variances(h, set_nominal=is_nominal) - d, dvar = self.calculate_fullABCD_smoothed(h, flow=flow, syst_variations=variations_smoothing) + d, dvar = self.calculate_fullABCD_smoothed(h, flow=flow, syst_variations=variations_smoothing, use_spline=False) elif self.smoothing_mode == "binned": # no smoothing of rates d, dvar = self.calculate_fullABCD(h) From 41f11a3cdd0ceca447806be7f2fe878a2c38a94f Mon Sep 17 00:00:00 2001 From: Josh Bendavid Date: Thu, 15 Aug 2024 07:42:58 -0400 Subject: [PATCH 13/19] further update to binning for fakerate smoothing --- utilities/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utilities/common.py b/utilities/common.py index f774dd457..673998186 100644 --- a/utilities/common.py +++ b/utilities/common.py @@ -117,7 +117,7 @@ def get_binning_fakes_pt(min_pt, max_pt): edges = np.arange(min_pt,32,1) - edges = np.append(edges, [e for e in [35,38,41,44,47,50,53,56] if e Date: Thu, 15 Aug 2024 07:43:20 -0400 Subject: [PATCH 14/19] update default smoothing mode to hybrid also for plotting script --- scripts/plotting/makeDataMCStackPlot.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/plotting/makeDataMCStackPlot.py b/scripts/plotting/makeDataMCStackPlot.py index 2a50de350..3eb9f26ee 100644 --- a/scripts/plotting/makeDataMCStackPlot.py +++ b/scripts/plotting/makeDataMCStackPlot.py @@ -43,7 +43,7 @@ parser.add_argument("--presel", type=str, nargs="*", default=[], help="Specify custom selections on input histograms to integrate some axes, giving axis name and min,max (e.g. 
'--presel pt=ptmin,ptmax' ) or just axis name for bool axes") parser.add_argument("--normToData", action='store_true', help="Normalize MC to data") parser.add_argument("--fakeEstimation", type=str, help="Set the mode for the fake estimation", default="extended1D", choices=["simple", "extrapolate", "extended1D", "extended2D"]) -parser.add_argument("--fakeSmoothingMode", type=str, default="full", choices=["binned", "fakerate", "hybrid", "full"], help="Smoothing mode for fake estimate.") +parser.add_argument("--fakeSmoothingMode", type=str, default="hybrid", choices=["binned", "fakerate", "hybrid", "full"], help="Smoothing mode for fake estimate.") parser.add_argument("--fakeMCCorr", type=str, default=[None], nargs="*", choices=["none", "pt", "eta", "mt"], help="axes to apply nonclosure correction from QCD MC. Leave empty for inclusive correction, use'none' for no correction") parser.add_argument("--forceGlobalScaleFakes", default=None, type=float, help="Scale the fakes by this factor (overriding any custom one implemented in datagroups.py in the fakeSelector).") parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the fake rate or full prediction, depending on the smoothing mode") From ffc0812592e36487f2b0472bc92915be3a8a3c2c Mon Sep 17 00:00:00 2001 From: Josh Bendavid Date: Thu, 15 Aug 2024 07:44:22 -0400 Subject: [PATCH 15/19] disable splines for full smoothing (was left enabled by accident) --- wremnants/histselections.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wremnants/histselections.py b/wremnants/histselections.py index ae6f862eb..ae7c545de 100644 --- a/wremnants/histselections.py +++ b/wremnants/histselections.py @@ -434,7 +434,7 @@ def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothi elif self.smoothing_mode == "full": h = self.transfer_variances(h, set_nominal=is_nominal) - d, dvar = self.calculate_fullABCD_smoothed(h, flow=flow, syst_variations=variations_smoothing, use_spline=True) + d, dvar = self.calculate_fullABCD_smoothed(h, flow=flow, syst_variations=variations_smoothing, use_spline=False) elif self.smoothing_mode == "binned": # no smoothing of rates d, dvar = self.calculate_fullABCD(h, flow=flow) From dea960449c209ffc423f14099a7b2f792130588e Mon Sep 17 00:00:00 2001 From: Josh Bendavid Date: Thu, 15 Aug 2024 07:44:46 -0400 Subject: [PATCH 16/19] revert to power basis without monotonic constraint for application region smoothing --- wremnants/histselections.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/wremnants/histselections.py b/wremnants/histselections.py index ae7c545de..7969088eb 100644 --- a/wremnants/histselections.py +++ b/wremnants/histselections.py @@ -276,12 +276,13 @@ def __init__(self, h, *args, if self.smoothing_mode in ["fakerate", "hybrid", "full"]: self.spectrum_regressor = Regressor( - "monotonic", - # "power", + # "monotonic", + "power", smoothing_order_spectrum, min_x=self.smoothing_axis_min, max_x=self.smoothing_axis_max, - nnls = self.smoothing_mode not in ["full"], # constraint is handled elsewhere in the full smoothing case + # nnls = self.smoothing_mode not in ["full"], # constraint is handled elsewhere in the full smoothing case + nnls = False, ) else: self.spectrum_regressor = None From 3034dfed8cf7d794dab27fabd17215f4c1b99978 Mon Sep 17 00:00:00 2001 From: Josh Bendavid Date: Thu, 15 Aug 2024 08:20:05 -0400 Subject: [PATCH 17/19] revert accidental change to extra options for vetoSF tests --- 
scripts/combine/setupCombine.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index 3346f5fe3..6885ce03b 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -926,8 +926,7 @@ def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): systNameReplace=[("effSystTnP", "effSyst"), ("etaDecorr0", "fullyCorr")], scale=scale, ) - # if (wmass and not input_tools.args_from_metadata(cardTool, "noVetoSF")) or wlike_vetoValidation: - if (wmass and not False) or wlike_vetoValidation: + if (wmass and not input_tools.args_from_metadata(cardTool, "noVetoSF")) or wlike_vetoValidation: useGlobalOrTrackerVeto = input_tools.args_from_metadata(cardTool, "useGlobalOrTrackerVeto") useRefinedVeto = input_tools.args_from_metadata(cardTool, "useRefinedVeto") allEffTnP_veto = ["effStatTnP_veto_sf", "effSystTnP_veto"] From 199f3ea8149e329ee703aadf1945e30b33cd06e1 Mon Sep 17 00:00:00 2001 From: David Date: Thu, 15 Aug 2024 16:58:36 +0200 Subject: [PATCH 18/19] Fix bug in error propagation for fakerate in extended1D; swap regions for fakerate (and hybrid) smoothing to obtain better smoothing p-values; set fakerate smoothing order to 3 by default; add global parameter variations as additional uncertainties on fakes --- scripts/combine/setupCombine.py | 45 +++++++++++++------------ scripts/plotting/controlPlotsHDF5.py | 2 +- scripts/plotting/makeDataMCStackPlot.py | 2 +- scripts/plotting/postfitPlots.py | 2 +- wremnants/CardTool.py | 13 ++++--- wremnants/datasets/datagroups.py | 2 +- wremnants/histselections.py | 6 ++-- wremnants/regression.py | 4 +-- 8 files changed, 41 insertions(+), 35 deletions(-) diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index 6885ce03b..3dd478015 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -85,7 +85,7 @@ def make_parser(parser=None): parser.add_argument("--fakeSmoothingMode", type=str, default="hybrid", choices=["binned", "fakerate", "hybrid", "full"], help="Smoothing mode for fake estimate.") parser.add_argument("--forceGlobalScaleFakes", default=None, type=float, help="Scale the fakes by this factor (overriding any custom one implemented in datagroups.py in the fakeSelector).") parser.add_argument("--fakeMCCorr", type=str, default=[None], nargs="*", choices=["none", "pt", "eta", "mt"], help="axes to apply nonclosure correction from QCD MC. Leave empty for inclusive correction, use'none' for no correction") - parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the fake rate or full prediction, depending on the smoothing mode") + parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the application region or full prediction, depending on the smoothing mode") parser.add_argument("--simultaneousABCD", action="store_true", help="Produce datacard for simultaneous fit of ABCD regions") parser.add_argument("--skipSumGroups", action="store_true", help="Don't add sum groups to the output to save time e.g. 
when computing impacts") parser.add_argument("--allowNegativeExpectation", action="store_true", help="Allow processes to have negative contributions") @@ -377,7 +377,7 @@ def setup(args, inputFile, inputBaseName, inputLumiScale, fitvar, genvar=None, x pseudodataGroups.fakerate_axes=args.fakerateAxes pseudodataGroups.set_histselectors( pseudodataGroups.getNames(), inputBaseName, mode=args.fakeEstimation, - smoothing_mode=args.fakeSmoothingMode, smoothingOrderFakerate=args.fakeSmoothingOrder, + smoothing_mode=args.fakeSmoothingMode, smoothingOrderSpectrum=args.fakeSmoothingOrder, mcCorr=args.fakeMCCorr, integrate_x="mt" not in fitvar, simultaneousABCD=simultaneousABCD, forceGlobalScaleFakes=args.forceGlobalScaleFakes, @@ -394,7 +394,7 @@ def setup(args, inputFile, inputBaseName, inputLumiScale, fitvar, genvar=None, x pseudodataGroups.copyGroup("QCD", "QCDTruth") pseudodataGroups.set_histselectors( pseudodataGroups.getNames(), inputBaseName, mode=args.fakeEstimation, fake_processes=["QCD",], - smoothing_mode=args.fakeSmoothingMode, smoothingOrderFakerate=args.fakeSmoothingOrder, + smoothing_mode=args.fakeSmoothingMode, smoothingOrderSpectrum=args.fakeSmoothingOrder, mcCorr=args.fakeMCCorr, simultaneousABCD=simultaneousABCD, forceGlobalScaleFakes=args.forceGlobalScaleFakes, ) @@ -811,10 +811,10 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): if args.fakeSmoothingMode in ["hybrid", "full"] and args.fakeSmoothingOrder > 0: # add systematic of explicit parameter variation fakeSmoothingOrder = args.fakeSmoothingOrder - def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): + def fake_nonclosure(h, axesToDecorrNames, param_idx=1, variation_size=0.5, *args, **kwargs): # apply variation by adding parameter value (assumes log space, e.g. 
in full smoothing) fakeselector.spectrum_regressor.external_params = np.zeros(fakeSmoothingOrder + 1) - fakeselector.spectrum_regressor.external_params[1] = 0.5 + fakeselector.spectrum_regressor.external_params[param_idx] = variation_size hvar = fakeselector.get_hist(h, *args, **kwargs) # reset external parameters fakeselector.spectrum_regressor.external_params = None @@ -837,23 +837,24 @@ def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs): return hvar - for axesToDecorrNames in [[], ]:#["eta"]]: - subgroup = f"{cardTool.getFakeName()}Param1" - cardTool.addSystematic( - name=inputBaseName, - group="Fake", - rename=subgroup+ (f"_{'_'.join(axesToDecorrNames)}" if len(axesToDecorrNames) else ""), - splitGroup = {subgroup: f".*", "experiment": ".*"}, - systNamePrepend=subgroup, - processes=cardTool.getFakeName(), - noConstraint=False, - mirror=True, - scale=1, - applySelection=False, # don't apply selection, external parameters need to be added - action=fake_nonclosure, - actionArgs=dict(axesToDecorrNames=axesToDecorrNames), - systAxes=["var"] if len(axesToDecorrNames)==0 else [f"{n}_decorr" for n in axesToDecorrNames], - ) + for axesToDecorrNames in [[], ]: + for idx, mag in [(1,0.2),(2,0.2),(3,0.2)]: + subgroup = f"{cardTool.getFakeName()}Param{idx}" + cardTool.addSystematic( + name=inputBaseName, + group="Fake", + rename=subgroup+ (f"_{'_'.join(axesToDecorrNames)}" if len(axesToDecorrNames) else ""), + splitGroup = {subgroup: f".*", "experiment": ".*"}, + systNamePrepend=subgroup, + processes=cardTool.getFakeName(), + noConstraint=False, + mirror=True, + scale=1, + applySelection=False, # don't apply selection, external parameters need to be added + action=fake_nonclosure, + actionArgs=dict(axesToDecorrNames=axesToDecorrNames, param_idx=idx, variation_size=mag), + systAxes=["var"] if len(axesToDecorrNames)==0 else [f"{n}_decorr" for n in axesToDecorrNames], + ) if not args.noEfficiencyUnc: diff --git a/scripts/plotting/controlPlotsHDF5.py b/scripts/plotting/controlPlotsHDF5.py index b429d377d..08f06d828 100644 --- a/scripts/plotting/controlPlotsHDF5.py +++ b/scripts/plotting/controlPlotsHDF5.py @@ -286,7 +286,7 @@ def make_plot(hists_proc, hist_data, hists_syst_up, hists_syst_dn, axes_names, # offset_text.set_position((-0.08,1.02)) if args.cmsDecor: - lumi = float(f"{channel_info['lumi']:.3g}") if not density else None + lumi = float(f"{channel_info['lumi']:.3g}") if not density and args.dataName=="Data" else None hep.cms.label(ax=ax1, lumi=lumi, fontsize=legtext_size*scale, label= args.cmsDecor, data=hist_data is not None) diff --git a/scripts/plotting/makeDataMCStackPlot.py b/scripts/plotting/makeDataMCStackPlot.py index 3eb9f26ee..ff7f806e3 100644 --- a/scripts/plotting/makeDataMCStackPlot.py +++ b/scripts/plotting/makeDataMCStackPlot.py @@ -46,7 +46,7 @@ parser.add_argument("--fakeSmoothingMode", type=str, default="hybrid", choices=["binned", "fakerate", "hybrid", "full"], help="Smoothing mode for fake estimate.") parser.add_argument("--fakeMCCorr", type=str, default=[None], nargs="*", choices=["none", "pt", "eta", "mt"], help="axes to apply nonclosure correction from QCD MC. 
Leave empty for inclusive correction, use'none' for no correction") parser.add_argument("--forceGlobalScaleFakes", default=None, type=float, help="Scale the fakes by this factor (overriding any custom one implemented in datagroups.py in the fakeSelector).") -parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the fake rate or full prediction, depending on the smoothing mode") +parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the application region or full prediction, depending on the smoothing mode") parser.add_argument("--fakerateAxes", nargs="+", help="Axes for the fakerate binning", default=["eta","pt","charge"]) parser.add_argument("--fineGroups", action='store_true', help="Plot each group as a separate process, otherwise combine groups based on predefined dictionary") diff --git a/scripts/plotting/postfitPlots.py b/scripts/plotting/postfitPlots.py index 1743ecb72..3591ef738 100644 --- a/scripts/plotting/postfitPlots.py +++ b/scripts/plotting/postfitPlots.py @@ -210,7 +210,7 @@ def make_plot(h_data, h_inclusive, h_stack, axes, colors=None, labels=None, suff plot_tools.redo_axis_ticks(ax1, "x") plot_tools.redo_axis_ticks(ax2, "x") - hep.cms.label(ax=ax1, lumi=float(f"{lumi:.3g}") if lumi is not None else None, + hep.cms.label(ax=ax1, lumi=float(f"{lumi:.3g}") if lumi is not None and args.dataName=="Data" else None, fontsize=fontsize, label=args.cmsDecor, data=data) diff --git a/wremnants/CardTool.py b/wremnants/CardTool.py index abc78075c..077d8608c 100644 --- a/wremnants/CardTool.py +++ b/wremnants/CardTool.py @@ -847,12 +847,17 @@ def loadPseudodata(self, forceNonzero=False): fakeselector = self.datagroups.getDatagroups()[self.getFakeName()].histselector - _0, _1, params_d, cov_d, _chi2_d, _ndf_d = fakeselector.calculate_fullABCD_smoothed(hist_fake, auxiliary_info=True, signal_region=True) + _0, _1 = fakeselector.calculate_fullABCD_smoothed(hist_fake, signal_region=True) + params_d = fakeselector.spectrum_regressor.params + cov_d = fakeselector.spectrum_regressor.cov + hist_fake = hh.scaleHist(hist_fake, fakeselector.global_scalefactor) - _0, _1, params, cov, _chi2, _ndf = fakeselector.calculate_fullABCD_smoothed(hist_fake, auxiliary_info=True) + _0, _1 = fakeselector.calculate_fullABCD_smoothed(hist_fake) + params = fakeselector.spectrum_regressor.params + cov = fakeselector.spectrum_regressor.cov # add the nonclosure by adding the difference of the parameters - fakeselector.external_params = params_d - params + fakeselector.spectrum_regressor.external_params = params_d - params # load the pseudodata including the nonclosure self.datagroups.loadHistsForDatagroups( baseName=self.nominalName, syst=self.nominalName, label=pseudoData, @@ -866,7 +871,7 @@ def loadPseudodata(self, forceNonzero=False): hdatas.append(hdata) # remove the parameter offset again - fakeselector.external_params = None + fakeselector.spectrum_regressor.external_params = None # add the covariance matrix from the nonclosure to the model fakeselector.external_cov = cov + cov_d diff --git a/wremnants/datasets/datagroups.py b/wremnants/datasets/datagroups.py index 62aa56840..ba9c0ef25 100644 --- a/wremnants/datasets/datagroups.py +++ b/wremnants/datasets/datagroups.py @@ -202,7 +202,7 @@ def set_histselectors( fake_processes=None, mode="extended1D", smoothing_mode="full", - smoothingOrderFakerate=2, + smoothingOrderFakerate=3, smoothingOrderSpectrum=3, integrate_shapecorrection_x=True, # 
integrate the abcd x-axis or not, only relevant for extended2D simultaneousABCD=False, diff --git a/wremnants/histselections.py b/wremnants/histselections.py index 7969088eb..aab5e5b6d 100644 --- a/wremnants/histselections.py +++ b/wremnants/histselections.py @@ -247,7 +247,7 @@ class FakeSelectorSimpleABCD(HistselectorABCD): # simple ABCD method def __init__(self, h, *args, smoothing_mode="full", - smoothing_order_fakerate=2, + smoothing_order_fakerate=3, smoothing_order_spectrum=3, throw_toys=None, #"normal", # None, 'normal' or 'poisson' global_scalefactor=1, # apply global correction factor on prediction @@ -301,7 +301,7 @@ def __init__(self, h, *args, self.hCorr = None # swap the A and C regions for better numerical behaviour (only implemented for fakerate and hybrid smoothing) - self.swap_regions = False + self.swap_regions = True def set_correction(self, hQCD, axes_names=False, mirror_axes=["eta"], flow=True): # hQCD is QCD MC histogram before selection (should contain variances) @@ -840,7 +840,7 @@ def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow axvar = hax.variances(flow=flow) bvar = hb.variances(flow=flow) bxvar = hbx.variances(flow=flow) - y_var = b**4/(bx**2*a**4)*axvar + ax**2*b**2/(bx**2*a**4)*4*bvar + 4*avar/a**2 + ax**2*b**4/(bx**4*a**4)*bxvar + y_var = b**4/(bx**2*a**4)*axvar + ax**2*b**2/(bx**2*a**4)*4*bvar + y**2*4*avar/a**2 + y**2*bxvar/bx**2 y_var[y_den <= 1] = 1e10 if self.hCorr: diff --git a/wremnants/regression.py b/wremnants/regression.py index 671fb8cd2..9ac2d2f4e 100644 --- a/wremnants/regression.py +++ b/wremnants/regression.py @@ -21,8 +21,8 @@ def compute_chi2(y, y_pred, w=None, nparams=1): ndf_total = y.size - chi2.size*nparams logger.info(f"Total chi2/ndf = {chi2_total}/{ndf_total} = {chi2_total/ndf_total} (p = {stats.chi2.sf(chi2_total, ndf_total)})") - logger.info(f"Min chi2 = {chi2.min()} (p = {stats.chi2.sf(chi2.min(), ndf)})") - logger.info(f"Max chi2 = {chi2.max()} (p = {stats.chi2.sf(chi2.max(), ndf)})") + logger.info(f"Min chi2/ndf = {chi2.min()}/{ndf} (p = {stats.chi2.sf(chi2.min(), ndf)})") + logger.info(f"Max chi2/ndf = {chi2.max()}/{ndf} (p = {stats.chi2.sf(chi2.max(), ndf)})") logger.info(f"Mean chi2 = {chi2.mean()}") logger.info(f"Std chi2 = {chi2.std()}") return chi2, ndf From 8856b02053e3a10f27dc2e43aaf595653a53bc36 Mon Sep 17 00:00:00 2001 From: David Date: Thu, 15 Aug 2024 21:42:22 +0200 Subject: [PATCH 19/19] Transform x for power polynomials to [-1,1]; change fake uncertainties; disable swapping bins in case abcd-x axis is not integrated (for mt plotting in CI) --- scripts/combine/setupCombine.py | 32 +++++++------------------------- wremnants/histselections.py | 17 +++++++++++------ wremnants/regression.py | 21 ++++++++++++++------- 3 files changed, 32 insertions(+), 38 deletions(-) diff --git a/scripts/combine/setupCombine.py b/scripts/combine/setupCombine.py index 3dd478015..2c883f276 100644 --- a/scripts/combine/setupCombine.py +++ b/scripts/combine/setupCombine.py @@ -130,7 +130,7 @@ def make_parser(parser=None): parser.add_argument("--pseudoDataIdxs", type=str, nargs="+", default=[None], help="Variation indices to use as pseudodata for each of the histograms") parser.add_argument("--pseudoDataFile", type=str, help="Input file for pseudodata (if it should be read from a different file)", default=None) parser.add_argument("--pseudoDataProcsRegexp", type=str, default=".*", help="Regular expression for processes taken from pseudodata file (all other processes are automatically got from the nominal file). 
Data is excluded automatically as usual") - parser.add_argument("--pseudoDataFakes", type=str, nargs="+", choices=["truthMC", "closure", "simple", "extrapolate", "extended1D", "extended2D", "dataClosure", "mcClosure", "simple-binned", "extended1D-fakerate"], + parser.add_argument("--pseudoDataFakes", type=str, nargs="+", choices=["truthMC", "closure", "simple", "extrapolate", "extended1D", "extended2D", "dataClosure", "mcClosure", "simple-binned", "extended1D-binned", "extended1D-fakerate"], help="Pseudodata for fakes are using QCD MC (closure), or different estimation methods (simple, extended1D, extended2D)") parser.add_argument("--addTauToSignal", action='store_true', help="Events from the same process but from tau final states are added to the signal") parser.add_argument("--noPDFandQCDtheorySystOnSignal", action='store_true', help="Removes PDF and theory uncertainties on signal processes") @@ -764,30 +764,12 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]): action=fakeselector.get_hist, systAxes=[f"_{x}" for x in syst_axes if x in args.fakerateAxes]+["_param", "downUpVar"]) if args.fakeSmoothingMode in ["hybrid", "full"]: - if False: - # these uncertainties are covered by the binByBin stat uncertainties - # for the moment - subgroup = f"{cardTool.getFakeName()}Smoothing" - cardTool.addSystematic(**info, - rename=subgroup, - splitGroup = {subgroup: f".*", "experiment": ".*"}, - systNamePrepend=subgroup, - actionArgs=dict(variations_smoothing=True), - ) - - subgroup = f"{cardTool.getFakeName()}SmoothingSyst" - cardTool.addSystematic(name=inputBaseName, - group="Fake", - processes=cardTool.getFakeName(), - noConstraint=False, - mirror=True, - scale=1., - applySelection=False, - action = fakeselector.get_smoothing_syst, - rename=subgroup, - splitGroup = {subgroup: f".*", "experiment": ".*"}, - systNamePrepend=subgroup, - systAxes = ["var"], + subgroup = f"{cardTool.getFakeName()}Smoothing" + cardTool.addSystematic(**info, + rename=subgroup, + splitGroup = {subgroup: f".*", "experiment": ".*"}, + systNamePrepend=subgroup, + actionArgs=dict(variations_smoothing=True), ) if args.fakeSmoothingMode in ["fakerate", "hybrid"]: diff --git a/wremnants/histselections.py b/wremnants/histselections.py index aab5e5b6d..ab4bbe29b 100644 --- a/wremnants/histselections.py +++ b/wremnants/histselections.py @@ -390,7 +390,9 @@ def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothi h = self.transfer_variances(h, set_nominal=is_nominal) y_frf, y_frf_var = self.compute_fakeratefactor(h, smoothing=True, syst_variations=variations_frf) - if self.swap_regions: + if not self.integrate_x and self.swap_regions: + logger.warning(f"Regions for fakerate estiamation can only be swapped if abcd-x axis is integrated") + if self.swap_regions and self.integrate_x: if type(self) == FakeSelectorSimpleABCD: # replace C region with B region hC = self.get_hist_failX_passY(h) @@ -426,9 +428,8 @@ def get_hist(self, h, is_nominal=False, variations_frf=False, variations_smoothi elif variations_frf: dvar = cval[..., np.newaxis,np.newaxis] * y_frf_var[...,:,:] elif self.smoothing_mode in ["hybrid"]: - # keep statistical uncertainty from c region since we'll propagate that - # as a systematic - dvar = y_frf**2 * cvar_binned + # noo bin by bin statistical uncertainty, all regions covered by smoothing + dvar = np.zeros_like(cvar) else: # only take bin by bin uncertainty from c region dvar = y_frf**2 * cvar @@ -472,7 +473,9 @@ def compute_fakeratefactor(self, h, smoothing=False, 
syst_variations=False, flow # select sideband regions ha = self.get_hist_failX_failY(hNew) - if self.swap_regions: + if not self.integrate_x and self.swap_regions: + logger.warning(f"Regions for fakerate estiamation can only be swapped if abcd-x axis is integrated") + if self.swap_regions and self.integrate_x: # replace B region with C region hb = self.get_hist_passX_failY(hNew) else: @@ -817,7 +820,9 @@ def compute_fakeratefactor(self, h, smoothing=False, syst_variations=False, flow hb = hNew[{self.name_x: self.sel_dx, self.name_y: self.sel_y}] hbx = hNew[{self.name_x: self.sel_d2x, self.name_y: self.sel_y}] - if self.swap_regions: + if not self.integrate_x and self.swap_regions: + logger.warning(f"Regions for fakerate estiamation can only be swapped if abcd-x axis is integrated") + if self.swap_regions and self.integrate_x: # replace Ax region with C region hax = self.get_hist_passX_failY(hNew) else: diff --git a/wremnants/regression.py b/wremnants/regression.py index 9ac2d2f4e..c3c5c0cd9 100644 --- a/wremnants/regression.py +++ b/wremnants/regression.py @@ -20,11 +20,11 @@ def compute_chi2(y, y_pred, w=None, nparams=1): ndf = y.shape[-1] - nparams ndf_total = y.size - chi2.size*nparams - logger.info(f"Total chi2/ndf = {chi2_total}/{ndf_total} = {chi2_total/ndf_total} (p = {stats.chi2.sf(chi2_total, ndf_total)})") - logger.info(f"Min chi2/ndf = {chi2.min()}/{ndf} (p = {stats.chi2.sf(chi2.min(), ndf)})") - logger.info(f"Max chi2/ndf = {chi2.max()}/{ndf} (p = {stats.chi2.sf(chi2.max(), ndf)})") - logger.info(f"Mean chi2 = {chi2.mean()}") - logger.info(f"Std chi2 = {chi2.std()}") + logger.debug(f"Total chi2/ndf = {chi2_total}/{ndf_total} = {chi2_total/ndf_total} (p = {stats.chi2.sf(chi2_total, ndf_total)})") + logger.debug(f"Min chi2/ndf = {chi2.min()}/{ndf} (p = {stats.chi2.sf(chi2.min(), ndf)})") + logger.debug(f"Max chi2/ndf = {chi2.max()}/{ndf} (p = {stats.chi2.sf(chi2.max(), ndf)})") + logger.debug(f"Mean chi2 = {chi2.mean()}") + logger.debug(f"Std chi2 = {chi2.std()}") return chi2, ndf @@ -225,6 +225,8 @@ def transform_bernstein(x, min_x, max_x, cap_x=False): raise RuntimeError(f"All values need to be within [0,1] but {np.sum(x < 0)} values smaller 0 ({x[x < 0]}) and {np.sum(x > 1)} larger 1 ({x[x > 1]}) found after transformation with xmin={xmin} and xmax={xmax}") return x +def transform_power(x, *args, **kwargs): + return transform_bernstein(x, *args, **kwargs) * 2 - 1 class Regressor(object): def __init__( @@ -259,8 +261,10 @@ def __init__( self.external_cov = None def transform_x(self, x): - # if self.polynomial in ["bernstein", "monotonic"]: - x = transform_bernstein(x, self.min_x, self.max_x, self.cap_x) + if self.polynomial in ["bernstein", "monotonic"]: + x = transform_bernstein(x, self.min_x, self.max_x, self.cap_x) + else: + x = transform_power(x, self.min_x, self.max_x, self.cap_x) return x def solve(self, x, y, w, chi2_info=True): @@ -358,6 +362,9 @@ def transform_x(self, x1, x2): if self.polynomial in ["bernstein", "monotonic"]: x1 = transform_bernstein(x1, self.min_x[0], self.max_x[0], self.cap_x[0]) x2 = transform_bernstein(x2, self.min_x[1], self.max_x[1], self.cap_x[1]) + else: + x1 = transform_power(x1, self.min_x[0], self.max_x[0], self.cap_x[0]) + x2 = transform_power(x2, self.min_x[1], self.max_x[1], self.cap_x[1]) return x1, x2 def solve(self, x1, x2, y, w, flatten=False):
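
Note on the parameter-variation systematic used throughout this series: the sketch below is a standalone numpy illustration, not the repository code. It assumes a log-space smoothing with a power polynomial in an x mapped to [-1, 1] (the same idea as transform_power above), shifts one coefficient by a fixed amount (the analogue of setting spectrum_regressor.external_params[param_idx]), and renormalizes the varied template to the nominal yield (the analogue of the hh.scaleHist step in fake_nonclosure). The helper names (power_design, varied_template), the 26-56 GeV range, the sqrt(n) weighting, and the lstsq solver are illustrative assumptions only; the earlier commit uses a shift of 0.5 on parameter 1, the final one 0.2 on parameters 1-3.

import numpy as np

def power_design(x, order, x_min, x_max):
    # map x onto [-1, 1] so the power basis 1, t, t**2, ... stays well conditioned,
    # then build the design matrix (one column per power)
    t = 2.0 * (x - x_min) / (x_max - x_min) - 1.0
    return np.stack([t**k for k in range(order + 1)], axis=-1)

def varied_template(counts, x, order=3, param_idx=1, variation_size=0.5,
                    x_min=26.0, x_max=56.0):
    # weighted least squares on log(counts); for Poisson counts var(log n) ~ 1/n,
    # so each bin is weighted by sqrt(n)
    X = power_design(x, order, x_min, x_max)
    n = np.clip(counts, 1e-3, None)
    w = np.sqrt(n)
    params, *_ = np.linalg.lstsq(X * w[:, None], np.log(n) * w, rcond=None)

    nominal = np.exp(X @ params)

    # shift one coefficient by a fixed amount in log space ...
    shift = variation_size * np.eye(order + 1)[param_idx]
    varied = np.exp(X @ (params + shift))

    # ... and keep only the shape change: normalize the varied template back
    # to the nominal yield so the overall fake normalization is untouched
    varied *= nominal.sum() / varied.sum()
    return nominal, varied

# toy usage: a falling pt-like spectrum between 26 and 56 GeV
rng = np.random.default_rng(0)
x = np.linspace(26.5, 55.5, 30)  # bin centers
counts = 1e4 * np.exp(-0.08 * x) * (1.0 + 0.05 * rng.standard_normal(x.size))
nom, var_up = varied_template(counts, x, param_idx=1, variation_size=0.2)

Because the variation is applied to a coefficient of the transformed polynomial and then renormalized, it changes only the shape of the smoothed fake prediction; mirroring it (as done via mirror=True in addSystematic) gives a symmetric up/down pair per parameter.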