Merge pull request #521 from davidwalter2/240724_fakes
Update QCD background: smoothing, uncertainties, and bias tests
bendavid authored Aug 16, 2024
2 parents 9f0825a + 8856b02 commit 42d7fd3
Showing 10 changed files with 812 additions and 748 deletions.
110 changes: 63 additions & 47 deletions scripts/combine/setupCombine.py
@@ -82,10 +82,10 @@ def make_parser(parser=None):
parser.add_argument("--noMCStat", action='store_true', help="Do not include MC stat uncertainty in covariance for theory fit (only when using --fitresult)")
parser.add_argument("--fakerateAxes", nargs="+", help="Axes for the fakerate binning", default=["eta","pt","charge"])
parser.add_argument("--fakeEstimation", type=str, help="Set the mode for the fake estimation", default="extended1D", choices=["closure", "simple", "extrapolate", "extended1D", "extended2D"])
parser.add_argument("--fakeSmoothingMode", type=str, default="full", choices=["binned", "fakerate", "full"], help="Smoothing mode for fake estimate.")
parser.add_argument("--fakeSmoothingMode", type=str, default="hybrid", choices=["binned", "fakerate", "hybrid", "full"], help="Smoothing mode for fake estimate.")
parser.add_argument("--forceGlobalScaleFakes", default=None, type=float, help="Scale the fakes by this factor (overriding any custom one implemented in datagroups.py in the fakeSelector).")
parser.add_argument("--fakeMCCorr", type=str, default=[None], nargs="*", choices=["none", "pt", "eta", "mt"], help="axes to apply nonclosure correction from QCD MC. Leave empty for inclusive correction, use'none' for no correction")
parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the fake rate or full prediction, depending on the smoothing mode")
parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the application region or full prediction, depending on the smoothing mode")
parser.add_argument("--simultaneousABCD", action="store_true", help="Produce datacard for simultaneous fit of ABCD regions")
parser.add_argument("--skipSumGroups", action="store_true", help="Don't add sum groups to the output to save time e.g. when computing impacts")
parser.add_argument("--allowNegativeExpectation", action="store_true", help="Allow processes to have negative contributions")
@@ -130,15 +130,15 @@ def make_parser(parser=None):
parser.add_argument("--pseudoDataIdxs", type=str, nargs="+", default=[None], help="Variation indices to use as pseudodata for each of the histograms")
parser.add_argument("--pseudoDataFile", type=str, help="Input file for pseudodata (if it should be read from a different file)", default=None)
parser.add_argument("--pseudoDataProcsRegexp", type=str, default=".*", help="Regular expression for processes taken from pseudodata file (all other processes are automatically got from the nominal file). Data is excluded automatically as usual")
parser.add_argument("--pseudoDataFakes", type=str, nargs="+", choices=["truthMC", "closure", "simple", "extrapolate", "extended1D", "extended2D", "dataClosure", "mcClosure", "simple-binned", "extended1D-fakerate"],
parser.add_argument("--pseudoDataFakes", type=str, nargs="+", choices=["truthMC", "closure", "simple", "extrapolate", "extended1D", "extended2D", "dataClosure", "mcClosure", "simple-binned", "extended1D-binned", "extended1D-fakerate"],
help="Pseudodata for fakes are using QCD MC (closure), or different estimation methods (simple, extended1D, extended2D)")
parser.add_argument("--addTauToSignal", action='store_true', help="Events from the same process but from tau final states are added to the signal")
parser.add_argument("--noPDFandQCDtheorySystOnSignal", action='store_true', help="Removes PDF and theory uncertainties on signal processes")
parser.add_argument("--recoCharge", type=str, default=["plus", "minus"], nargs="+", choices=["plus", "minus"], help="Specify reco charge to use, default uses both. This is a workaround for unfolding/theory-agnostic fit when running a single reco charge, as gen bins with opposite gen charge have to be filtered out")
parser.add_argument("--massConstraintMode", choices=["automatic", "constrained", "unconstrained"], default="automatic", help="Whether mass is constrained within PDG value and uncertainty or unconstrained in the fit")
parser.add_argument("--decorMassWidth", action='store_true', help="remove width variations from mass variations")
parser.add_argument("--muRmuFPolVar", action="store_true", help="Use polynomial variations (like in theoryAgnosticPolVar) instead of binned variations for muR and muF (of course in setupCombine these are still constrained nuisances)")
parser.add_argument("--binByBinStatScaleForMW", type=float, default=1.125, help="scaling of bin by bin statistical uncertainty for W mass analysis")
parser.add_argument("--binByBinStatScaleForMW", type=float, default=1.10, help="scaling of bin by bin statistical uncertainty for W mass analysis")
parser.add_argument("--exponentialTransform", action='store_true', help="apply exponential transformation to yields (useful for gen-level fits to helicity cross sections for example)")
parser = make_subparsers(parser)
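
A minimal standalone sketch (a toy parser, not the repository's full one) of how the updated options behave, with "hybrid" now both a new choice and the new default:

import argparse

toy_parser = argparse.ArgumentParser()
toy_parser.add_argument("--fakeSmoothingMode", type=str, default="hybrid",
                        choices=["binned", "fakerate", "hybrid", "full"])
toy_parser.add_argument("--fakeSmoothingOrder", type=int, default=3)

toy_args = toy_parser.parse_args([])  # no flags: defaults apply
print(toy_args.fakeSmoothingMode, toy_args.fakeSmoothingOrder)  # hybrid 3

toy_args = toy_parser.parse_args(["--fakeSmoothingMode", "fakerate"])
print(toy_args.fakeSmoothingMode)  # fakerate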

@@ -293,7 +293,7 @@ def setup(args, inputFile, inputBaseName, inputLumiScale, fitvar, genvar=None, x
datagroups.fakerate_axes=args.fakerateAxes
datagroups.set_histselectors(
datagroups.getNames(), inputBaseName, mode=args.fakeEstimation,
-smoothing_mode=args.fakeSmoothingMode, smoothingOrderFakerate=args.fakeSmoothingOrder,
+smoothing_mode=args.fakeSmoothingMode, smoothingOrderSpectrum=args.fakeSmoothingOrder,
mcCorr=args.fakeMCCorr,
integrate_x="mt" not in fitvar,
simultaneousABCD=simultaneousABCD, forceGlobalScaleFakes=args.forceGlobalScaleFakes,
@@ -377,7 +377,7 @@ def setup(args, inputFile, inputBaseName, inputLumiScale, fitvar, genvar=None, x
pseudodataGroups.fakerate_axes=args.fakerateAxes
pseudodataGroups.set_histselectors(
pseudodataGroups.getNames(), inputBaseName, mode=args.fakeEstimation,
-smoothing_mode=args.fakeSmoothingMode, smoothingOrderFakerate=args.fakeSmoothingOrder,
+smoothing_mode=args.fakeSmoothingMode, smoothingOrderSpectrum=args.fakeSmoothingOrder,
mcCorr=args.fakeMCCorr,
integrate_x="mt" not in fitvar,
simultaneousABCD=simultaneousABCD, forceGlobalScaleFakes=args.forceGlobalScaleFakes,
@@ -394,7 +394,7 @@ def setup(args, inputFile, inputBaseName, inputLumiScale, fitvar, genvar=None, x
pseudodataGroups.copyGroup("QCD", "QCDTruth")
pseudodataGroups.set_histselectors(
pseudodataGroups.getNames(), inputBaseName, mode=args.fakeEstimation, fake_processes=["QCD",],
-smoothing_mode=args.fakeSmoothingMode, smoothingOrderFakerate=args.fakeSmoothingOrder,
+smoothing_mode=args.fakeSmoothingMode, smoothingOrderSpectrum=args.fakeSmoothingOrder,
mcCorr=args.fakeMCCorr,
simultaneousABCD=simultaneousABCD, forceGlobalScaleFakes=args.forceGlobalScaleFakes,
)
@@ -743,18 +743,15 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]):
cardTool.addLnNSystematic("CMS_background", processes=["Other"], size=1.15, group="CMS_background", splitGroup = {"experiment": ".*"},)
cardTool.addLnNSystematic("lumi", processes=['MCnoQCD'], size=1.017 if lowPU else 1.012, group="luminosity", splitGroup = {"experiment": ".*"},)

-if cardTool.getFakeName() != "QCD" and cardTool.getFakeName() in datagroups.groups.keys() and not xnorm and (args.fakeSmoothingMode != "binned" or (args.fakeEstimation in ["extrapolate"] and "mt" in fitvar)):
-
-# add stat uncertainty from QCD MC nonclosure to smoothing parameter covariance
-hist_fake = datagroups.results["QCDmuEnrichPt15PostVFP"]["output"]["unweighted"].get()
+if (
+(cardTool.getFakeName() != "QCD" or args.qcdProcessName=="QCD")
+and cardTool.getFakeName() in datagroups.groups.keys()
+and not xnorm
+and (args.fakeSmoothingMode != "binned" or (args.fakeEstimation in ["extrapolate"] and "mt" in fitvar))
+):

fakeselector = cardTool.datagroups.groups[cardTool.getFakeName()].histselector

-_0, _1, params, cov, _chi2, _ndf = fakeselector.calculate_fullABCD_smoothed(hist_fake, auxiliary_info=True)
-_0, _1, params_d, cov_d, _chi2_d, _ndf_d = fakeselector.calculate_fullABCD_smoothed(hist_fake, auxiliary_info=True, signal_region=True)
-
-fakeselector.external_cov = cov + cov_d
-
syst_axes = ["eta", "charge"] if (args.fakeSmoothingMode != "binned" or args.fakeEstimation not in ["extrapolate"]) else ["eta", "pt", "charge"]
info=dict(
name=inputBaseName,
@@ -766,13 +763,24 @@ def assertSample(name, startsWith=["W", "Z"], excludeMatch=[]):
applySelection=False, # don't apply selection, all regions will be needed for the action
action=fakeselector.get_hist,
systAxes=[f"_{x}" for x in syst_axes if x in args.fakerateAxes]+["_param", "downUpVar"])
subgroup = f"{cardTool.getFakeName()}Rate"
cardTool.addSystematic(**info,
rename=subgroup,
splitGroup = {subgroup: f".*", "experiment": ".*"},
systNamePrepend=subgroup,
actionArgs=dict(variations_smoothing=True),
)
if args.fakeSmoothingMode in ["hybrid", "full"]:
subgroup = f"{cardTool.getFakeName()}Smoothing"
cardTool.addSystematic(**info,
rename=subgroup,
splitGroup = {subgroup: f".*", "experiment": ".*"},
systNamePrepend=subgroup,
actionArgs=dict(variations_smoothing=True),
)

if args.fakeSmoothingMode in ["fakerate", "hybrid"]:
subgroup = f"{cardTool.getFakeName()}Rate"
cardTool.addSystematic(**info,
rename=subgroup,
splitGroup = {subgroup: f".*", "experiment": ".*"},
systNamePrepend=subgroup,
actionArgs=dict(variations_frf=True),
)
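
The "Smoothing" nuisances above propagate the statistical uncertainty of the smoothing fit. A common recipe for building such variations is one up/down shift per eigenvector of the fitted-parameter covariance (an illustrative assumption here, not code taken from this repository):

import numpy as np

# toy polynomial coefficients and covariance from a smoothing fit
params = np.array([2.0, -1.0, 0.3, -0.1])
cov = np.diag([0.04, 0.09, 0.01, 0.0025])

# one up/down variation per eigenvector of the covariance
evals, evecs = np.linalg.eigh(cov)
variations = [params + sign * np.sqrt(val) * vec
              for val, vec in zip(evals, evecs.T)
              for sign in (+1.0, -1.0)]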

if args.fakeEstimation in ["extended2D",] and args.fakeSmoothingMode != "full":
subgroup = f"{cardTool.getFakeName()}Shape"
cardTool.addSystematic(**info,
@@ -782,14 +790,22 @@ def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs):
actionArgs=dict(variations_scf=True),
)

if args.fakeSmoothingMode == "full":
# add nominal nonclosure of QCD MC as single systematic
def fake_nonclosure(h, axesToDecorrNames, *args, **kwargs):
# apply QCD MC nonclosure by adding parameters (assumes log space, e.g. in full smoothing)
fakeselector.external_params = params_d - params
if args.fakeSmoothingMode in ["hybrid", "full"] and args.fakeSmoothingOrder > 0:
# add systematic of explicit parameter variation
fakeSmoothingOrder = args.fakeSmoothingOrder
def fake_nonclosure(h, axesToDecorrNames, param_idx=1, variation_size=0.5, *args, **kwargs):
# apply variation by adding parameter value (assumes log space, e.g. in full smoothing)
fakeselector.spectrum_regressor.external_params = np.zeros(fakeSmoothingOrder + 1)
fakeselector.spectrum_regressor.external_params[param_idx] = variation_size
hvar = fakeselector.get_hist(h, *args, **kwargs)
# reset external parameters
fakeselector.external_params = None
fakeselector.spectrum_regressor.external_params = None

hnom = fakeselector.get_hist(h, *args, **kwargs)

# normalize variation histogram to have the same integral as nominal
hScale = hh.divideHists(hnom[{"pt":hist.sum}], hvar[{"pt":hist.sum}])
hvar = hh.multiplyHists(hvar, hScale)

if len(axesToDecorrNames) == 0:
# inclusive
data = hvar.values(flow=True)[...,np.newaxis]
)
else:
-hnom = fakeselector.get_hist(h, *args, **kwargs)
hvar = syst_tools.decorrelateByAxes(hvar, hnom, axesToDecorrNames)

return hvar
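
For intuition about the else branch: syst_tools.decorrelateByAxes presumably splits one correlated variation into independent per-bin variations along the given axes. A toy numpy sketch of that idea (an assumption about its behavior, not the actual implementation):

import numpy as np

# one correlated variation over 4 eta bins ...
nominal = np.array([10.0, 12.0, 11.0, 9.0])
varied = np.array([11.0, 12.5, 11.8, 9.4])

# ... becomes 4 independent variations, each moving a single eta bin
decorr = np.tile(nominal, (len(nominal), 1))
for i in range(len(nominal)):
    decorr[i, i] = varied[i]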

for axesToDecorrNames in [[], ["eta"]]:
subgroup = f"{cardTool.getFakeName()}QCDMCNonclosure"
cardTool.addSystematic(
name=inputBaseName,
group="Fake",
rename=subgroup+ (f"_{'_'.join(axesToDecorrNames)}" if len(axesToDecorrNames) else ""),
splitGroup = {subgroup: f".*", "experiment": ".*"},
systNamePrepend=subgroup,
processes=cardTool.getFakeName(),
noConstraint=False,
mirror=True,
scale=1,
applySelection=False, # don't apply selection, external parameters need to be added
action=fake_nonclosure,
actionArgs=dict(axesToDecorrNames=axesToDecorrNames),
systAxes=["var"] if len(axesToDecorrNames)==0 else [f"{n}_decorr" for n in axesToDecorrNames],
)
+for axesToDecorrNames in [[], ]:
+for idx, mag in [(1,0.2),(2,0.2),(3,0.2)]:
+subgroup = f"{cardTool.getFakeName()}Param{idx}"
+cardTool.addSystematic(
+name=inputBaseName,
+group="Fake",
+rename=subgroup+ (f"_{'_'.join(axesToDecorrNames)}" if len(axesToDecorrNames) else ""),
+splitGroup = {subgroup: f".*", "experiment": ".*"},
+systNamePrepend=subgroup,
+processes=cardTool.getFakeName(),
+noConstraint=False,
+mirror=True,
+scale=1,
+applySelection=False, # don't apply selection, external parameters need to be added
+action=fake_nonclosure,
+actionArgs=dict(axesToDecorrNames=axesToDecorrNames, param_idx=idx, variation_size=mag),
+systAxes=["var"] if len(axesToDecorrNames)==0 else [f"{n}_decorr" for n in axesToDecorrNames],
+)
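
Each "Param{idx}" nuisance above shifts one coefficient of the smoothing polynomial in log space and, via fake_nonclosure, renormalizes the varied spectrum to the nominal integral. A self-contained numpy sketch of that recipe with toy numbers (the binning and coefficients are illustrative assumptions):

import numpy as np

pt = np.linspace(26.0, 56.0, 30)
x = (pt - pt.min()) / (pt.max() - pt.min())

params = np.array([2.0, -1.0, 0.3, -0.1])  # toy smoothing coefficients
nominal = np.exp(np.polynomial.polynomial.polyval(x, params))

param_idx, variation_size = 1, 0.2  # cf. the (idx, mag) loop above
varied_params = params.copy()
varied_params[param_idx] += variation_size
varied = np.exp(np.polynomial.polynomial.polyval(x, varied_params))

# renormalize so only the shape changes (cf. hScale in fake_nonclosure)
varied *= nominal.sum() / varied.sum()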

if not args.noEfficiencyUnc:

26 changes: 19 additions & 7 deletions scripts/plotting/controlPlotsHDF5.py
@@ -35,6 +35,8 @@
help="Select specific bins of the selectionAxis e.g. '0 1' to select the first bin of the first axis and second bin of the second axis")
parser.add_argument("--hists", type=str, nargs='*', default=None,
help="List of hists to plot; dash separated for unrolled hists")
parser.add_argument("--normToData", action='store_true', help="Normalize MC to data")
parser.add_argument("--dataName", type=str, default="Data", help="Data name for plot labeling")

# variations
parser.add_argument("--varName", type=str, nargs='*', default=[], help="Name of variation hist")
@@ -123,6 +125,13 @@ def make_plot(hists_proc, hist_data, hists_syst_up, hists_syst_dn, axes_names,
else:
hists_pred = [hh.sumHists(h_stack)]

+if args.normToData:
+scale = h_data.values().sum()/hists_pred[0].values().sum()
+h_stack = [hh.scaleHist(h, scale) for h in h_stack]
+hists_pred[0] = hh.scaleHist(hists_pred[0], scale)
+hists_syst_up = [hh.scaleHist(h, scale) for h in hists_syst_up]
+hists_syst_dn = [hh.scaleHist(h, scale) for h in hists_syst_dn]
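
What --normToData does, sketched directly with the hist library (hh.scaleHist is assumed to be a scalar multiply; plain * is used here instead):

import hist

h_data = hist.Hist.new.Reg(10, 0, 10, name="x").Weight()
h_mc = hist.Hist.new.Reg(10, 0, 10, name="x").Weight()
h_data.fill(x=[1, 2, 3, 4])
h_mc.fill(x=[1, 2, 2, 3, 5])

# scale MC so its integral matches data (here 4/5 = 0.8)
scale = h_data.values().sum() / h_mc.values().sum()
h_mc_scaled = h_mc * scale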

# loop over all processes if plots for each process is requested, or inclusive otherwise
for i, h_pred in enumerate(hists_pred):

@@ -167,7 +176,7 @@ def make_plot(hists_proc, hist_data, hists_syst_up, hists_syst_dn, axes_names,
yerr=True,
histtype="errorbar",
color="black",
label="Data",
label=args.dataName,
binwnorm=binwnorm,
ax=ax1,
alpha=1.,
@@ -252,11 +261,14 @@ def make_plot(hists_proc, hist_data, hists_syst_up, hists_syst_dn, axes_names,
if selections is not None:
for i, (key, idx) in enumerate(selections.items()):
lo, hi = selection_edges[i]
-label = styles.xlabels[key].replace(" (GeV)","")
-if lo != None:
-label = f"{lo} < {label}"
-if hi != None:
-label = f"{label} < {hi}"
+if key=="charge":
+label = f"charge = {'-1' if idx==0 else '+1'}"
+else:
+label = styles.xlabels[key].replace(" (GeV)","")
+if lo != None:
+label = f"{lo} < {label}"
+if hi != None:
+label = f"{label} < {hi}"

ax1.text(0.05, 0.96-i*0.08, label, horizontalalignment='left', verticalalignment='top', transform=ax1.transAxes,
fontsize=20*args.scaleleg*scale)
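
A standalone rendering of the selection-label logic above, with a toy dict standing in for styles.xlabels:

xlabels = {"mt": "mT (GeV)", "charge": "charge"}  # stand-in for styles.xlabels

def selection_label(key, idx, lo=None, hi=None):
    if key == "charge":
        return f"charge = {'-1' if idx == 0 else '+1'}"
    label = xlabels[key].replace(" (GeV)", "")
    if lo is not None:
        label = f"{lo} < {label}"
    if hi is not None:
        label = f"{label} < {hi}"
    return label

print(selection_label("mt", 0, lo=0, hi=40))  # 0 < mT < 40
print(selection_label("charge", 1))           # charge = +1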
Expand All @@ -274,7 +286,7 @@ def make_plot(hists_proc, hist_data, hists_syst_up, hists_syst_dn, axes_names,
# offset_text.set_position((-0.08,1.02))

if args.cmsDecor:
-lumi = float(f"{channel_info['lumi']:.3g}") if not density else None
+lumi = float(f"{channel_info['lumi']:.3g}") if not density and args.dataName=="Data" else None

hep.cms.label(ax=ax1, lumi=lumi, fontsize=legtext_size*scale,
label= args.cmsDecor, data=hist_data is not None)
6 changes: 3 additions & 3 deletions scripts/plotting/makeDataMCStackPlot.py
@@ -43,10 +43,10 @@
parser.add_argument("--presel", type=str, nargs="*", default=[], help="Specify custom selections on input histograms to integrate some axes, giving axis name and min,max (e.g. '--presel pt=ptmin,ptmax' ) or just axis name for bool axes")
parser.add_argument("--normToData", action='store_true', help="Normalize MC to data")
parser.add_argument("--fakeEstimation", type=str, help="Set the mode for the fake estimation", default="extended1D", choices=["simple", "extrapolate", "extended1D", "extended2D"])
parser.add_argument("--fakeSmoothingMode", type=str, default="full", choices=["binned", "fakerate", "full"], help="Smoothing mode for fake estimate.")
parser.add_argument("--fakeSmoothingMode", type=str, default="hybrid", choices=["binned", "fakerate", "hybrid", "full"], help="Smoothing mode for fake estimate.")
parser.add_argument("--fakeMCCorr", type=str, default=[None], nargs="*", choices=["none", "pt", "eta", "mt"], help="axes to apply nonclosure correction from QCD MC. Leave empty for inclusive correction, use'none' for no correction")
parser.add_argument("--forceGlobalScaleFakes", default=None, type=float, help="Scale the fakes by this factor (overriding any custom one implemented in datagroups.py in the fakeSelector).")
parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the fake rate or full prediction, depending on the smoothing mode")
parser.add_argument("--fakeSmoothingOrder", type=int, default=3, help="Order of the polynomial for the smoothing of the application region or full prediction, depending on the smoothing mode")
parser.add_argument("--fakerateAxes", nargs="+", help="Axes for the fakerate binning", default=["eta","pt","charge"])
parser.add_argument("--fineGroups", action='store_true', help="Plot each group as a separate process, otherwise combine groups based on predefined dictionary")

@@ -146,7 +146,7 @@ def padArray(ref, matchLength):
datasets,
args.baseName,
smoothing_mode=args.fakeSmoothingMode,
-smoothingOrderFakerate=args.fakeSmoothingOrder,
+smoothingOrderSpectrum=args.fakeSmoothingOrder,
integrate_x=all("mt" not in x.split("-") for x in args.hists),
mode=args.fakeEstimation,
forceGlobalScaleFakes=args.forceGlobalScaleFakes,
[Diffs for the remaining 7 changed files are not shown here.]
