Skip to content

Commit

Permalink
Merge pull request #320 from TopEFT/sr-data-mc-plots
Browse files Browse the repository at this point in the history
Updates to tools for unblinding checks
  • Loading branch information
kmohrman authored Sep 21, 2022
2 parents c21be48 + 9cd0cbf commit 447dfde
Show file tree
Hide file tree
Showing 4 changed files with 611 additions and 61 deletions.
17 changes: 15 additions & 2 deletions analysis/topEFT/get_datacard_yields.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,15 @@
import datetime
import argparse
import json
import numpy as np

import topcoffea.modules.MakeLatexTable as mlt

BKG_PROC_LST = ["convs_sm","Diboson_sm","Triboson_sm","charge_flips_sm","fakes_sm"]
BKG_PROC_LST = ["tWZ_sm", "convs_sm","Diboson_sm","Triboson_sm","charge_flips_sm","fakes_sm"]
SIG_PROC_LST = ["ttH_sm", "ttlnu_sm", "ttll_sm", "tllq_sm", "tHq_sm", "tttt_sm"]

PROC_ORDER = [
"tWZ_sm",
"Diboson_sm",
"Triboson_sm",
"charge_flips_sm",
Expand All @@ -25,6 +27,7 @@
"Sum_sig",
"Sum_expected",
"Observation",
"Pdiff",
]
CAT_ORDER = [
"2lss_m_3b",
Expand Down Expand Up @@ -298,6 +301,8 @@ def main():
cat_name = get_cat_name_from_dc_name(dc_fname)
all_rates_dict[cat_name] = rate_dict_sm

#printd(all_rates_dict)

# Sum over jet bins and rename the keys, i.e. just some "post processing"
all_rates_dict = comb_dict(all_rates_dict)
all_rates_dict = replace_key_names(all_rates_dict,RENAME_CAT_MAP)
Expand All @@ -306,6 +311,14 @@ def main():
if not args.unblind:
all_rates_dict = remove_observed_rates(all_rates_dict,2)

# Get pdiff
for cat in all_rates_dict.keys():
sm = all_rates_dict[cat]["Sum_expected"]
ob = all_rates_dict[cat]["Observation"]
pdiff = 100.0*(sm-ob)/sm
print(cat,pdiff)
all_rates_dict[cat]["Pdiff"] = pdiff

# Dump to the screen text for a latex table
all_rates_dict_none_errs = append_none_errs(all_rates_dict) # Get a dict that will work for the latex table (i.e. need None for errs)
mlt.print_latex_yield_table(
Expand All @@ -316,7 +329,7 @@ def main():
print_begin_info=True,
print_end_info=True,
column_variable="keys",
hz_line_lst=[4,5,11,12,13],
hz_line_lst=[5,6,12,13,14,15],
)

# Save yields to a json
Expand Down
4 changes: 2 additions & 2 deletions analysis/topEFT/get_yield_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def main():
# Set up the command line parser
parser = argparse.ArgumentParser()
parser.add_argument("-f", "--pkl-file-path", default="histos/plotsTopEFT.pkl.gz", help = "The path to the pkl file")
parser.add_argument("-y", "--year", default="2017", help = "The year of the sample")
parser.add_argument("-y", "--year", default=None, help = "The year of the sample")
parser.add_argument("-t", "--tag", default="Sample", help = "A string to describe the pkl file")
parser.add_argument("-n", "--json-name", default="yields", help = "Name of the json file to save")
parser.add_argument("-q", "--quiet", action="store_true", help = "Do not print out anything")
Expand All @@ -27,7 +27,7 @@ def main():
args = parser.parse_args()

# Get the histograms, check if split into lep flavors
hin_dict = utils.get_hist_from_pkl(args.pkl_file_path)
hin_dict = utils.get_hist_from_pkl(args.pkl_file_path,allow_empty=False)
if not yt.is_split_by_lepflav(hin_dict) and args.by_lep_flavor:
raise Exception("Cannot specify --by-lep-flavor option, the yields file is not split by lepton flavor")

Expand Down
232 changes: 226 additions & 6 deletions analysis/topEFT/make_cr_and_sr_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,19 +59,19 @@


SR_CHAN_DICT = {
"2lss_4t_SR": [
"2lss_4t_p_4j", "2lss_4t_m_5j", "2lss_4t_m_6j", "2lss_4t_m_7j",
"2lss_SR": [
"2lss_4t_m_4j", "2lss_4t_m_5j", "2lss_4t_m_6j", "2lss_4t_m_7j",
"2lss_4t_p_4j", "2lss_4t_p_5j", "2lss_4t_p_6j", "2lss_4t_p_7j",
],
"2lss_SR" : [
"2lss_m_4j", "2lss_m_5j", "2lss_m_6j", "2lss_m_7j",
"2lss_p_4j", "2lss_p_5j", "2lss_p_6j", "2lss_p_7j",
],
"3l_SR" : [
"3l_offZ_SR" : [
"3l_m_offZ_1b_2j", "3l_m_offZ_1b_3j", "3l_m_offZ_1b_4j", "3l_m_offZ_1b_5j",
"3l_m_offZ_2b_2j", "3l_m_offZ_2b_3j", "3l_m_offZ_2b_4j", "3l_m_offZ_2b_5j",
"3l_p_offZ_1b_2j", "3l_p_offZ_1b_3j", "3l_p_offZ_1b_4j", "3l_p_offZ_1b_5j",
"3l_p_offZ_2b_2j", "3l_p_offZ_2b_3j", "3l_p_offZ_2b_4j", "3l_p_offZ_2b_5j",
],
"3l_onZ_SR" : [
"3l_onZ_1b_2j" , "3l_onZ_1b_3j" , "3l_onZ_1b_4j" , "3l_onZ_1b_5j",
"3l_onZ_2b_2j" , "3l_onZ_2b_3j" , "3l_onZ_2b_4j" , "3l_onZ_2b_5j",
],
Expand All @@ -97,6 +97,19 @@
"Data" : [],
}

# Grouping of signal-region processes for the data/MC plots: maps each display
# category (legend entry) to a list of per-sample process names. The lists start
# empty and are filled with the matching sample names found in the input
# histograms when the SR data/MC plots are made.
SR_GRP_MAP = {
    "Data": [],
    "Conv": [],
    "Multiboson" : [],
    "Nonprompt" : [],
    "Flips" : [],
    "ttH" : [],
    "ttlnu" : [],
    "ttll" : [],
    "tttt" : [],
    "tXq" : [],
}

# Best fit point from TOP-19-001 with madup numbers for the 10 new WCs
WCPT_EXAMPLE = {
"ctW": -0.74,
Expand Down Expand Up @@ -187,7 +200,7 @@ def group_bins(histo,bin_map,axis_name="sample",drop_unspecified=False):
# Remap the bins
old_ax = histo.axis(axis_name)
new_ax = hist.Cat(old_ax.name,old_ax.label)
new_histo = histo.group(old_ax,new_ax,bin_map)
new_histo = histo.group(old_ax,new_ax,bin_map,overflow="over")

return new_histo

Expand Down Expand Up @@ -619,6 +632,204 @@ def make_all_sr_sys_plots(dict_of_hists,year,save_dir_path):
if "www" in save_dir_path_tmp: make_html(save_dir_path_tmp)


###################### Wrapper function for simple plots ######################
# Wrapper function to loop over categories and make plots for all variables
def make_simple_plots(dict_of_hists,year,save_dir_path):
    """Print (and optionally plot) simple per-channel yields for selected variables.

    Currently hard coded to look only at the "j0partonFlavour" variable, and the
    unconditional ``continue`` in the channel loop short circuits the plotting,
    so in practice this just prints per-channel values and their sums (a
    debugging helper).

    Args:
        dict_of_hists: Dict mapping variable names to coffea histograms.
        year: Unused here; kept so the signature matches the other wrapper functions.
        save_dir_path: Directory the plots are saved to (if plotting is re-enabled).
    """

    all_samples = yt.get_cat_lables(dict_of_hists,"sample",h_name="njets")

    for idx,var_name in enumerate(dict_of_hists.keys()):
        if var_name != "j0partonFlavour": continue

        # Sub dir tag for this variable. NOTE: previously save_tag was only set in
        # commented-out lines, so the plotting code below would raise a NameError
        # as soon as the debug "continue" was removed — define it explicitly here.
        if "parton" in var_name: save_tag = "partonFlavour"
        elif "hadron" in var_name: save_tag = "hadronFlavour"
        else: save_tag = var_name

        histo_orig = dict_of_hists[var_name]

        # Loop over channels
        channels_lst = yt.get_cat_lables(dict_of_hists[var_name],"channel")
        for chan_name in channels_lst:

            # Work on a copy so the per-channel scaling does not touch the original
            histo = copy.deepcopy(histo_orig)

            # Normalize the MC hists by the per-sample lumi
            sample_lumi_dict = {}
            for sample_name in all_samples:
                sample_lumi_dict[sample_name] = get_lumi_for_sample(sample_name)
            histo.scale(sample_lumi_dict,axis="sample")

            # Select this channel (integrate out the application region and the nominal systematic)
            histo = yt.integrate_out_appl(histo,chan_name)
            histo = histo.integrate("systematic","nominal")
            histo = histo.integrate("channel",chan_name)

            # Print the per-sample values and their sum for this channel
            print("\n",chan_name)
            print(histo.values())
            summed_histo = histo.sum("sample")
            print("sum:",sum(summed_histo.values()[()]))
            continue # NOTE: debug short circuit — remove to actually save the plots below

            # Make a sub dir for this category
            save_dir_path_tmp = os.path.join(save_dir_path,save_tag)
            if not os.path.exists(save_dir_path_tmp):
                os.mkdir(save_dir_path_tmp)

            fig = make_single_fig(histo, unit_norm_bool=False)
            title = chan_name + "_" + var_name
            fig.savefig(os.path.join(save_dir_path_tmp,title))

            # Make an index.html file if saving to web area
            if "www" in save_dir_path: make_html(save_dir_path_tmp)


###################### Wrapper function for SR data and mc plots (unblind!) ######################
# Wrapper function to loop over all SR categories and make plots for all variables
def make_all_sr_data_mc_plots(dict_of_hists,year,save_dir_path):
    """Make data/MC comparison plots for all SR categories and all variables (unblind!).

    Args:
        dict_of_hists: Dict mapping variable names to coffea histograms.
        year: One of "2016", "2016APV", "2017", "2018", or None for all years combined.
        save_dir_path: Directory the plots are written to (one sub dir per SR category).

    Raises:
        Exception: If ``year`` is not a recognized value, or a sample name cannot
            be assigned to one of the plot groups.
    """

    # Build the substring whitelists/blacklists that select the MC and data samples for this year
    mc_wl = []
    mc_bl = ["data"]
    data_wl = ["data"]
    data_bl = []
    if year is None:
        pass # No year filtering: keep samples from all years
    elif year == "2017":
        mc_wl.append("UL17")
        data_wl.append("UL17")
    elif year == "2018":
        mc_wl.append("UL18")
        data_wl.append("UL18")
    elif year == "2016":
        # "UL16" is a substring of "UL16APV", so UL16APV must be explicitly blacklisted
        mc_wl.append("UL16")
        mc_bl.append("UL16APV")
        data_wl.append("UL16")
        data_bl.append("UL16APV")
    elif year == "2016APV":
        mc_wl.append("UL16APV")
        data_wl.append("UL16APV")
    else: raise Exception(f"Error: Unknown year \"{year}\".")

    # Get the list of samples we want to plot
    samples_to_rm_from_mc_hist = []
    samples_to_rm_from_data_hist = []
    all_samples = yt.get_cat_lables(dict_of_hists,"sample",h_name="lj0pt")
    mc_sample_lst = yt.filter_lst_of_strs(all_samples,substr_whitelist=mc_wl,substr_blacklist=mc_bl)
    data_sample_lst = yt.filter_lst_of_strs(all_samples,substr_whitelist=data_wl,substr_blacklist=data_bl)
    for sample_name in all_samples:
        if sample_name not in mc_sample_lst:
            samples_to_rm_from_mc_hist.append(sample_name)
        if sample_name not in data_sample_lst:
            samples_to_rm_from_data_hist.append(sample_name)
    print("\nAll samples:",all_samples)
    print("\nMC samples:",mc_sample_lst)
    print("\nData samples:",data_sample_lst)
    print("\nVariables:",dict_of_hists.keys())

    # Assign each sample to a plot group. Fill a deep copy of SR_GRP_MAP so that
    # repeated calls (e.g. once per year) do not keep appending to the
    # module-level lists and duplicate every sample in the grouping.
    # Very hard coded :(
    grp_map = copy.deepcopy(SR_GRP_MAP)
    for proc_name in mc_sample_lst + data_sample_lst:
        if "data" in proc_name:
            grp_map["Data"].append(proc_name)
        elif "nonprompt" in proc_name:
            grp_map["Nonprompt"].append(proc_name)
        elif "flips" in proc_name:
            grp_map["Flips"].append(proc_name)
        elif ("ttH" in proc_name):
            grp_map["ttH"].append(proc_name)
        elif ("ttlnu" in proc_name):
            grp_map["ttlnu"].append(proc_name)
        elif ("ttll" in proc_name):
            grp_map["ttll"].append(proc_name)
        elif (("tllq" in proc_name) or ("tHq" in proc_name)):
            grp_map["tXq"].append(proc_name)
        elif ("tttt" in proc_name):
            grp_map["tttt"].append(proc_name)
        elif "TTG" in proc_name:
            grp_map["Conv"].append(proc_name)
        elif "WWW" in proc_name or "WWZ" in proc_name or "WZZ" in proc_name or "ZZZ" in proc_name:
            grp_map["Multiboson"].append(proc_name)
        elif "WWTo2L2Nu" in proc_name or "ZZTo4L" in proc_name or "WZTo3LNu" in proc_name:
            grp_map["Multiboson"].append(proc_name)
        else:
            raise Exception(f"Error: Process name \"{proc_name}\" is not known.")

    # The analysis bins
    analysis_bins = {
        'njets': {
            '2l': [4,5,6,7,dict_of_hists['njets'].axis('njets').edges()[-1]], # Last bin in topeft.py is 10, this should grab the overflow
            '3l': [2,3,4,5,dict_of_hists['njets'].axis('njets').edges()[-1]],
            '4l': [2,3,4,dict_of_hists['njets'].axis('njets').edges()[-1]]
        }
    }
    analysis_bins['ptz'] = [0, 200, 300, 400, 500, dict_of_hists['ptz'].axis('ptz').edges()[-1]]
    analysis_bins['lj0pt'] = [0, 150, 250, 500, dict_of_hists['lj0pt'].axis('lj0pt').edges()[-1]]

    # Loop over hists and make plots
    skip_lst = [] # Skip this hist
    for idx,var_name in enumerate(dict_of_hists.keys()):
        if (var_name in skip_lst): continue
        print("\nVariable:",var_name)

        # Extract the MC and data hists
        hist_mc_orig = dict_of_hists[var_name].remove(samples_to_rm_from_mc_hist,"sample")
        hist_data_orig = dict_of_hists[var_name].remove(samples_to_rm_from_data_hist,"sample")

        # Loop over the SR channel groups (not the individual jet-bin channels)
        channels_lst = yt.get_cat_lables(dict_of_hists[var_name],"channel")
        print("channels:",channels_lst)
        for chan_name in SR_CHAN_DICT.keys():

            hist_mc = hist_mc_orig.integrate("systematic","nominal").integrate("channel",SR_CHAN_DICT[chan_name],overflow="over")
            hist_data = hist_data_orig.integrate("systematic","nominal").integrate("channel",SR_CHAN_DICT[chan_name],overflow="over")

            # Normalize the MC hists by the per-sample lumi
            sample_lumi_dict = {}
            for sample_name in mc_sample_lst:
                sample_lumi_dict[sample_name] = get_lumi_for_sample(sample_name)
            hist_mc.scale(sample_lumi_dict,axis="sample")

            # Group the individual samples into the plot categories
            hist_mc = group_bins(hist_mc,grp_map)
            hist_data = group_bins(hist_data,grp_map)

            # Make a sub dir for this category
            save_dir_path_tmp = os.path.join(save_dir_path,chan_name)
            if not os.path.exists(save_dir_path_tmp):
                os.mkdir(save_dir_path_tmp)

            # Rebin into analysis bins (njets binning depends on the lepton multiplicity)
            if var_name in analysis_bins.keys():
                lep_bin = chan_name[:2]
                if var_name == "njets":
                    hist_mc = hist_mc.rebin(var_name, hist.Bin(var_name, hist_mc.axis(var_name).label, analysis_bins[var_name][lep_bin]))
                    hist_data = hist_data.rebin(var_name, hist.Bin(var_name, hist_data.axis(var_name).label, analysis_bins[var_name][lep_bin]))
                else:
                    hist_mc = hist_mc.rebin(var_name, hist.Bin(var_name, hist_mc.axis(var_name).label, analysis_bins[var_name]))
                    hist_data = hist_data.rebin(var_name, hist.Bin(var_name, hist_data.axis(var_name).label, analysis_bins[var_name]))

            # Skip empty histos rather than crashing in the plotter
            if hist_mc.values() == {}:
                print("Warning: empty mc histo, continuing")
                continue
            if hist_data.values() == {}:
                print("Warning: empty data histo, continuing")
                continue

            fig = make_cr_fig(hist_mc, hist_data, unit_norm_bool=False)
            if year is not None: year_str = year
            else: year_str = "ULall"
            title = chan_name + "_" + var_name + "_" + year_str
            fig.savefig(os.path.join(save_dir_path_tmp,title))

            # Make an index.html file if saving to web area
            if "www" in save_dir_path_tmp: make_html(save_dir_path_tmp)





###################### Wrapper function for example SR plots ######################
# Wrapper function to loop over all SR categories and make plots for all variables
Expand Down Expand Up @@ -945,7 +1156,16 @@ def main():
# Make the plots
make_all_cr_plots(hin_dict,args.year,args.skip_syst,unit_norm_bool,save_dir_path)
#make_all_sr_plots(hin_dict,args.year,unit_norm_bool,save_dir_path)
#make_all_sr_data_mc_plots(hin_dict,args.year,save_dir_path)
#make_all_sr_sys_plots(hin_dict,args.year,save_dir_path)
#make_simple_plots(hin_dict,args.year,save_dir_path)

# Make unblinded SR data MC comparison plots by year
#make_all_sr_data_mc_plots(hin_dict,"2016",save_dir_path)
#make_all_sr_data_mc_plots(hin_dict,"2016APV",save_dir_path)
#make_all_sr_data_mc_plots(hin_dict,"2017",save_dir_path)
#make_all_sr_data_mc_plots(hin_dict,"2018",save_dir_path)
#make_all_sr_data_mc_plots(hin_dict,None,save_dir_path)

if __name__ == "__main__":
main()
Loading

0 comments on commit 447dfde

Please sign in to comment.