diff --git a/analysis/topEFT/get_datacard_yields.py b/analysis/topEFT/get_datacard_yields.py
index 1b7fda547..afc61d0d4 100644
--- a/analysis/topEFT/get_datacard_yields.py
+++ b/analysis/topEFT/get_datacard_yields.py
@@ -3,13 +3,15 @@
 import datetime
 import argparse
 import json
+import numpy as np
 
 import topcoffea.modules.MakeLatexTable as mlt
 
-BKG_PROC_LST = ["convs_sm","Diboson_sm","Triboson_sm","charge_flips_sm","fakes_sm"]
+BKG_PROC_LST = ["tWZ_sm", "convs_sm","Diboson_sm","Triboson_sm","charge_flips_sm","fakes_sm"]
 SIG_PROC_LST = ["ttH_sm", "ttlnu_sm", "ttll_sm", "tllq_sm", "tHq_sm", "tttt_sm"]
 
 PROC_ORDER = [
+    "tWZ_sm",
     "Diboson_sm",
     "Triboson_sm",
     "charge_flips_sm",
@@ -25,6 +27,7 @@
     "Sum_sig",
     "Sum_expected",
     "Observation",
+    "Pdiff",
 ]
 CAT_ORDER = [
     "2lss_m_3b",
@@ -298,6 +301,8 @@ def main():
         cat_name = get_cat_name_from_dc_name(dc_fname)
         all_rates_dict[cat_name] = rate_dict_sm
 
+    #print(all_rates_dict)
+
     # Sum over jet bins and rename the keys, i.e. just some "post processing"
     all_rates_dict = comb_dict(all_rates_dict)
     all_rates_dict = replace_key_names(all_rates_dict,RENAME_CAT_MAP)
@@ -306,6 +311,14 @@ def main():
     if not args.unblind:
         all_rates_dict = remove_observed_rates(all_rates_dict,2)
 
+    # Get pdiff
+    for cat in all_rates_dict.keys():
+        sm = all_rates_dict[cat]["Sum_expected"]
+        ob = all_rates_dict[cat]["Observation"]
+        pdiff = 100.0*(sm-ob)/sm
+        print(cat,pdiff)
+        all_rates_dict[cat]["Pdiff"] = pdiff
+
     # Dump to the screen text for a latex table
     all_rates_dict_none_errs = append_none_errs(all_rates_dict) # Get a dict that will work for the latex table (i.e. need None for errs)
     mlt.print_latex_yield_table(
@@ -316,7 +329,7 @@ def main():
         print_begin_info=True,
         print_end_info=True,
         column_variable="keys",
-        hz_line_lst=[4,5,11,12,13],
+        hz_line_lst=[5,6,12,13,14,15],
     )
 
     # Save yields to a json
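
For reference, the Pdiff row added above records how far the observation falls below the summed SM expectation, in percent of the expectation. A minimal standalone sketch of the same convention (the function name and the zero guard are illustrative, not part of the patch):

def percent_diff(sum_expected, observation):
    # Mirrors the loop added in main(): pdiff = 100.0*(sm-ob)/sm
    if sum_expected == 0:
        return None  # guard added for illustration; the patch assumes a nonzero expectation
    return 100.0 * (sum_expected - observation) / sum_expected

# e.g. percent_diff(100.0, 90.0) -> 10.0
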
"2lss_4t_p_4j", "2lss_4t_p_5j", "2lss_4t_p_6j", "2lss_4t_p_7j", - ], - "2lss_SR" : [ "2lss_m_4j", "2lss_m_5j", "2lss_m_6j", "2lss_m_7j", "2lss_p_4j", "2lss_p_5j", "2lss_p_6j", "2lss_p_7j", ], - "3l_SR" : [ + "3l_offZ_SR" : [ "3l_m_offZ_1b_2j", "3l_m_offZ_1b_3j", "3l_m_offZ_1b_4j", "3l_m_offZ_1b_5j", "3l_m_offZ_2b_2j", "3l_m_offZ_2b_3j", "3l_m_offZ_2b_4j", "3l_m_offZ_2b_5j", "3l_p_offZ_1b_2j", "3l_p_offZ_1b_3j", "3l_p_offZ_1b_4j", "3l_p_offZ_1b_5j", "3l_p_offZ_2b_2j", "3l_p_offZ_2b_3j", "3l_p_offZ_2b_4j", "3l_p_offZ_2b_5j", + ], + "3l_onZ_SR" : [ "3l_onZ_1b_2j" , "3l_onZ_1b_3j" , "3l_onZ_1b_4j" , "3l_onZ_1b_5j", "3l_onZ_2b_2j" , "3l_onZ_2b_3j" , "3l_onZ_2b_4j" , "3l_onZ_2b_5j", ], @@ -97,6 +97,19 @@ "Data" : [], } +SR_GRP_MAP = { + "Data": [], + "Conv": [], + "Multiboson" : [], + "Nonprompt" : [], + "Flips" : [], + "ttH" : [], + "ttlnu" : [], + "ttll" : [], + "tttt" : [], + "tXq" : [], +} + # Best fit point from TOP-19-001 with madup numbers for the 10 new WCs WCPT_EXAMPLE = { "ctW": -0.74, @@ -187,7 +200,7 @@ def group_bins(histo,bin_map,axis_name="sample",drop_unspecified=False): # Remap the bins old_ax = histo.axis(axis_name) new_ax = hist.Cat(old_ax.name,old_ax.label) - new_histo = histo.group(old_ax,new_ax,bin_map) + new_histo = histo.group(old_ax,new_ax,bin_map,overflow="over") return new_histo @@ -619,6 +632,204 @@ def make_all_sr_sys_plots(dict_of_hists,year,save_dir_path): if "www" in save_dir_path_tmp: make_html(save_dir_path_tmp) +###################### Wrapper function for simple plots ###################### +# Wrapper function to loop over categories and make plots for all variables +def make_simple_plots(dict_of_hists,year,save_dir_path): + + all_samples = yt.get_cat_lables(dict_of_hists,"sample",h_name="njets") + + for idx,var_name in enumerate(dict_of_hists.keys()): + #if var_name == "njets": continue + #if "parton" in var_name: save_tag = "partonFlavour" + #if "hadron" in var_name: save_tag = "hadronFlavour" + #if "hadron" not in var_name: continue + #if var_name != "j0hadronFlavour": continue + if var_name != "j0partonFlavour": continue + + histo_orig = dict_of_hists[var_name] + + # Loop over channels + channels_lst = yt.get_cat_lables(dict_of_hists[var_name],"channel") + for chan_name in channels_lst: + + histo = copy.deepcopy(histo_orig) + + # Normalize the MC hists + sample_lumi_dict = {} + for sample_name in all_samples: + sample_lumi_dict[sample_name] = get_lumi_for_sample(sample_name) + histo.scale(sample_lumi_dict,axis="sample") + + histo = yt.integrate_out_appl(histo,chan_name) + histo = histo.integrate("systematic","nominal") + histo = histo.integrate("channel",chan_name) + + print("\n",chan_name) + print(histo.values()) + summed_histo = histo.sum("sample") + print("sum:",sum(summed_histo.values()[()])) + continue + + # Make a sub dir for this category + save_dir_path_tmp = os.path.join(save_dir_path,save_tag) + if not os.path.exists(save_dir_path_tmp): + os.mkdir(save_dir_path_tmp) + + fig = make_single_fig(histo, unit_norm_bool=False) + title = chan_name + "_" + var_name + fig.savefig(os.path.join(save_dir_path_tmp,title)) + + # Make an index.html file if saving to web area + if "www" in save_dir_path: make_html(save_dir_path_tmp) + + +###################### Wrapper function for SR data and mc plots (unblind!) 
+###################### Wrapper function for SR data and mc plots (unblind!) ######################
+# Wrapper function to loop over all SR categories and make plots for all variables
+def make_all_sr_data_mc_plots(dict_of_hists,year,save_dir_path):
+
+    # Construct list of MC samples
+    mc_wl = []
+    mc_bl = ["data"]
+    data_wl = ["data"]
+    data_bl = []
+    if year is None:
+        pass # Don't think we actually need to do anything here?
+    elif year == "2017":
+        mc_wl.append("UL17")
+        data_wl.append("UL17")
+    elif year == "2018":
+        mc_wl.append("UL18")
+        data_wl.append("UL18")
+    elif year == "2016":
+        mc_wl.append("UL16")
+        mc_bl.append("UL16APV")
+        data_wl.append("UL16")
+        data_bl.append("UL16APV")
+    elif year == "2016APV":
+        mc_wl.append("UL16APV")
+        data_wl.append("UL16APV")
+    else: raise Exception(f"Error: Unknown year \"{year}\".")
+
+    # Get the list of samples we want to plot
+    samples_to_rm_from_mc_hist = []
+    samples_to_rm_from_data_hist = []
+    all_samples = yt.get_cat_lables(dict_of_hists,"sample",h_name="lj0pt")
+    mc_sample_lst = yt.filter_lst_of_strs(all_samples,substr_whitelist=mc_wl,substr_blacklist=mc_bl)
+    data_sample_lst = yt.filter_lst_of_strs(all_samples,substr_whitelist=data_wl,substr_blacklist=data_bl)
+    for sample_name in all_samples:
+        if sample_name not in mc_sample_lst:
+            samples_to_rm_from_mc_hist.append(sample_name)
+        if sample_name not in data_sample_lst:
+            samples_to_rm_from_data_hist.append(sample_name)
+    print("\nAll samples:",all_samples)
+    print("\nMC samples:",mc_sample_lst)
+    print("\nData samples:",data_sample_lst)
+    print("\nVariables:",dict_of_hists.keys())
+
+    # Very hard coded :(
+    for proc_name in mc_sample_lst + data_sample_lst:
+        if "data" in proc_name:
+            SR_GRP_MAP["Data"].append(proc_name)
+        elif "nonprompt" in proc_name:
+            SR_GRP_MAP["Nonprompt"].append(proc_name)
+        elif "flips" in proc_name:
+            SR_GRP_MAP["Flips"].append(proc_name)
+        elif ("ttH" in proc_name):
+            SR_GRP_MAP["ttH"].append(proc_name)
+        elif ("ttlnu" in proc_name):
+            SR_GRP_MAP["ttlnu"].append(proc_name)
+        elif ("ttll" in proc_name):
+            SR_GRP_MAP["ttll"].append(proc_name)
+        elif (("tllq" in proc_name) or ("tHq" in proc_name)):
+            SR_GRP_MAP["tXq"].append(proc_name)
+        elif ("tttt" in proc_name):
+            SR_GRP_MAP["tttt"].append(proc_name)
+        elif "TTG" in proc_name:
+            SR_GRP_MAP["Conv"].append(proc_name)
+        elif "WWW" in proc_name or "WWZ" in proc_name or "WZZ" in proc_name or "ZZZ" in proc_name:
+            SR_GRP_MAP["Multiboson"].append(proc_name)
+        elif "WWTo2L2Nu" in proc_name or "ZZTo4L" in proc_name or "WZTo3LNu" in proc_name:
+            SR_GRP_MAP["Multiboson"].append(proc_name)
+        else:
+            raise Exception(f"Error: Process name \"{proc_name}\" is not known.")
+
+    # The analysis bins
+    analysis_bins = {
+        'njets': {
+            '2l': [4,5,6,7,dict_of_hists['njets'].axis('njets').edges()[-1]], # Last bin in topeft.py is 10, this should grab the overflow
+            '3l': [2,3,4,5,dict_of_hists['njets'].axis('njets').edges()[-1]],
+            '4l': [2,3,4,dict_of_hists['njets'].axis('njets').edges()[-1]]
+        }
+    }
+    analysis_bins['ptz'] = [0, 200, 300, 400, 500, dict_of_hists['ptz'].axis('ptz').edges()[-1]]
+    analysis_bins['lj0pt'] = [0, 150, 250, 500, dict_of_hists['lj0pt'].axis('lj0pt').edges()[-1]]
+
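The analysis_bins edges defined above are later handed to coffea's rebin. A minimal sketch of that step, assuming a coffea 0.7 hist with a dense axis named var_name (the variable-width edges come straight from the dict above):

from coffea import hist

def rebin_to_analysis_bins(histo, var_name, edges):
    # Rebin a dense axis onto variable-width analysis bins; the last edge is
    # the histogram's own last edge, so the overflow lands in the final bin.
    new_axis = hist.Bin(var_name, histo.axis(var_name).label, edges)
    return histo.rebin(var_name, new_axis)

# e.g. rebin_to_analysis_bins(hist_mc, "lj0pt", [0, 150, 250, 500, last_edge])
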
+    # Loop over hists and make plots
+    skip_lst = [] # Skip this hist
+    #keep_lst = ["njets","lj0pt","ptz","nbtagsl","nbtagsm","l0pt","j0pt"] # Skip all but these hists
+    for idx,var_name in enumerate(dict_of_hists.keys()):
+        if (var_name in skip_lst): continue
+        #if (var_name not in keep_lst): continue
+        print("\nVariable:",var_name)
+
+        # Extract the MC and data hists
+        hist_mc_orig = dict_of_hists[var_name].remove(samples_to_rm_from_mc_hist,"sample")
+        hist_data_orig = dict_of_hists[var_name].remove(samples_to_rm_from_data_hist,"sample")
+
+        # Loop over channels
+        channels_lst = yt.get_cat_lables(dict_of_hists[var_name],"channel")
+        print("channels:",channels_lst)
+        #for chan_name in channels_lst: # For each channel individually
+        for chan_name in SR_CHAN_DICT.keys():
+
+            #hist_mc = hist_mc_orig.integrate("systematic","nominal").integrate("channel",chan_name) # For each channel individually
+            #hist_data = hist_data_orig.integrate("systematic","nominal").integrate("channel",chan_name) # For each channel individually
+            hist_mc = hist_mc_orig.integrate("systematic","nominal").integrate("channel",SR_CHAN_DICT[chan_name],overflow="over")
+            hist_data = hist_data_orig.integrate("systematic","nominal").integrate("channel",SR_CHAN_DICT[chan_name],overflow="over")
+
+            # Normalize the MC hists
+            sample_lumi_dict = {}
+            for sample_name in mc_sample_lst:
+                sample_lumi_dict[sample_name] = get_lumi_for_sample(sample_name)
+            hist_mc.scale(sample_lumi_dict,axis="sample")
+
+            hist_mc = group_bins(hist_mc,SR_GRP_MAP)
+            hist_data = group_bins(hist_data,SR_GRP_MAP)
+
+            # Make a sub dir for this category
+            save_dir_path_tmp = os.path.join(save_dir_path,chan_name)
+            if not os.path.exists(save_dir_path_tmp):
+                os.mkdir(save_dir_path_tmp)
+
+            # Rebin into analysis bins
+            if var_name in analysis_bins.keys():
+                lep_bin = chan_name[:2]
+                if var_name == "njets":
+                    hist_mc = hist_mc.rebin(var_name, hist.Bin(var_name, hist_mc.axis(var_name).label, analysis_bins[var_name][lep_bin]))
+                    hist_data = hist_data.rebin(var_name, hist.Bin(var_name, hist_data.axis(var_name).label, analysis_bins[var_name][lep_bin]))
+                else:
+                    hist_mc = hist_mc.rebin(var_name, hist.Bin(var_name, hist_mc.axis(var_name).label, analysis_bins[var_name]))
+                    hist_data = hist_data.rebin(var_name, hist.Bin(var_name, hist_data.axis(var_name).label, analysis_bins[var_name]))
+
+            if hist_mc.values() == {}:
+                print("Warning: empty mc histo, continuing")
+                continue
+            if hist_data.values() == {}:
+                print("Warning: empty data histo, continuing")
+                continue
+
+            fig = make_cr_fig(hist_mc, hist_data, unit_norm_bool=False)
+            if year is not None: year_str = year
+            else: year_str = "ULall"
+            title = chan_name + "_" + var_name + "_" + year_str
+            fig.savefig(os.path.join(save_dir_path_tmp,title))
+
+            # Make an index.html file if saving to web area
+            if "www" in save_dir_path_tmp: make_html(save_dir_path_tmp)
+
+
+
 ###################### Wrapper function for example SR plots ######################
 # Wrapper function to loop over all SR categories and make plots for all variables
@@ -945,7 +1156,16 @@ def main():
 
     # Make the plots
     make_all_cr_plots(hin_dict,args.year,args.skip_syst,unit_norm_bool,save_dir_path)
     #make_all_sr_plots(hin_dict,args.year,unit_norm_bool,save_dir_path)
+    #make_all_sr_data_mc_plots(hin_dict,args.year,save_dir_path)
     #make_all_sr_sys_plots(hin_dict,args.year,save_dir_path)
+    #make_simple_plots(hin_dict,args.year,save_dir_path)
+
+    # Make unblinded SR data MC comparison plots by year
+    #make_all_sr_data_mc_plots(hin_dict,"2016",save_dir_path)
+    #make_all_sr_data_mc_plots(hin_dict,"2016APV",save_dir_path)
+    #make_all_sr_data_mc_plots(hin_dict,"2017",save_dir_path)
+    #make_all_sr_data_mc_plots(hin_dict,"2018",save_dir_path)
+    #make_all_sr_data_mc_plots(hin_dict,None,save_dir_path)
 
 if __name__ == "__main__":
     main()
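
make_all_sr_data_mc_plots above selects samples with yt.filter_lst_of_strs. A minimal sketch of the assumed semantics (keep strings containing every whitelist substring and none of the blacklist substrings), which is also why "UL16APV" must be blacklisted when selecting 2016:

def filter_lst_of_strs_sketch(in_lst, substr_whitelist=[], substr_blacklist=[]):
    # Assumed behavior: a string passes if it contains all whitelist
    # substrings and none of the blacklist substrings.
    out_lst = []
    for s in in_lst:
        if all(w in s for w in substr_whitelist) and not any(b in s for b in substr_blacklist):
            out_lst.append(s)
    return out_lst

# "dataUL16APV" contains "UL16", so selecting 2016 requires the blacklist:
# filter_lst_of_strs_sketch(["dataUL16","dataUL16APV"], ["UL16"], ["UL16APV"]) -> ["dataUL16"]
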
diff --git a/topcoffea/modules/YieldTools.py b/topcoffea/modules/YieldTools.py
index 591e876f2..9b40ad51e 100644
--- a/topcoffea/modules/YieldTools.py
+++ b/topcoffea/modules/YieldTools.py
@@ -16,16 +16,43 @@ def __init__(self):
 
         self.CAT_LST = ["2lss_p", "2lss_m", "2lss_4t_p", "2lss_4t_m", "3l_p_offZ_1b", "3l_m_offZ_1b", "3l_p_offZ_2b", "3l_m_offZ_2b", "3l_onZ_1b", "3l_onZ_2b", "4l"]
         self.CAT_LST_TOP19001 = ["2lss_p", "2lss_m", "3l_p_offZ_1b", "3l_m_offZ_1b", "3l_p_offZ_2b", "3l_m_offZ_2b", "3l_onZ_1b", "3l_onZ_2b", "4l"]
 
+        self.SIG = ["ttH","ttlnu","ttll","tllq","tHq","tttt"]
+        self.BKG = ["flips","fakes","conv","VV","VVV","tWZ"]
+        self.DIBOSON = ["WW","WZ","ZZ"]
+        self.TRIBOSON = ["WWW","WWZ","WZZ","ZZZ"]
+
+        self.DATA_MC_COLUMN_ORDER = ["tWZ", "VV", "VVV", "flips", "fakes", "conv", "bkg", "ttlnu", "ttll", "ttH", "tllq", "tHq", "tttt", "sig", "pred", "data", "pdiff"]
+
         # A dictionary mapping names of samples in the samples axis to a short version of the name
         self.PROC_MAP = {
-            "ttlnu" : ["ttW_centralUL17" , "ttlnuJet_privateUL18" , "ttlnuJet_privateUL17" , "ttlnuJet_privateUL16" , "ttlnuJet_privateUL16APV"],
-            "ttll"  : ["ttZ_centralUL17" , "ttllJet_privateUL18" , "ttllJet_privateUL17" , "ttllJet_privateUL16" , "ttllJet_privateUL16APV"],
-            "ttH"   : ["ttH_centralUL17" , "ttHJet_privateUL18" , "ttHJet_privateUL17" , "ttHJet_privateUL16" , "ttHJet_privateUL16APV"],
-            "tllq"  : ["tZq_centralUL17" , "tllq_privateUL18" , "tllq_privateUL17" , "tllq_privateUL16" , "tllq_privateUL16APV"],
-            "tHq"   : ["tHq_central2017" , "tHq_privateUL18" , "tHq_privateUL17" , "tHq_privateUL16" , "tHq_privateUL16APV"],
-            "tttt"  : ["tttt_central2017", "tttt_privateUL18" , "tttt_privateUL17" , "tttt_privateUL16" , "tttt_privateUL16APV"],
+
+            "ttlnu" : ["ttW_centralUL16APV" ,"ttW_centralUL16" ,"ttW_centralUL17" ,"ttW_centralUL18" , "ttlnuJet_privateUL18" , "ttlnuJet_privateUL17" , "ttlnuJet_privateUL16" , "ttlnuJet_privateUL16APV"],
+            "ttll"  : ["ttZ_centralUL16APV" ,"ttZ_centralUL16" ,"ttZ_centralUL17" ,"ttZ_centralUL18" , "ttllJet_privateUL18" , "ttllJet_privateUL17" , "ttllJet_privateUL16" , "ttllJet_privateUL16APV"],
+            "ttH"   : ["ttHJet_centralUL16APV" ,"ttHJet_centralUL16" ,"ttH_centralUL17" ,"ttH_centralUL18" , "ttHJet_privateUL18" , "ttHJet_privateUL17" , "ttHJet_privateUL16" , "ttHJet_privateUL16APV"],
+            "tllq"  : ["tZq_centralUL16APV" ,"tZq_centralUL16" ,"tZq_centralUL17" ,"tZq_centralUL18" , "tllq_privateUL18" , "tllq_privateUL17" , "tllq_privateUL16" , "tllq_privateUL16APV"],
+            "tHq"   : ["tHq_centralUL16APV" ,"tHq_centralUL16" ,"tHq_centralUL17" ,"tHq_centralUL18" , "tHq_privateUL18" , "tHq_privateUL17" , "tHq_privateUL16" , "tHq_privateUL16APV"],
+            "tttt"  : ["tttt_centralUL16APV" ,"tttt_centralUL16" ,"tttt_centralUL17","tttt_centralUL18", "tttt_privateUL18" , "tttt_privateUL17" , "tttt_privateUL16" , "tttt_privateUL16APV"],
+
+            "flips" : ["flipsUL16" ,"flipsUL16APV" ,"flipsUL17" ,"flipsUL18"],
+            "fakes" : ["nonpromptUL16" ,"nonpromptUL16APV" ,"nonpromptUL17" ,"nonpromptUL18"],
+            "conv"  : ["TTGamma_centralUL16" ,"TTGamma_centralUL16APV" ,"TTGamma_centralUL17" ,"TTGamma_centralUL18"],
+            "WW"    : ["WWTo2L2Nu_centralUL16","WWTo2L2Nu_centralUL16APV","WWTo2L2Nu_centralUL17","WWTo2L2Nu_centralUL18"],
+            "WZ"    : ["WZTo3LNu_centralUL16" ,"WZTo3LNu_centralUL16APV" ,"WZTo3LNu_centralUL17" ,"WZTo3LNu_centralUL18"],
+            "ZZ"    : ["ZZTo4L_centralUL16" ,"ZZTo4L_centralUL16APV" ,"ZZTo4L_centralUL17" ,"ZZTo4L_centralUL18"],
+            "WWW"   : ["WWW_4F_centralUL16" ,"WWW_centralUL16APV" ,"WWW_centralUL17" ,"WWW_4F_centralUL18"],
+            "WWZ"   : ["WWZ_4F_centralUL16" ,"WWZ_4F_centralUL16APV" ,"WWZ_centralUL17" ,"WWZ_4F_centralUL18"],
+            "WZZ"   : ["WZZ_centralUL16" ,"WZZ_centralUL16APV" ,"WZZ_centralUL17" ,"WZZ_centralUL18"],
+            "ZZZ"   : ["ZZZ_centralUL16" ,"ZZZ_centralUL16APV" ,"ZZZ_centralUL17" ,"ZZZ_centralUL18"],
+            "tWZ"   : ["TWZToLL_centralUL16" ,"TWZToLL_centralUL16APV" ,"TWZToLL_centralUL17" ,"TWZToLL_centralUL18"],
+
+            "ttZlowMll" : ["TTZToLL_M1to10_centralUL16" ,"TTZToLL_M1to10_centralUL16APV" ,"TTZToLL_M1to10_centralUL17" ,"TTZToLL_M1to10_centralUL18"],
+            "ttbarll"   : ["TTTo2L2Nu_centralUL16", "TTTo2L2Nu_centralUL16APV", "TTTo2L2Nu_centralUL17", "TTTo2L2Nu_centralUL18"],
+            "ttbarsl"   : ["TTToSemiLeptonic_centralUL16", "TTToSemiLeptonic_centralUL16APV", "TTToSemiLeptonic_centralUL17", "TTToSemiLeptonic_centralUL18"],
+
+            "data" : ["dataUL16","dataUL16APV","dataUL17","dataUL18"],
         }
+
         # The jet bins we define for the lep categories (Not currently used)
         self.JET_BINS = {
             "2lss" : [4,5,6,7],
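
PROC_MAP above maps each short process name to every per-year sample name it covers; get_short_name (used later in get_yld_dict) presumably performs the reverse lookup. A sketch of that assumed lookup, not the actual implementation:

def get_short_name_sketch(proc_map, long_name):
    # Find which PROC_MAP key lists this sample name; fall back to the
    # long name itself if nothing matches.
    for short_name, long_name_lst in proc_map.items():
        if long_name in long_name_lst:
            return short_name
    return long_name

# e.g. get_short_name_sketch(PROC_MAP, "TWZToLL_centralUL17") -> "tWZ"
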
["WZZ_centralUL16" ,"WZZ_centralUL16APV" ,"WZZ_centralUL17" ,"WZZ_centralUL18" ], + "ZZZ" : ["ZZZ_centralUL16" ,"ZZZ_centralUL16APV" ,"ZZZ_centralUL17" ,"ZZZ_centralUL18" ], + "tWZ" : ["TWZToLL_centralUL16" ,"TWZToLL_centralUL16APV" ,"TWZToLL_centralUL17" ,"TWZToLL_centralUL18" ], + + "ttZlowMll" : ["TTZToLL_M1to10_centralUL16" ,"TTZToLL_M1to10_centralUL16APV" ,"TTZToLL_M1to10_centralUL17" ,"TTZToLL_M1to10_centralUL18" ], + "ttbarll" : ["TTTo2L2Nu_centralUL16", "TTTo2L2Nu_centralUL16APV", "TTTo2L2Nu_centralUL17", "TTTo2L2Nu_centralUL18"], + "ttbarsl" : ["TTToSemiLeptonic_centralUL16", "TTToSemiLeptonic_centralUL16APV", "TTToSemiLeptonic_centralUL17", "TTToSemiLeptonic_centralUL18"], + + "data" : ["dataUL16","dataUL16APV","dataUL17","dataUL18"], } + # The jet bins we define for the lep categories (Not currently used) self.JET_BINS = { "2lss" : [4,5,6,7], @@ -43,60 +70,150 @@ def __init__(self): # Yields from TOP-19-001 AN table 15 self.TOP19001_YLDS = { "ttlnu" : { - "2lss_p" : (68.7,None), - "2lss_m" : (37.1,None), + "2lss_4t_p" : (0,None), + "2lss_4t_m" : (0,None), + "2lss_p" : (68.7,None), + "2lss_m" : (37.1,None), "3l_p_offZ_1b" : (14.4,None), "3l_m_offZ_1b" : (8.0,None), "3l_p_offZ_2b" : (10.8,None), "3l_m_offZ_2b" : (5.9,None), - "3l_onZ_1b" : (2.9,None), - "3l_onZ_2b" : (2.3,None), - "4l" : (0.0,None), + "3l_onZ_1b" : (2.9,None), + "3l_onZ_2b" : (2.3,None), + "4l" : (0.0,None), }, "ttll" : { - "2lss_p" : (19.3,None), - "2lss_m" : (19.0,None), + "2lss_4t_p" : (0,None), + "2lss_4t_m" : (0,None), + "2lss_p" : (19.3,None), + "2lss_m" : (19.0,None), "3l_p_offZ_1b" : (12.7,None), "3l_m_offZ_1b" : (13.3,None), "3l_p_offZ_2b" : (9.1,None), "3l_m_offZ_2b" : (8.5,None), - "3l_onZ_1b" : (95.5,None), - "3l_onZ_2b" : (63.2,None), - "4l" : (9.4,None), + "3l_onZ_1b" : (95.5,None), + "3l_onZ_2b" : (63.2,None), + "4l" : (9.4,None), }, "ttH" : { - "2lss_p" : (24.7,None), - "2lss_m" : (24.1,None), + "2lss_4t_p" : (0,None), + "2lss_4t_m" : (0,None), + "2lss_p" : (24.7,None), + "2lss_m" : (24.1,None), "3l_p_offZ_1b" : (7.9,None), "3l_m_offZ_1b" : (7.6,None), "3l_p_offZ_2b" : (5.1,None), "3l_m_offZ_2b" : (5.2,None), - "3l_onZ_1b" : (3.2,None), - "3l_onZ_2b" : (2.2,None), - "4l" : (1.0,None), + "3l_onZ_1b" : (3.2,None), + "3l_onZ_2b" : (2.2,None), + "4l" : (1.0,None), }, "tllq" : { - "2lss_p" : (2.7,None), - "2lss_m" : (1.5,None), + "2lss_4t_p" : (0,None), + "2lss_4t_m" : (0,None), + "2lss_p" : (2.7,None), + "2lss_m" : (1.5,None), "3l_p_offZ_1b" : (3.5,None), "3l_m_offZ_1b" : (1.8,None), "3l_p_offZ_2b" : (1.2,None), "3l_m_offZ_2b" : (0.6,None), - "3l_onZ_1b" : (39.8,None), - "3l_onZ_2b" : (13.3,None), - "4l" : (0.0,None), + "3l_onZ_1b" : (39.8,None), + "3l_onZ_2b" : (13.3,None), + "4l" : (0.0,None), }, "tHq" : { - "2lss_p" : (0.8,None), - "2lss_m" : (0.4,None), + "2lss_4t_p" : (0,None), + "2lss_4t_m" : (0,None), + "2lss_p" : (0.8,None), + "2lss_m" : (0.4,None), "3l_p_offZ_1b" : (0.3,None), "3l_m_offZ_1b" : (0.2,None), "3l_p_offZ_2b" : (0.2,None), "3l_m_offZ_2b" : (0.1,None), - "3l_onZ_1b" : (0.1,None), - "3l_onZ_2b" : (0.1,None), - "4l" : (0.0,None), + "3l_onZ_1b" : (0.1,None), + "3l_onZ_2b" : (0.1,None), + "4l" : (0.0,None), + }, + + "VV" : { + "2lss_4t_p" : (0,None), + "2lss_4t_m" : (0,None), + "2lss_p" : (1.6,None), + "2lss_m" : (1.2,None), + "3l_p_offZ_1b" : (5.9,None), + "3l_m_offZ_1b" : (4.7,None), + "3l_p_offZ_2b" : (0.4,None), + "3l_m_offZ_2b" : (0.3,None), + "3l_onZ_1b" : (52.1,None), + "3l_onZ_2b" : (4.1,None), + "4l" : (0.6,None), + }, + "VVV" : { + "2lss_4t_p" : (0,None), + "2lss_4t_m" : 
(0,None), + "2lss_p" : (0.5,None), + "2lss_m" : (0.5,None), + "3l_p_offZ_1b" : (0.2,None), + "3l_m_offZ_1b" : (0.2,None), + "3l_p_offZ_2b" : (0,None), + "3l_m_offZ_2b" : (0.1,None), + "3l_onZ_1b" : (3.5,None), + "3l_onZ_2b" : (0.6,None), + "4l" : (0.1,None), + }, + "flips" : { + "2lss_4t_p" : (0,None), + "2lss_4t_m" : (0,None), + "2lss_p" : (8.5,None), + "2lss_m" : (8.5,None), + "3l_p_offZ_1b" : (0,None), + "3l_m_offZ_1b" : (0,None), + "3l_p_offZ_2b" : (0,None), + "3l_m_offZ_2b" : (0,None), + "3l_onZ_1b" : (0,None), + "3l_onZ_2b" : (0,None), + "4l" : (0,None), + }, + "fakes" : { + "2lss_4t_p" : (0,None), + "2lss_4t_m" : (0,None), + "2lss_p" : (25.6,None), + "2lss_m" : (26.8,None), + "3l_p_offZ_1b" : (11.3,None), + "3l_m_offZ_1b" : (13.0,None), + "3l_p_offZ_2b" : (3.3,None), + "3l_m_offZ_2b" : (2.5,None), + "3l_onZ_1b" : (16.9,None), + "3l_onZ_2b" : (3.8,None), + "4l" : (0,None), }, + "conv" : { + "2lss_4t_p" : (0,None), + "2lss_4t_m" : (0,None), + "2lss_p" : (10.9,None), + "2lss_m" : (9.2,None), + "3l_p_offZ_1b" : (2.3,None), + "3l_m_offZ_1b" : (2.6,None), + "3l_p_offZ_2b" : (1.7,None), + "3l_m_offZ_2b" : (1.9,None), + "3l_onZ_1b" : (0.8,None), + "3l_onZ_2b" : (0.4,None), + "4l" : (0,None), + }, + "data" : { + "2lss_4t_p" : (0,None), + "2lss_4t_m" : (0,None), + "2lss_p" : (192,None), + "2lss_m" : (171,None), + "3l_p_offZ_1b" : (85,None), + "3l_m_offZ_1b" : (64,None), + "3l_p_offZ_2b" : (32,None), + "3l_m_offZ_2b" : (28,None), + "3l_onZ_1b" : (239,None), + "3l_onZ_2b" : (95,None), + "4l" : (12,None), + }, + } @@ -128,7 +245,7 @@ def get_long_name(self,long_name_lst_in,short_name_in): ######### General functions ######### # Get percent difference - def get_pdiff(self,a,b): + def get_pdiff(self,a,b,in_percent=False): #p = (float(a)-float(b))/((float(a)+float(b))/2) if ((a is None) or (b is None)): p = None @@ -136,6 +253,8 @@ def get_pdiff(self,a,b): p = None else: p = (float(a)-float(b))/float(b) + if in_percent: + p = p*100.0 return p # Takes two dictionaries, returns the list of lists [common keys, keys unique to d1, keys unique to d2] @@ -159,6 +278,17 @@ def get_common_keys(self,dict1,dict2): return [common_lst,unique_1_lst,unique_2_lst] + # For a nested dict {k:{subk:v}} reorganizes to be {subk:{k:v}} + def swap_keys_subkeys(self,in_dict): + out_dict = {} + for k in in_dict.keys(): + for subk in in_dict[k].keys(): + if subk not in out_dict: out_dict[subk] = {} + if k in out_dict[subk].keys(): + raise Exception("Cannot invert this dict") + else: + out_dict[subk][k] = in_dict[k][subk] + return out_dict # Get a subset of the elements from a list of strings given a whitelist and/or blacklist of substrings def filter_lst_of_strs(self,in_lst,substr_whitelist=[],substr_blacklist=[]): @@ -188,13 +318,12 @@ def filter_lst_of_strs(self,in_lst,substr_whitelist=[],substr_blacklist=[]): return out_lst + # Get the per lepton e/m factor from e.g. 
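
A quick usage example for the new swap_keys_subkeys helper (yield values invented; the import path assumes the class is exposed as YieldTools):

from topcoffea.modules.YieldTools import YieldTools

yt = YieldTools()
in_dict = {"ttH": {"2lss_p": 24.7, "2lss_m": 24.1}, "tllq": {"2lss_p": 2.7, "2lss_m": 1.5}}
yt.swap_keys_subkeys(in_dict)
# -> {"2lss_p": {"ttH": 24.7, "tllq": 2.7}, "2lss_m": {"ttH": 24.1, "tllq": 1.5}}
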
@@ -188,13 +318,12 @@ def filter_lst_of_strs(self,in_lst,substr_whitelist=[],substr_blacklist=[]):
 
         return out_lst
 
+    # Get the per lepton e/m factor from e.g. eee and mmm yields
     def get_em_factor(self,e_val,m_val,nlep):
         return (e_val/m_val)**(1.0/nlep)
 
-
-
     # Takes a hist, and retruns a list of the axis names
     def get_axis_list(self,histo):
         axis_lst = []
@@ -341,7 +470,7 @@ def integrate_out_appl(self,histo,lep_cat):
     #       subk : (val,err)
     #   }
     # }
-    def get_diff_between_nested_dicts(self,dict1,dict2,difftype):
+    def get_diff_between_nested_dicts(self,dict1,dict2,difftype,inpercent=False):
 
         # Get list of keys common to both dictionaries
         common_keys, d1_keys, d2_keys = self.get_common_keys(dict1,dict2)
@@ -362,7 +491,7 @@ def get_diff_between_nested_dicts(self,dict1,dict2,difftype,inpercent=False):
                 v1,e1 = dict1[k][subk]
                 v2,e1 = dict2[k][subk]
                 if difftype == "percent_diff":
-                    ret_diff = self.get_pdiff(v1,v2)
+                    ret_diff = self.get_pdiff(v1,v2,in_percent=inpercent)
                 elif difftype == "absolute_diff":
                     ret_diff = v1 - v2
                 else:
@@ -372,6 +501,70 @@ def get_diff_between_nested_dicts(self,dict1,dict2,difftype,inpercent=False):
 
         return ret_dict
 
+    # Figures out which year a sample is from, returns the lumi for that year
+    def get_lumi_for_sample(self,sample_name):
+        if "UL17" in sample_name:
+            lumi = 1000.0*get_lumi("2017")
+        elif "UL18" in sample_name:
+            lumi = 1000.0*get_lumi("2018")
+        elif "UL16APV" in sample_name:
+            lumi = 1000.0*get_lumi("2016APV")
+        elif "UL16" in sample_name:
+            # Should not be here unless "UL16APV" not in sample_name
+            lumi = 1000.0*get_lumi("2016")
+        else:
+            raise Exception(f"Error: Cannot determine a year for sample \"{sample_name}\".")
+        return lumi
+
+    # Takes as input a dictionary {"k": {"subk":[val,err]}} and returns {"k":{"subk":val}}
+    def strip_errs(self,in_dict):
+        out_dict = {}
+        for k in in_dict.keys():
+            out_dict[k] = {}
+            for subk in in_dict[k]:
+                out_dict[k][subk] = in_dict[k][subk][0]
+        return out_dict
+
+    # Takes as input a dictionary {"k":{"subk":val}} and returns {"k": {"subk":[val,None]}}
+    def put_none_errs(self,in_dict):
+        out_dict = {}
+        for k in in_dict.keys():
+            out_dict[k] = {}
+            for subk in in_dict[k]:
+                out_dict[k][subk] = [in_dict[k][subk],None]
+        return out_dict
+
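A quick round trip with the two new error helpers (values invented; import path assumed as above):

from topcoffea.modules.YieldTools import YieldTools

yt = YieldTools()
ylds = {"ttH": {"2lss_p": [24.7, 1.2]}}
vals = yt.strip_errs(ylds)      # {"ttH": {"2lss_p": 24.7}}
back = yt.put_none_errs(vals)   # {"ttH": {"2lss_p": [24.7, None]}} -- the original err is gone
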
+    # This function:
+    # - Takes as input yield dict {"proc":{"cat":yld}}
+    # - For each process we combine categories that have the same name except for the given str (e.g. combine across charges if you pass ["p","m"])
+    # - Can also take as input {"proc":{"cat":[yld,err]}}, but note we will not propagate errors, and instead will just replace errors with None
+    def sum_over_cats(self,in_dict,combine_str_lst):
+        out_dict = {}
+        for proc_name in in_dict.keys():
+            tmp_dict = {}
+            for cat_name in in_dict[proc_name]:
+                # Do we have errors or not
+                if isinstance(in_dict[proc_name][cat_name],float): errs = False
+                else: errs = True # Assumes we have [yld,err]
+                # Get the stripped name
+                cat_name_component_lst = cat_name.split("_")
+                for s in combine_str_lst:
+                    if s in cat_name_component_lst: cat_name_component_lst.remove(s)
+                cat_name_stripped = "_".join(cat_name_component_lst)
+                # Add the common categories to the out dict
+                if cat_name_stripped not in tmp_dict:
+                    if errs: tmp_dict[cat_name_stripped] = [in_dict[proc_name][cat_name][0],None]
+                    else   : tmp_dict[cat_name_stripped] = in_dict[proc_name][cat_name]
+                else:
+                    if errs: tmp_dict[cat_name_stripped][0] = tmp_dict[cat_name_stripped][0] + in_dict[proc_name][cat_name][0]
+                    else   : tmp_dict[cat_name_stripped] = tmp_dict[cat_name_stripped] + in_dict[proc_name][cat_name]
+            out_dict[proc_name] = tmp_dict
+
+        return out_dict
+
 
     ######### Functions specifically for getting yields #########
 
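Usage example for the new sum_over_cats, combining the charge-split categories (yields invented; import path assumed as above):

from topcoffea.modules.YieldTools import YieldTools

yt = YieldTools()
ylds = {"ttH": {"2lss_p_4j": 10.0, "2lss_m_4j": 8.0}}
yt.sum_over_cats(ylds, ["p","m"])
# -> {"ttH": {"2lss_4j": 18.0}}; with [yld,err] inputs the err is replaced by None
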
@@ -381,16 +574,20 @@
     # - You pass a process name, and we select just that category from the sample axis
     def get_yield(self,h,proc,overflow_str="none"):
         h_vals = h[proc].values(sumw2=True,overflow=overflow_str)
-        for i,(k,v) in enumerate(h_vals.items()):
-            v_sum = v[0].sum()
-            e_sum = v[1].sum()
-            if i > 0: raise Exception("Why is i greater than 0? The hist is not what this function is expecting. Exiting...")
+        if len(h_vals) != 0: # I.e. dict is not empty, this process exists in this dict
+            for i,(k,v) in enumerate(h_vals.items()):
+                v_sum = v[0].sum()
+                e_sum = v[1].sum()
+                if i > 0: raise Exception("Why is i greater than 0? The hist is not what this function is expecting. Exiting...")
+        else:
+            v_sum = 0.0
+            e_sum = 0.0
         e_sum = np.sqrt(e_sum)
-        return (v_sum,e_sum)
+        return [v_sum,e_sum]
 
     # Integrates out categories, normalizes, then calls get_yield()
-    def get_normalized_yield(self,hin_dict,year,proc,cat_dict,overflow_str,rwgt_pt=None,h_name="ht"):
+    def get_normalized_yield(self,hin_dict,lumi_factor,proc,cat_dict,overflow_str,rwgt_pt=None,h_name="ht"):
 
         # Integrate out cateogries
         h = hin_dict[h_name]
@@ -404,9 +601,9 @@ def get_normalized_yield(self,hin_dict,lumi_factor,proc,cat_dict,overflow_str,rwgt_pt=None,h_name="ht"):
 
         h.set_sm()
 
-        # Scale by lumi
-        lumi = 1000.0*get_lumi(year)
-        h.scale(lumi)
+        # Scale the mc by lumi
+        if "data" not in proc:
+            h.scale(lumi_factor)
 
         return self.get_yield(h,proc,overflow_str)
 
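The reworked get_yield above now returns [0.0, 0.0] when the requested process is missing from the histogram; previously an empty .values() dict left v_sum and e_sum undefined and the function raised a NameError. A sketch of the new fallback, operating on the {sparse_key: (sumw, sumw2)} dict that Hist.values(sumw2=True) returns:

import numpy as np

def sum_vals_sketch(h_vals):
    # An empty dict falls through to zeros instead of leaving v_sum undefined.
    v_sum, e_sum = 0.0, 0.0
    for i, (k, v) in enumerate(h_vals.items()):
        if i > 0:
            raise Exception("Expected exactly one sparse bin after integration")
        v_sum = v[0].sum()
        e_sum = v[1].sum()
    return [v_sum, np.sqrt(e_sum)]
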
@@ -415,7 +612,7 @@
     # - Takes as input a hist dict (i.e. what the processor outptus)
     # - Returns a dictionary of yields for the categories in the "channel" axis
     # - Optionally sums over njets or lep flavs
-    def get_yld_dict(self,hin_dict,year,njets=False,lepflav=False):
+    def get_yld_dict(self,hin_dict,year=None,njets=False,lepflav=False):
 
         # Check for options that do not make sense
         if lepflav and not self.is_split_by_lepflav(hin_dict):
@@ -423,24 +620,39 @@ def get_yld_dict(self,hin_dict,year=None,njets=False,lepflav=False):
 
         # If we want to seperate by njets, don't use njets hist since njets are not in it's sparse axis
         hist_to_use = "njets"
-        if njets: hist_to_use = "ht"
+        if njets: hist_to_use = "lj0pt"
 
         # Get the cat dict (that we will integrate over)
         cat_dict = {}
         for ch in self.get_cat_lables(hin_dict,"channel",h_name=hist_to_use):
             cat_dict[ch] = {}
             nlep_str = ch.split("_")[0]
-            cat_dict[ch]["appl"] = self.APPL_DICT[nlep_str]
+            if "appl" in self.get_axis_list(hin_dict[hist_to_use]):
+                cat_dict[ch]["appl"] = self.APPL_DICT[nlep_str]
             cat_dict[ch]["channel"] = ch
 
         # Find the yields
         yld_dict = {}
         proc_lst = self.get_cat_lables(hin_dict,"sample")
+        if "flipsUL17" not in proc_lst: proc_lst = proc_lst + ["flipsUL16","flipsUL16APV","flipsUL17","flipsUL18"] # Very bad workaround for _many_ reasons
+        print("proc_lst",proc_lst)
         for proc in proc_lst:
+            p = self.get_short_name(proc)
+            print("Name:",p,proc) # Print what name the sample has been matched to
+
+        for proc in proc_lst:
+            if year is not None:
+                if not proc.endswith(year): continue
+            lumi_factor = self.get_lumi_for_sample(proc)
             proc_name_short = self.get_short_name(proc)
-            yld_dict[proc_name_short] = {}
-            for cat,cuts_dict in cat_dict.items():
-                yld_dict[proc_name_short][cat] = self.get_normalized_yield(hin_dict,year,proc,cuts_dict,overflow_str="over",h_name=hist_to_use) # Important to keep overflow
+            if proc_name_short not in yld_dict:
+                yld_dict[proc_name_short] = {}
+                for cat,cuts_dict in cat_dict.items():
+                    yld_dict[proc_name_short][cat] = self.get_normalized_yield(hin_dict,lumi_factor,proc,cuts_dict,overflow_str="over",h_name=hist_to_use) # Important to keep overflow
+            else:
+                for cat,cuts_dict in cat_dict.items():
+                    yld_dict[proc_name_short][cat][0] += self.get_normalized_yield(hin_dict,lumi_factor,proc,cuts_dict,overflow_str="over",h_name=hist_to_use)[0] # Important to keep overflow
+                    yld_dict[proc_name_short][cat][1] = None # Ok, let's just forget the sumw2...
 
         # If the file is split by lepton flav, but we don't want that, sum over lep flavors:
         if self.is_split_by_lepflav(hin_dict) and not lepflav:
@@ -449,6 +661,9 @@ def get_yld_dict(self,hin_dict,year=None,njets=False,lepflav=False):
             yld_dict = self.sum_over_lepcats(yld_dict)
 
         return yld_dict
 
+
+    ######### Functions specifically for manipulating the "yld_dict" object #########
+
     # This function takes as input a yld_dict that's seperated by lep flavor, and sums categories over lepton flavor
     def sum_over_lepcats(self,yld_dict):
@@ -511,6 +726,108 @@ def find_relative_contributions(self,yld_dict):
 
 
 
+    # This function is a tool to sum a subset of the values in a dict (a subdict of the yield_dict)
+    # - Assumes all keys in keys_to_sum list are in the dict
+    # - Assumes the keys are tuples of val,err and we ignore the err in the sum
+    def get_subset_sum(self,in_dict,keys_to_sum,skip_missing_keys=False):
+        out_vals = {}
+        for k in keys_to_sum:
+            if k not in in_dict:
+                if skip_missing_keys:
+                    print(f"Warning: key {k} is missing from dict with keys {list(in_dict.keys())}")
+                    continue
+                else:
+                    raise Exception(f"Error: key {k} is missing from dict with keys {list(in_dict.keys())}")
+            for subk in in_dict[k].keys():
+                if subk not in out_vals:
+                    out_vals[subk] = [in_dict[k][subk][0],None]
+                else:
+                    out_vals[subk][0] = out_vals[subk][0] + in_dict[k][subk][0]
+        return out_vals
+
+    # This function
+    # - Takes as input a yld_dict
+    # - Combines the bkg processes (e.g. combine all diboson processes)
+    # - Returns a new yld dict with these combined processes
+    def comb_bkg_procs(self,yld_dict):
+        ret_dict = {}
+        for proc in self.BKG:
+            if proc in yld_dict.keys():
+                ret_dict[proc] = yld_dict[proc]
+            else:
+                if proc == "VV":
+                    ret_dict["VV"] = self.get_subset_sum(yld_dict,self.DIBOSON)
+                if proc == "VVV":
+                    ret_dict["VVV"] = self.get_subset_sum(yld_dict,self.TRIBOSON)
+        for proc in self.SIG + ["data"]:
+            # Pass through
+            ret_dict[proc] = yld_dict[proc]
+
+        return ret_dict
+
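Usage example for get_subset_sum, here doing the diboson combination that comb_bkg_procs performs (yields invented; import path assumed as above):

from topcoffea.modules.YieldTools import YieldTools

yt = YieldTools()
ylds = {
    "WW": {"3l_onZ_1b": [1.0, None]},
    "WZ": {"3l_onZ_1b": [50.0, None]},
    "ZZ": {"3l_onZ_1b": [1.0, None]},
}
yt.get_subset_sum(ylds, yt.DIBOSON)  # keys_to_sum = ["WW","WZ","ZZ"]
# -> {"3l_onZ_1b": [52.0, None]}; errors are intentionally not propagated
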
+    # This function
+    # - Takes as input a yld_dict
+    # - Assumes that the yld dict has all of the processes relevant for the analysis (sig, bkg, data)
+    # - Calculates the sub category sums
+    # - Returns a new yld dict with these combined sums
+    def sum_over_ana_procs(self,yld_dict,skip4t=False,comb2lss=False):
+
+        # Fill the out dict with the orig set of processes
+        ret_dict = {}
+        for proc in yld_dict.keys():
+            if skip4t and proc == "tttt": continue
+            ret_dict[proc] = yld_dict[proc]
+
+        # Get sums and add those to the dict
+        bkg_lst = copy.deepcopy(self.BKG)
+        sig_lst = copy.deepcopy(self.SIG)
+        if skip4t:
+            sig_lst.remove("tttt")
+        ret_dict["bkg"] = self.get_subset_sum(yld_dict,bkg_lst)
+        ret_dict["sig"] = self.get_subset_sum(yld_dict,sig_lst)
+        ret_dict["pred"] = self.get_subset_sum(yld_dict,sig_lst+bkg_lst)
+
+        # Note that this assumes the 2lss_p, 2lss_m, 2lss_4t_p, and 2lss_4t_m keys are in the dict
+        # Adds the 4t category to the regular 2lss cat
+        if comb2lss:
+            ret_dict_comb_2lss = {}
+            for proc_name in ret_dict.keys():
+                ret_dict_comb_2lss[proc_name] = {}
+                for cat_name in ret_dict[proc_name].keys():
+                    val1 = ret_dict[proc_name][cat_name][0]
+                    val2 = None
+                    if cat_name == "2lss_p":
+                        val2 = ret_dict[proc_name]["2lss_4t_p"][0]
+                    elif cat_name == "2lss_m":
+                        val2 = ret_dict[proc_name]["2lss_4t_m"][0]
+                    elif cat_name == "2lss_4t_p" or cat_name == "2lss_4t_m":
+                        # We won't have keys in the out dict for this cat
+                        continue
+
+                    # Here number+None=number, and None+None=None
+                    if val1 is None and val2 is None:
+                        val = None
+                    elif val1 is None:
+                        val = val2
+                    elif val2 is None:
+                        val = val1
+                    else:
+                        val = val1 + val2
+                    ret_dict_comb_2lss[proc_name][cat_name] = (val,None)
+
+            ret_dict = ret_dict_comb_2lss
+
+        ret_dict["pdiff"] = {}
+        for cat in ret_dict["pred"].keys():
+            obs = ret_dict["data"][cat][0]
+            exp = ret_dict["pred"][cat][0]
+            pdiff = self.get_pdiff(exp,obs,in_percent=True)
+            ret_dict["pdiff"][cat] = [pdiff,None]
+
+        return ret_dict
+
+
     ######### Functions that just print out information #########