diff --git a/.github/workflows/fitting_check.yml b/.github/workflows/fitting_check.yml new file mode 100644 index 0000000..b16e40c --- /dev/null +++ b/.github/workflows/fitting_check.yml @@ -0,0 +1,61 @@ +name: Validate CMS Open Data ttbar analysis + +on: + push: + branches: + - main + pull_request: + branches: + - main + +jobs: + run-cms-open-data-ttbar-analysis: + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v3 + + - name: Set up ROOT environment + run: | + sudo apt-get update + sudo apt-get install -y dpkg-dev cmake g++ gcc binutils libx11-dev libncurses5-dev libssl-dev libxpm-dev \ libxft-dev libxml2-dev libz-dev libxext-dev python3-dev git libtbb-dev libgif-dev xrootd-client python3 + pip install numpy plotting distributed tqdm uproot + wget https://root.cern/download/root_v6.32.04.Linux-ubuntu22.04-x86_64-gcc11.4.tar.gz + tar -xzvf root_v6.32.04.Linux-ubuntu22.04-x86_64-gcc11.4.tar.gz + source root/bin/thisroot.sh + echo "ROOT is set up" + + - name: Run Analysis + run: | + source root/bin/thisroot.sh + cd analyses/cms-open-data-ttbar/ + ./validate | tee output.txt + + - name: Compare histograms validation output with expected + id: histograms + run: | + cd analyses/cms-open-data-ttbar/ + if grep -q "Test failed: Histograms validation output does not match expected result." output.txt; then + echo "Histograms validation failed." + echo "RESULT_HISTOGRAMS=fail" >> $GITHUB_ENV + exit 1 + else + echo "Histograms validation passed." + echo "RESULT_HISTOGRAMS=pass" >> $GITHUB_ENV + fi + + - name: Compare fitResults validation output with expected + id: fitresults + run: | + cd analyses/cms-open-data-ttbar/ + if grep -q "Test failed: fitResults validation output does not match expected result." output.txt; then + echo "fitResults validation failed." + echo "RESULT_FITRESULTS=fail" >> $GITHUB_ENV + exit 1 + else + echo "fitResults validation passed." + echo "RESULT_FITRESULTS=pass" >> $GITHUB_ENV + fi + diff --git a/.github/workflows/validation/histograms_1_file_validation_reference.yml b/.github/workflows/validation/histograms_1_file_validation_reference.yml new file mode 100644 index 0000000..d57629f --- /dev/null +++ b/.github/workflows/validation/histograms_1_file_validation_reference.yml @@ -0,0 +1,2 @@ +Validating 'histograms.root' against reference 'reference/histos_1_file_per_process.json'... +All good!
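Both comparison steps key off fixed sentinel strings that the validate script prints into output.txt. The same checks can be reproduced locally without the workflow; a minimal sketch (the helper itself is hypothetical, only the sentinel strings are taken from the steps above):

```python
from pathlib import Path

def check_validation(output_path="output.txt"):
    # mirrors the grep checks in the two workflow steps above
    text = Path(output_path).read_text()
    for what in ("Histograms", "fitResults"):
        failed = f"Test failed: {what} validation output does not match expected result." in text
        print(f"{what} validation {'failed' if failed else 'passed'}.")
        if failed:
            raise SystemExit(1)

check_validation()
```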
diff --git a/.gitignore b/.gitignore index a71018f..29cad71 100644 --- a/.gitignore +++ b/.gitignore @@ -7,3 +7,6 @@ data.json __pycache__ tags +analyses/cms-open-data-ttbar/*.root +analyses/cms-open-data-ttbar/*.table +analyses/cms-open-data-ttbar/statistical_data/* diff --git a/analyses/cms-open-data-ttbar/analysis.py b/analyses/cms-open-data-ttbar/analysis.py index baafae6..7502f85 100644 --- a/analyses/cms-open-data-ttbar/analysis.py +++ b/analyses/cms-open-data-ttbar/analysis.py @@ -4,17 +4,12 @@ from time import time from typing import Tuple -from distributed import Client, get_worker, LocalCluster, SSHCluster import ml -from plotting import save_ml_plots, save_plots import ROOT -from utils import ( - AGCInput, - AGCResult, - postprocess_results, - retrieve_inputs, - save_histos, -) +from distributed import Client, LocalCluster, SSHCluster, get_worker +from plotting import save_ml_plots, save_plots +from statistical import fit_histograms +from utils import AGCInput, AGCResult, postprocess_results, retrieve_inputs, save_histos # Using https://atlas-groupdata.web.cern.ch/atlas-groupdata/dev/AnalysisTop/TopDataPreparation/XSection-MC15-13TeV.data # as a reference. Values are in pb. @@ -90,7 +85,24 @@ def parse_args() -> argparse.Namespace: "--hosts", help="A comma-separated list of worker node hostnames. Only required if --scheduler=dask-ssh, ignored otherwise.", ) - p.add_argument("-v", "--verbose", help="Turn on verbose execution logs.", action="store_true") + p.add_argument( + "-v", + "--verbose", + help="Turn on verbose execution logs.", + action="store_true", + ) + + p.add_argument( + "--statistical-validation", + help=argparse.SUPPRESS, + action="store_true", + ) + + p.add_argument( + "--no-fitting", + help="Do not run statistical validation part of the analysis.", + action="store_true", + ) return p.parse_args() @@ -109,7 +121,11 @@ def create_dask_client(scheduler: str, ncores: int, hosts: str, scheduler_addres sshc = SSHCluster( workers, connect_options={"known_hosts": None}, - worker_options={"nprocs": ncores, "nthreads": 1, "memory_limit": "32GB"}, + worker_options={ + "nprocs": ncores, + "nthreads": 1, + "memory_limit": "32GB", + }, ) return Client(sshc) @@ -128,7 +144,10 @@ def define_trijet_mass(df: ROOT.RDataFrame) -> ROOT.RDataFrame: df = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) > 1") # Build four-momentum vectors for each jet - df = df.Define("Jet_p4", "ConstructP4(Jet_pt_cut, Jet_eta_cut, Jet_phi_cut, Jet_mass_cut)") + df = df.Define( + "Jet_p4", + "ConstructP4(Jet_pt_cut, Jet_eta_cut, Jet_phi_cut, Jet_mass_cut)", + ) # Build trijet combinations df = df.Define("Trijet_idx", "Combinations(Jet_pt_cut, 3)") @@ -186,7 +205,7 @@ def book_histos( # pt_res_up(jet_pt) - jet resolution systematic df = df.Vary( "Jet_pt", - "ROOT::RVec<ROOT::RVecF>{Jet_pt*pt_scale_up(), Jet_pt*jet_pt_resolution(Jet_pt.size())}", + "ROOT::RVec<ROOT::RVecF>{Jet_pt*pt_scale_up(), Jet_pt*jet_pt_resolution(Jet_pt)}", ["pt_scale_up", "pt_res_up"], ) @@ -240,8 +259,7 @@ def book_histos( # Only one b-tagged jet required # The observable is the total transverse momentum # fmt: off - df4j1b = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) == 1")\ .Define("HT", "Sum(Jet_pt_cut)") + df4j1b = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) == 1").Define("HT", "Sum(Jet_pt_cut)") # fmt: on # Define trijet_mass observable for the 4j2b region (this one is more complicated) @@ -251,20 +269,34 @@ def book_histos( results = [] for df, observable, region in zip([df4j1b, df4j2b], ["HT", "Trijet_mass"], ["4j1b", "4j2b"]): histo_model = ROOT.RDF.TH1DModel( - name=f"{region}_{process}_{variation}", title=process, nbinsx=25, xlow=50, xup=550 + name=f"{region}_{process}_{variation}", + title=process, + nbinsx=25, + xlow=50, + xup=550, ) nominal_histo = df.Histo1D(histo_model, observable, "Weights") if variation == "nominal": results.append( AGCResult( - nominal_histo, region, process, variation, nominal_histo, should_vary=True + nominal_histo, + region, + process, + variation, + nominal_histo, + should_vary=True, ) ) else: results.append( AGCResult( - nominal_histo, region, process, variation, nominal_histo, should_vary=False + nominal_histo, + region, + process, + variation, + nominal_histo, + should_vary=False, ) ) print(f"Booked histogram {histo_model.fName}") @@ -292,7 +324,12 @@ def book_histos( if variation == "nominal": ml_results.append( AGCResult( - nominal_histo, feature.name, process, variation, nominal_histo, should_vary=True + nominal_histo, + feature.name, + process, + variation, + nominal_histo, + should_vary=True, ) ) else: @@ -382,7 +419,10 @@ def ml_init(): with create_dask_client(args.scheduler, args.ncores, args.hosts, scheduler_address) as client: for input in inputs: df = ROOT.RDF.Experimental.Distributed.Dask.RDataFrame( - "Events", input.paths, daskclient=client, npartitions=args.npartitions + "Events", + input.paths, + daskclient=client, + npartitions=args.npartitions, ) df._headnode.backend.distribute_unique_paths( [ @@ -426,6 +466,10 @@ def main() -> None: # To only change the verbosity in a given scope, use ROOT.Experimental.RLogScopedVerbosity. ROOT.Detail.RDF.RDFLogChannel().SetVerbosity(ROOT.Experimental.ELogLevel.kInfo) + if args.statistical_validation: + fit_histograms(filename=args.output) + return + inputs: list[AGCInput] = retrieve_inputs( args.n_max_files_per_sample, args.remote_data_prefix, args.data_cache ) @@ -457,6 +501,9 @@ def main() -> None: save_histos([r.histo for r in ml_results], output_fname=output_fname) print(f"Result histograms from ML inference step saved in file {output_fname}") + if not args.no_fitting: + fit_histograms(filename=args.output) + if __name__ == "__main__": main()
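The --statistical-validation switch added above stays functional but is hidden from the -h listing via argparse.SUPPRESS, while --no-fitting remains documented. A self-contained illustration of the pattern:

```python
import argparse

p = argparse.ArgumentParser()
# hidden flag: parses normally, but does not show up in `-h` output
p.add_argument("--statistical-validation", help=argparse.SUPPRESS, action="store_true")
# regular, documented flag
p.add_argument("--no-fitting", help="Do not run statistical validation part of the analysis.", action="store_true")

args = p.parse_args(["--statistical-validation"])
assert args.statistical_validation and not args.no_fitting
```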
diff --git a/analyses/cms-open-data-ttbar/helpers.h b/analyses/cms-open-data-ttbar/helpers.h index 186f083..25f66cf 100644 --- a/analyses/cms-open-data-ttbar/helpers.h +++ b/analyses/cms-open-data-ttbar/helpers.h @@ -10,21 +10,18 @@ #include "TRandom3.h" #include <random> -// functions creating systematic variations -inline double random_gaus() -{ - thread_local std::random_device rd{}; - thread_local std::mt19937 gen{rd()}; - thread_local std::normal_distribution<double> d{1, 0.05}; - return d(gen); -} - -inline ROOT::RVecF jet_pt_resolution(std::size_t size) +inline ROOT::RVecF jet_pt_resolution(const ROOT::RVecF &jet_pt) { // normal distribution with 5% variations, shape matches jets - ROOT::RVecF res(size); - std::generate(std::begin(res), std::end(res), []() - { return random_gaus(); }); + ROOT::RVecF res(jet_pt.size()); + // We need some pseudo-randomness that is thread-safe and at the same time does not depend on an RNG. We use the fact that [jet_pt is in GeV....]. + // We then use the Gaussian quantile to compute the resolution according to the input mean and sigma, using the random bits from the floating-point values. + double mean = 1.; + double sigma = 0.05; + for (std::size_t i = 0; i < jet_pt.size(); ++i) { + res[i] = mean + ROOT::Math::gaussian_quantile(static_cast<double>(0.001 * (static_cast<int>(jet_pt[i] * 1000) % 1000)) + 0.0005, sigma); + } + return res; }
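The new jet_pt_resolution derives a deterministic stand-in for a uniform random number from the sub-GeV digits of each jet pT and maps it to a Gaussian through the quantile function (inverse CDF), so repeated runs and multiple threads see identical "randomness". A line-by-line Python transcription for illustration (the function name is ours, not part of the patch):

```python
import ROOT

def jet_pt_resolution_sketch(jet_pt, mean=1.0, sigma=0.05):
    res = []
    for pt in jet_pt:
        # digits of pt below 1 GeV, shifted to the bin center -> value in (0, 1)
        u = 0.001 * (int(pt * 1000) % 1000) + 0.0005
        # inverse CDF of a Gaussian with width sigma, offset by `mean`
        res.append(mean + ROOT.Math.gaussian_quantile(u, sigma))
    return res

print(jet_pt_resolution_sketch([50.3, 120.7, 245.2]))
```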
diff --git a/analyses/cms-open-data-ttbar/ml.py b/analyses/cms-open-data-ttbar/ml.py index bee9240..38223d5 100644 --- a/analyses/cms-open-data-ttbar/ml.py +++ b/analyses/cms-open-data-ttbar/ml.py @@ -3,7 +3,6 @@ from typing import Tuple import ROOT - from distributed import get_worker # histogram bin lower limit to use for each ML input feature diff --git a/analyses/cms-open-data-ttbar/output_text.txt b/analyses/cms-open-data-ttbar/output_text.txt new file mode 100644 index 0000000..d57629f --- /dev/null +++ b/analyses/cms-open-data-ttbar/output_text.txt @@ -0,0 +1,2 @@ +Validating 'histograms.root' against reference 'reference/histos_1_file_per_process.json'... +All good! diff --git a/analyses/cms-open-data-ttbar/reference/fitResults/fitResults_10_file.root b/analyses/cms-open-data-ttbar/reference/fitResults/fitResults_10_file.root new file mode 100644 index 0000000..bd0da37 Binary files /dev/null and b/analyses/cms-open-data-ttbar/reference/fitResults/fitResults_10_file.root differ diff --git a/analyses/cms-open-data-ttbar/reference/fitResults/fitResults_1_file.root b/analyses/cms-open-data-ttbar/reference/fitResults/fitResults_1_file.root new file mode 100644 index 0000000..cb5bb72 Binary files /dev/null and b/analyses/cms-open-data-ttbar/reference/fitResults/fitResults_1_file.root differ diff --git a/analyses/cms-open-data-ttbar/reference/fitResults/fitResults_5_file.root b/analyses/cms-open-data-ttbar/reference/fitResults/fitResults_5_file.root new file mode 100644 index 0000000..e47e8f9 Binary files /dev/null and b/analyses/cms-open-data-ttbar/reference/fitResults/fitResults_5_file.root differ diff --git a/analyses/cms-open-data-ttbar/reference/fitResults/validate_fit_result.py b/analyses/cms-open-data-ttbar/reference/fitResults/validate_fit_result.py new file mode 100644 index 0000000..c33c57b --- /dev/null +++ b/analyses/cms-open-data-ttbar/reference/fitResults/validate_fit_result.py @@ -0,0 +1,88 @@ +import argparse + +import ROOT + +# Create an argument parser +parser = argparse.ArgumentParser(description="Validate the fit results against the reference.") + +# Number of files per sample, used to select the matching reference file +parser.add_argument('--n-files-per-sample', type=int, required=True, help="Maximum number of files per sample.") + + +def get_fit_result(file_path, fit_result_name): + """Open the ROOT file and retrieve the RooFitResult object.""" + file = ROOT.TFile(file_path) + fit_result = file.Get(fit_result_name) + if not fit_result: + raise ValueError( + f"Fit result '{fit_result_name}' not found in {file_path}" + ) + return fit_result + + +def compare_fit_results(result1, result2): + """Compare the parameter values of two RooFitResults.""" + params1 = result1.floatParsFinal() + params2 = result2.floatParsFinal() + + # Check for the same number of parameters + if params1.getSize() != params2.getSize(): + print( + f"Number of parameters differ: {params1.getSize()} != {params2.getSize()}" + ) + return + + print("Comparing parameters...") + + ERROR = False + + # Loop over parameters in the first result and compare with the second + for i in range(params1.getSize()): + par1 = params1[i] + par2 = params2.find( + par1.GetName() + ) # Find corresponding parameter by name in result2 + + if not par2: + print( + f"Parameter '{par1.GetName()}' not found in the second fit result." + )
+ ERROR = True + continue + + # Compare values and print differences + if abs(par1.getVal() - par2.getVal()) < 1e-6: + print(f"Parameter '{par1.GetName()}' matches: {par1.getVal()}") + else: + print( + f"Parameter '{par1.GetName()}' differs: {par1.getVal()} != {par2.getVal()}" + ) + ERROR = True + + # Optionally compare errors too + if abs(par1.getError() - par2.getError()) > 1e-6: + print( + f"Parameter '{par1.GetName()}' error differs: {par1.getError()} != {par2.getError()}" + ) + ERROR = True + + if ERROR: + print("ERROR: Comparison failed.") + + +args = parser.parse_args() + +number_of_files = args.n_files_per_sample + +# Paths to the .root files and the names of the fit results to compare +file1 = "./fitResults.root" +file2 = f"./reference/fitResults/fitResults_{number_of_files}_file.root" +fit_result_name_1 = "fitResult" # Fit result in first file +fit_result_name_2 = "fitResult" # Fit result in second file + +# Load the fit results from the two files +fit_result_1 = get_fit_result(file1, fit_result_name_1) +fit_result_2 = get_fit_result(file2, fit_result_name_2) + +# Compare the fit results +compare_fit_results(fit_result_1, fit_result_2) diff --git a/analyses/cms-open-data-ttbar/statistical.py b/analyses/cms-open-data-ttbar/statistical.py new file mode 100644 index 0000000..21478a0 --- /dev/null +++ b/analyses/cms-open-data-ttbar/statistical.py @@ -0,0 +1,456 @@ +import os + +import ROOT +from statistical_inference.agc_sample import AGCSample +from statistical_inference.drawer import DrawModel, Visualization +from statistical_inference.rebinning_tool import RebinningTool + + +def fit_histograms(filename=""): + print(f"Starting fitting process using input file: {filename}") + + ROOT.RooMsgService.instance().setGlobalKillBelow(ROOT.RooFit.WARNING) + + os.makedirs("statistical_data", exist_ok=True) + + rebinning_tool = RebinningTool() # Rebinning as extra step + rebinning_tool.set_xmin(110) + rebinning_tool.set_rebin(2) + rebinning_tool.set_input_path(filename) + rebinning_tool.set_output_path("statistical_data/temp_histos.root") + rebinning_tool.apply_rebinning() + + file = ROOT.TFile("statistical_data/HistFactoryExtra.root", "RECREATE") # cleaning the file + file.Close() # a single shared scratch file; one file per systematic would be too many files + + input_file = "statistical_data/temp_histos.root" # input file for the histograms defined below + + meas = ROOT.RooStats.HistFactory.Measurement("meas", "meas") + meas.SetLumi(1.0) + meas.SetLumiRelErr(0.0) +
+ # main difference: the luminosity systematic is defined once and reused for every sample -> no need to redefine it + lumi_systematics = ROOT.RooStats.HistFactory.OverallSys() + lumi_systematics.SetName("Lumi") + lumi_systematics.SetLow(0.97) + lumi_systematics.SetHigh(1.03) + + channel = ROOT.RooStats.HistFactory.Channel("channel_4j1b_CR") + channel.SetData("4j1b_pseudodata", input_file) + channel.SetStatErrorConfig(0.001, "Gaussian") + + ttbar = AGCSample("ttbar", "4j1b_ttbar", input_file) + ttbar.SetSystematicsInputFile(input_file) # using SetInputFile instead would change the input file for the nominal histogram + # the input file from the sample definition could be used as the default + + ttbar.AddOverallSys(lumi_systematics) + ttbar.ActivateStatError() # ROOT does not have a DisableStatError() method, so we have to keep this :( + # "histogram_down" can be skipped here: + ttbar.AddNormPlusShapeHistoSys("ME_variation", histoname_up="4j1b_ttbar_ME_var") + ttbar.AddNormPlusShapeHistoSys( "PS_variation", histoname_up="4j1b_ttbar_PS_var" ) # the histogram path defaults to "" + ttbar.AddNormPlusShapeHistoSys( "tt_scale_var", histoname_up="4j1b_ttbar_scaleup", histoname_down="4j1b_ttbar_scaledown", ) + ttbar.AddNormPlusShapeHistoSys( "jet_energy_scale", histoname_up="4j1b_ttbar_pt_scale_up" ) + ttbar.AddNormPlusShapeHistoSys( "jet_energy_res", histoname_up="4j1b_ttbar_pt_res_up" ) + ttbar.AddNormPlusShapeHistoSys( "b_tag_NP_1", histoname_up="4j1b_ttbar_btag_var_0_up", histoname_down="4j1b_ttbar_btag_var_0_down", ) + ttbar.AddNormPlusShapeHistoSys( "b_tag_NP_2", histoname_up="4j1b_ttbar_btag_var_1_up", histoname_down="4j1b_ttbar_btag_var_1_down", ) + ttbar.AddNormPlusShapeHistoSys( "b_tag_NP_3", histoname_up="4j1b_ttbar_btag_var_2_up", histoname_down="4j1b_ttbar_btag_var_2_down", ) + ttbar.AddNormPlusShapeHistoSys( "b_tag_NP_4", histoname_up="4j1b_ttbar_btag_var_3_up", histoname_down="4j1b_ttbar_btag_var_3_down", ) + ttbar.AddNormFactor("ttbar_norm", 1, 0, 10) + channel.AddSample(ttbar.GetHistFactorySample()) + + wjets = AGCSample("wjets", "4j1b_wjets", input_file) + wjets.SetSystematicsInputFile(input_file) + wjets.ActivateStatError() + wjets.AddOverallSys(lumi_systematics) + wjets.AddNormPlusShapeHistoSys( "jet_energy_scale", histoname_up="4j1b_wjets_pt_scale_up" ) + wjets.AddNormPlusShapeHistoSys( "jet_energy_res", histoname_up="4j1b_wjets_pt_res_up" ) + wjets.AddNormPlusShapeHistoSys( "b_tag_NP_1", histoname_up="4j1b_wjets_btag_var_0_up", histoname_down="4j1b_wjets_btag_var_0_down", ) + wjets.AddNormPlusShapeHistoSys( "b_tag_NP_2", histoname_up="4j1b_wjets_btag_var_1_up", histoname_down="4j1b_wjets_btag_var_1_down", ) + wjets.AddNormPlusShapeHistoSys( "b_tag_NP_3", histoname_up="4j1b_wjets_btag_var_2_up", histoname_down="4j1b_wjets_btag_var_2_down", ) + wjets.AddNormPlusShapeHistoSys( "b_tag_NP_4", histoname_up="4j1b_wjets_btag_var_3_up", histoname_down="4j1b_wjets_btag_var_3_down", ) + wjets.AddNormPlusShapeHistoSys( "w_plus_jets_scale_var", histoname_up="4j1b_wjets_scale_var_up", histoname_down="4j1b_wjets_scale_var_down", ) + channel.AddSample(wjets.GetHistFactorySample()) + + single_top_s_chan = AGCSample( "single_top_s", "4j1b_single_top_s_chan", input_file ) + single_top_s_chan.SetSystematicsInputFile(input_file) + single_top_s_chan.ActivateStatError() + single_top_s_chan.AddOverallSys(lumi_systematics) + single_top_s_chan.AddNormPlusShapeHistoSys( "jet_energy_scale", histoname_up="4j1b_single_top_s_chan_pt_scale_up" ) +
single_top_s_chan.AddNormPlusShapeHistoSys( + "jet_energy_res", histoname_up="4j1b_single_top_s_chan_pt_res_up" + ) + single_top_s_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_1", + histoname_up="4j1b_single_top_s_chan_btag_var_0_up", + histoname_down="4j1b_single_top_s_chan_btag_var_0_down", + ) + single_top_s_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_2", + histoname_up="4j1b_single_top_s_chan_btag_var_1_up", + histoname_down="4j1b_single_top_s_chan_btag_var_1_down", + ) + single_top_s_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_3", + histoname_up="4j1b_single_top_s_chan_btag_var_2_up", + histoname_down="4j1b_single_top_s_chan_btag_var_2_down", + ) + single_top_s_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_4", + histoname_up="4j1b_single_top_s_chan_btag_var_3_up", + histoname_down="4j1b_single_top_s_chan_btag_var_3_down", + ) + + channel.AddSample(single_top_s_chan.GetHistFactorySample()) + + single_top_t_chan = AGCSample( + "single_top_t", "4j1b_single_top_t_chan", input_file + ) + single_top_t_chan.SetSystematicsInputFile(input_file) + single_top_t_chan.ActivateStatError() + single_top_t_chan.AddOverallSys(lumi_systematics) + single_top_t_chan.AddNormPlusShapeHistoSys( + "jet_energy_scale", histoname_up="4j1b_single_top_t_chan_pt_scale_up" + ) + single_top_t_chan.AddNormPlusShapeHistoSys( + "jet_energy_res", histoname_up="4j1b_single_top_t_chan_pt_res_up" + ) + single_top_t_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_1", + histoname_up="4j1b_single_top_t_chan_btag_var_0_up", + histoname_down="4j1b_single_top_t_chan_btag_var_0_down", + ) + single_top_t_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_2", + histoname_up="4j1b_single_top_t_chan_btag_var_1_up", + histoname_down="4j1b_single_top_t_chan_btag_var_1_down", + ) + single_top_t_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_3", + histoname_up="4j1b_single_top_t_chan_btag_var_2_up", + histoname_down="4j1b_single_top_t_chan_btag_var_2_down", + ) + single_top_t_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_4", + histoname_up="4j1b_single_top_t_chan_btag_var_3_up", + histoname_down="4j1b_single_top_t_chan_btag_var_3_down", + ) + + channel.AddSample(single_top_t_chan.GetHistFactorySample()) + + single_top_tW = AGCSample( + "single_top_tW", "4j1b_single_top_tW", input_file + ) + single_top_tW.SetSystematicsInputFile(input_file) + single_top_tW.ActivateStatError() + single_top_tW.AddOverallSys(lumi_systematics) + single_top_tW.AddNormPlusShapeHistoSys( + "jet_energy_scale", histoname_up="4j1b_single_top_tW_pt_scale_up" + ) + single_top_tW.AddNormPlusShapeHistoSys( + "jet_energy_res", histoname_up="4j1b_single_top_tW_pt_res_up" + ) + single_top_tW.AddNormPlusShapeHistoSys( + "b_tag_NP_1", + histoname_up="4j1b_single_top_tW_btag_var_0_up", + histoname_down="4j1b_single_top_tW_btag_var_0_down", + ) + single_top_tW.AddNormPlusShapeHistoSys( + "b_tag_NP_2", + histoname_up="4j1b_single_top_tW_btag_var_1_up", + histoname_down="4j1b_single_top_tW_btag_var_1_down", + ) + single_top_tW.AddNormPlusShapeHistoSys( + "b_tag_NP_3", + histoname_up="4j1b_single_top_tW_btag_var_2_up", + histoname_down="4j1b_single_top_tW_btag_var_2_down", + ) + single_top_tW.AddNormPlusShapeHistoSys( + "b_tag_NP_4", + histoname_up="4j1b_single_top_tW_btag_var_3_up", + histoname_down="4j1b_single_top_tW_btag_var_3_down", + ) + + channel.AddSample(single_top_tW.GetHistFactorySample()) + + meas.AddChannel(channel) + + channel_2b = ROOT.RooStats.HistFactory.Channel("channel_4j2b_SR") + channel_2b.SetData("4j2b_pseudodata", input_file) + channel_2b.SetStatErrorConfig(0.001, "Gaussian") + + 
ttbar = AGCSample("ttbar", "4j2b_ttbar", input_file) + ttbar.AddOverallSys(lumi_systematics) + ttbar.SetSystematicsInputFile(input_file) + ttbar.ActivateStatError() + ttbar.AddNormPlusShapeHistoSys( + "ME_variation", histoname_up="4j2b_ttbar_ME_var" + ) + ttbar.AddNormPlusShapeHistoSys( + "PS_variation", histoname_up="4j2b_ttbar_PS_var" + ) + ttbar.AddNormPlusShapeHistoSys( + "tt_scale_var", + histoname_up="4j2b_ttbar_scaleup", + histoname_down="4j2b_ttbar_scaledown", + ) + ttbar.AddNormPlusShapeHistoSys( + "jet_energy_scale", histoname_up="4j2b_ttbar_pt_scale_up" + ) + ttbar.AddNormPlusShapeHistoSys( + "jet_energy_res", histoname_up="4j2b_ttbar_pt_res_up" + ) + ttbar.AddNormPlusShapeHistoSys( + "b_tag_NP_1", + histoname_up="4j2b_ttbar_btag_var_0_up", + histoname_down="4j2b_ttbar_btag_var_0_down", + ) + ttbar.AddNormPlusShapeHistoSys( + "b_tag_NP_2", + histoname_up="4j2b_ttbar_btag_var_1_up", + histoname_down="4j2b_ttbar_btag_var_1_down", + ) + ttbar.AddNormPlusShapeHistoSys( + "b_tag_NP_3", + histoname_up="4j2b_ttbar_btag_var_2_up", + histoname_down="4j2b_ttbar_btag_var_2_down", + ) + ttbar.AddNormPlusShapeHistoSys( + "b_tag_NP_4", + histoname_up="4j2b_ttbar_btag_var_3_up", + histoname_down="4j2b_ttbar_btag_var_3_down", + ) + ttbar.AddNormFactor("ttbar_norm", 1, 0, 10) + channel_2b.AddSample(ttbar.GetHistFactorySample()) + + wjets = AGCSample("wjets", "4j2b_wjets", input_file) + wjets.SetSystematicsInputFile(input_file) + wjets.ActivateStatError() + wjets.AddOverallSys(lumi_systematics) + wjets.AddNormPlusShapeHistoSys( + "jet_energy_scale", histoname_up="4j2b_wjets_pt_scale_up" + ) + wjets.AddNormPlusShapeHistoSys( + "jet_energy_res", histoname_up="4j2b_wjets_pt_res_up" + ) + wjets.AddNormPlusShapeHistoSys( + "b_tag_NP_1", + histoname_up="4j2b_wjets_btag_var_0_up", + histoname_down="4j2b_wjets_btag_var_0_down", + ) + wjets.AddNormPlusShapeHistoSys( + "b_tag_NP_2", + histoname_up="4j2b_wjets_btag_var_1_up", + histoname_down="4j2b_wjets_btag_var_1_down", + ) + wjets.AddNormPlusShapeHistoSys( + "b_tag_NP_3", + histoname_up="4j2b_wjets_btag_var_2_up", + histoname_down="4j2b_wjets_btag_var_2_down", + ) + wjets.AddNormPlusShapeHistoSys( + "b_tag_NP_4", + histoname_up="4j2b_wjets_btag_var_3_up", + histoname_down="4j2b_wjets_btag_var_3_down", + ) + wjets.AddNormPlusShapeHistoSys( + "w_plus_jets_scale_var", + histoname_up="4j2b_wjets_scale_var_up", + histoname_down="4j2b_wjets_scale_var_down", + ) + channel_2b.AddSample(wjets.GetHistFactorySample()) + + single_top_s_chan = AGCSample( + "single_top_s", "4j2b_single_top_s_chan", input_file + ) + single_top_s_chan.SetSystematicsInputFile(input_file) + single_top_s_chan.ActivateStatError() + single_top_s_chan.AddOverallSys(lumi_systematics) + single_top_s_chan.AddNormPlusShapeHistoSys( + "jet_energy_scale", histoname_up="4j2b_single_top_s_chan_pt_scale_up" + ) + single_top_s_chan.AddNormPlusShapeHistoSys( + "jet_energy_res", histoname_up="4j2b_single_top_s_chan_pt_res_up" + ) + single_top_s_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_1", + histoname_up="4j2b_single_top_s_chan_btag_var_0_up", + histoname_down="4j2b_single_top_s_chan_btag_var_0_down", + ) + single_top_s_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_2", + histoname_up="4j2b_single_top_s_chan_btag_var_1_up", + histoname_down="4j2b_single_top_s_chan_btag_var_1_down", + ) + single_top_s_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_3", + histoname_up="4j2b_single_top_s_chan_btag_var_2_up", + histoname_down="4j2b_single_top_s_chan_btag_var_2_down", + ) + single_top_s_chan.AddNormPlusShapeHistoSys( 
+ "b_tag_NP_4", + histoname_up="4j2b_single_top_s_chan_btag_var_3_up", + histoname_down="4j2b_single_top_s_chan_btag_var_3_down", + ) + channel_2b.AddSample(single_top_s_chan.GetHistFactorySample()) + + single_top_t_chan = AGCSample( + "single_top_t", "4j2b_single_top_t_chan", input_file + ) + single_top_t_chan.SetSystematicsInputFile(input_file) + single_top_t_chan.ActivateStatError() + single_top_t_chan.AddOverallSys(lumi_systematics) + single_top_t_chan.AddNormPlusShapeHistoSys( + "jet_energy_scale", histoname_up="4j2b_single_top_t_chan_pt_scale_up" + ) + single_top_t_chan.AddNormPlusShapeHistoSys( + "jet_energy_res", histoname_up="4j2b_single_top_t_chan_pt_res_up" + ) + single_top_t_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_1", + histoname_up="4j2b_single_top_t_chan_btag_var_0_up", + histoname_down="4j2b_single_top_t_chan_btag_var_0_down", + ) + single_top_t_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_2", + histoname_up="4j2b_single_top_t_chan_btag_var_1_up", + histoname_down="4j2b_single_top_t_chan_btag_var_1_down", + ) + single_top_t_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_3", + histoname_up="4j2b_single_top_t_chan_btag_var_2_up", + histoname_down="4j2b_single_top_t_chan_btag_var_2_down", + ) + single_top_t_chan.AddNormPlusShapeHistoSys( + "b_tag_NP_4", + histoname_up="4j2b_single_top_t_chan_btag_var_3_up", + histoname_down="4j2b_single_top_t_chan_btag_var_3_down", + ) + + channel_2b.AddSample(single_top_t_chan.GetHistFactorySample()) + + single_top_tW = AGCSample( + "single_top_tW", "4j2b_single_top_tW", input_file + ) + single_top_tW.ActivateStatError() + single_top_tW.SetSystematicsInputFile(input_file) + single_top_tW.AddOverallSys(lumi_systematics) + single_top_tW.AddNormPlusShapeHistoSys( + "jet_energy_scale", histoname_up="4j2b_single_top_tW_pt_scale_up" + ) + single_top_tW.AddNormPlusShapeHistoSys( + "jet_energy_res", histoname_up="4j2b_single_top_tW_pt_res_up" + ) + single_top_tW.AddNormPlusShapeHistoSys( + "b_tag_NP_1", + histoname_up="4j2b_single_top_tW_btag_var_0_up", + histoname_down="4j2b_single_top_tW_btag_var_0_down", + ) + single_top_tW.AddNormPlusShapeHistoSys( + "b_tag_NP_2", + histoname_up="4j2b_single_top_tW_btag_var_1_up", + histoname_down="4j2b_single_top_tW_btag_var_1_down", + ) + single_top_tW.AddNormPlusShapeHistoSys( + "b_tag_NP_3", + histoname_up="4j2b_single_top_tW_btag_var_2_up", + histoname_down="4j2b_single_top_tW_btag_var_2_down", + ) + single_top_tW.AddNormPlusShapeHistoSys( + "b_tag_NP_4", + histoname_up="4j2b_single_top_tW_btag_var_3_up", + histoname_down="4j2b_single_top_tW_btag_var_3_down", + ) + + channel_2b.AddSample(single_top_tW.GetHistFactorySample()) + meas.AddChannel(channel_2b) + + meas.SetPOI("ttbar_norm") + meas.CollectHistograms() + + ws = ROOT.RooStats.HistFactory.MakeModelAndMeasurementFast(meas) + + # Retrieve the ModelConfig + modelConfig = ws.obj("ModelConfig") + + # Extract the PDF and global observables + pdf = modelConfig.GetPdf() + globalObservables = ROOT.RooArgSet(modelConfig.GetGlobalObservables()) + + # Perform the fit + result = pdf.fitTo( + ws["obsData"], + Save=True, + PrintLevel=-1, + GlobalObservables=globalObservables, + SumW2Error=False + ) + + vis = Visualization() + vis.CreateAndSaveResultsPicture("fitted_parameters.png", ws) # Parameters fit result + + vis.DrawCorrelationMatrix("correlation_matrix.png", result) # Correlation matrix + + md = DrawModel(meas, ws) + md.Draw(result) + + # save fit results to ROOT file + + with ROOT.TFile.Open("fitResults.root", "RECREATE") as file: + result.Write("fitResult") + + + 
result.Print() diff --git a/analyses/cms-open-data-ttbar/statistical_inference/__init__.py b/analyses/cms-open-data-ttbar/statistical_inference/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/analyses/cms-open-data-ttbar/statistical_inference/agc_sample.py b/analyses/cms-open-data-ttbar/statistical_inference/agc_sample.py new file mode 100644 index 0000000..7be5fdc --- /dev/null +++ b/analyses/cms-open-data-ttbar/statistical_inference/agc_sample.py @@ -0,0 +1,257 @@ +import ROOT + + +class AGCSample: + """ + Class adding extra functionality which is implemented in cabinetry but missing in ROOT's HistFactory. + """ + + def __init__(self, name, histo_name, histo_file, histo_path=""): + self.root_sample = ROOT.RooStats.HistFactory.Sample(name, histo_name, histo_file, histo_path) + self.output_path = "statistical_data/HistFactoryExtra.root" + # since ROOT needs to collect histograms from a file at some point, we store the created histograms in an additional file + # this may change in future ROOT releases -> histograms could then be kept just in RAM + self.fInputFile = "" # default input file for all systematics + + # set the same input file for all provided systematics: HistoSys, HistoFactor, NormPlusShape + def SetSystematicsInputFile(self, file): + self.fInputFile = file + + def GetHistFactorySample(self): + return self.root_sample + + def AddHistoSys( + self, + name, + histoname_up=None, + histofile_up=None, + histopath_up="", # overridden function, so that no additional input file has to be provided + histoname_down=None, + histofile_down=None, + histopath_down="", + ): + + histSys = ROOT.RooStats.HistFactory.HistoSys() + + if histofile_up is None: + histofile_down = self.fInputFile + histofile_up = self.fInputFile + + histSys.SetName(name) + histSys.SetHistoName(self.GetHistFactorySample().GetHistoName()) + histSys.SetHistoPathHigh(histopath_up) + histSys.SetHistoPathLow(histopath_down) + histSys.SetHistoNameHigh(histoname_up) + histSys.SetHistoNameLow(histoname_down) + histSys.SetInputFileHigh(histofile_up) + histSys.SetInputFileLow(histofile_down) + + self.GetHistFactorySample().GetHistoSysList().push_back(histSys) + + def AddHistoFactor( + self, + name, + histoname_up=None, + histofile_up=None, + histopath_up="", # overridden function, so that no additional input file has to be provided + histoname_down=None, + histofile_down=None, + histopath_down="", + ): + + histFactor = ROOT.RooStats.HistFactory.HistoFactor() + + if histofile_up is None: + histofile_down = self.fInputFile + histofile_up = self.fInputFile + + histFactor.SetName(name) + histFactor.SetHistoName(self.GetHistFactorySample().GetHistoName()) + histFactor.SetHistoPathHigh(histopath_up) + histFactor.SetHistoPathLow(histopath_down) + histFactor.SetHistoNameHigh(histoname_up) + histFactor.SetHistoNameLow(histoname_down) + histFactor.SetInputFileHigh(histofile_up) + histFactor.SetInputFileLow(histofile_down) + + self.GetHistFactorySample().GetHistoFactorList().push_back(histFactor) + + def AddShapeSys( + self, + name, + constraint_type=None, + histoname=None, + histofile=None, + histopath="", + ): # overridden function, so that no additional input file has to be provided + + shapeSys = ROOT.RooStats.HistFactory.ShapeSys() + + if histofile is None: + histofile = self.fInputFile + + shapeSys.SetName(name) + shapeSys.SetHistoName(self.GetHistFactorySample().GetHistoName()) + shapeSys.SetHistoPath(histopath) + shapeSys.SetInputFile(histofile) + + shapeSys.SetConstraintType(constraint_type) +
self.GetHistFactorySample().GetShapeSysList().push_back(shapeSys) + + def AddNormPlusShapeHistoSys( + self, + name, + histoname_up=None, + histofile_up=None, + histopath_up="", # check more here: https://github.com/scikit-hep/cabinetry/issues/26 + histoname_down=None, + histofile_down=None, + histopath_down="", + ): + if histofile_up is None: + histofile_up = self.fInputFile + histofile_down = self.fInputFile + assert histofile_up != "", "ERROR: no input file specified for the sample" + + if histoname_down is None: + self.Symmetrize_AddNormPlusShapeHistoSys(name, histoname_up, histofile_up, histopath_up) + else: + self.NonSymmetrize_AddNormPlusShapeHistoSys( + name, + histoname_up, + histofile_up, + histopath_up, + histoname_down, + histofile_down, + histopath_down, + ) + + def Symmetrize_AddNormPlusShapeHistoSys( + self, name, histoname, histofile, histopath + ): + """ + first symmetrize the histogram -> calculate the OverallSys -> save everything to the sample + check more here: + https://github.com/scikit-hep/cabinetry/issues/26 + """ + channel_name = str(histoname.split("_")[0]) + + file = ROOT.TFile(histofile, "READ") + dir = file.GetDirectory(histopath) + hist_top = dir.Get(histoname) + + hist_nominal_file = ROOT.TFile(self.GetHistFactorySample().GetInputFile(), "READ") + hist_nominal_name = self.GetHistFactorySample().GetHistoName() + hist_nominal_directory = hist_nominal_file.GetDirectory(self.GetHistFactorySample().GetHistoPath()) + hist_nominal = hist_nominal_directory.Get(hist_nominal_name) + + norm_factor_up = hist_top.Integral() / hist_nominal.Integral() + h_new = hist_top.Clone(f"{channel_name}_{self.GetHistFactorySample().GetName()}_{name}_norm_plus_shape_up_clone") + h_new.Scale(1 / norm_factor_up) + + h_down = hist_nominal.Clone(f"{channel_name}_{self.GetHistFactorySample().GetName()}_{name}_norm_plus_shape_down_clone") + h_down.Scale(2) + h_down.Add(h_new, -1) + + output_file = ROOT.TFile(self.output_path, "UPDATE") + + hist_up_name = str(f"{channel_name}_{self.GetHistFactorySample().GetName()}_{name}_norm_plus_shape_up") + hist_down_name = str(f"{channel_name}_{self.GetHistFactorySample().GetName()}_{name}_norm_plus_shape_down") + + h_new.Write(hist_up_name) + h_down.Write(hist_down_name) + + output_file.Close() + + histSys = ROOT.RooStats.HistFactory.HistoSys() + + histSys.SetName(name) + histSys.SetHistoPathHigh("") + histSys.SetHistoPathLow("") + + histSys.SetHistoNameHigh(hist_up_name) + histSys.SetHistoNameLow(hist_down_name) + + histSys.SetInputFileLow(self.output_path) + histSys.SetInputFileHigh(self.output_path) + + overallSys = ROOT.RooStats.HistFactory.OverallSys() + overallSys.SetName(name) + overallSys.SetLow(2 - norm_factor_up) + overallSys.SetHigh(norm_factor_up) + + self.GetHistFactorySample().GetHistoSysList().push_back(histSys) + self.GetHistFactorySample().GetOverallSysList().push_back(overallSys)
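Symmetrize_AddNormPlusShapeHistoSys factors a single varied template into a pure normalization shift (exported as an OverallSys) plus an integral-preserving shape template (exported as a HistoSys), mirroring around the nominal histogram to substitute for the missing down variation. The arithmetic on made-up bin contents, with numpy standing in for the TH1 operations:

```python
import numpy as np

nominal = np.array([100.0, 80.0, 60.0])
varied_up = np.array([115.0, 96.0, 54.0])  # hypothetical "up" template

norm_factor_up = varied_up.sum() / nominal.sum()  # normalization part -> OverallSys
shape_up = varied_up / norm_factor_up             # shape part, same integral as nominal
shape_down = 2 * nominal - shape_up               # mirrored "down" shape template

overall_high = norm_factor_up                     # OverallSys high
overall_low = 2 - norm_factor_up                  # OverallSys low
```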
+ def NonSymmetrize_AddNormPlusShapeHistoSys( + self, + name, + histoname_up, + histofile_up, + histopath_up, + histoname_down, + histofile_down, + histopath_down, + ): + channel_name = str(histoname_up.split("_")[0]) + + file = ROOT.TFile(histofile_up, "READ") + dir = file.GetDirectory(histopath_up) + hist_top = dir.Get(histoname_up) + + hist_nominal_file = ROOT.TFile(self.GetHistFactorySample().GetInputFile(), "READ") + hist_nominal_name = self.GetHistFactorySample().GetHistoName() + hist_nominal_directory = hist_nominal_file.GetDirectory(self.GetHistFactorySample().GetHistoPath()) + hist_nominal = hist_nominal_directory.Get(hist_nominal_name) + + norm_factor_up = hist_top.Integral() / hist_nominal.Integral() + h_new_up = hist_top.Clone(f"{channel_name}_{self.GetHistFactorySample().GetName()}_{name}_norm_plus_shape_up_clone") + h_new_up.Scale(1 / norm_factor_up) + + file_down = ROOT.TFile(histofile_down, "READ") + dir_down = file_down.GetDirectory(histopath_down) + hist_down = dir_down.Get(histoname_down) + + norm_factor_down = hist_down.Integral() / hist_nominal.Integral() + h_new_down = hist_down.Clone(f"{channel_name}_{self.GetHistFactorySample().GetName()}_{name}_norm_plus_shape_down_clone") + h_new_down.Scale(1 / norm_factor_down) + + output_file = ROOT.TFile(self.output_path, "UPDATE") + + hist_up_name = str(f"{channel_name}_{self.GetHistFactorySample().GetName()}_{name}_norm_plus_shape_up") + hist_down_name = str(f"{channel_name}_{self.GetHistFactorySample().GetName()}_{name}_norm_plus_shape_down") + + h_new_up.Write(hist_up_name) + h_new_down.Write(hist_down_name) + + output_file.Close() + + histSys = ROOT.RooStats.HistFactory.HistoSys() + + histSys.SetName(name) + histSys.SetHistoPathHigh("") + histSys.SetHistoPathLow("") + + histSys.SetHistoNameHigh(hist_up_name) + histSys.SetHistoNameLow(hist_down_name) + + histSys.SetInputFileLow(self.output_path) + histSys.SetInputFileHigh(self.output_path) + + overallSys = ROOT.RooStats.HistFactory.OverallSys() + overallSys.SetName(name) + overallSys.SetLow(norm_factor_down) + overallSys.SetHigh(norm_factor_up) + + self.GetHistFactorySample().GetHistoSysList().push_back(histSys) + self.GetHistFactorySample().GetOverallSysList().push_back(overallSys) + + def ActivateStatError(self): + self.GetHistFactorySample().ActivateStatError() + + def AddOverallSys(self, overallsys): + self.GetHistFactorySample().AddOverallSys(overallsys) + + def AddNormFactor(self, name, central, left, right): + self.GetHistFactorySample().AddNormFactor(name, central, left, right) diff --git a/analyses/cms-open-data-ttbar/statistical_inference/drawer.py b/analyses/cms-open-data-ttbar/statistical_inference/drawer.py new file mode 100644 index 0000000..b732d41 --- /dev/null +++ b/analyses/cms-open-data-ttbar/statistical_inference/drawer.py @@ -0,0 +1,905 @@ +import numpy as np +import ROOT + + +class Visualization: + def CreateAndSaveResultsPicture(self, filename, fWorkspace): + names = [] + values = [] + errors = [] + + vars = fWorkspace.allVars() + for var in vars: + name = var.GetName() + if "alpha" in name: + if fWorkspace.var(name).getVal() == 0: + continue + names += [name.split("alpha_")[-1]] + values += [fWorkspace.var(name).getVal()] + errors += [fWorkspace.var(name).getError()] + + num = len(names) + + ROOT.gStyle.SetPalette(1) + self.c1 = ROOT.TCanvas("c1", "c1", 1200, 600) + self.c1.SetLeftMargin(0.2) + + self.frame = ROOT.TH2F("self.frame", "", 6, -3, 3, num, 0, num) + + self.frame.Draw("") + + self.box = ROOT.TBox(-2, 0, 2, num) + # external yellow box, which represents a 2 sigma deviation + self.box.SetFillColorAlpha(ROOT.kYellow - 9, 0.8) + self.box.Draw("same A") + + self.box_internal = ROOT.TBox(-1, 0, 1, num) + # internal green box, which represents a 1 sigma deviation + self.box_internal.SetFillColorAlpha(ROOT.kGreen, 0.5) + self.box_internal.Draw("same A") + + self.axis = ROOT.TGaxis( -3, num, 3, num, -3, 3, 510, "-" ) # left Y axis, used for naming the parameters + + self.frame.GetYaxis().SetTickLength( 0.0 ) # set default axis ticks to 0 -> they will be overwritten by manually created ones + + self.graph = ROOT.TGraph() + self.lines = [] + self.right_ticks = [] + self.left_ticks = []
+ self.frame.GetYaxis().SetLabelSize(0.04) + + for i in range(num): + self.graph.SetPoint(i, values[i], i + 0.5) + self.frame.GetYaxis().SetBinLabel(i + 1, names[i]) + + self.lines += [ ROOT.TLine( values[i] - errors[i], i + 0.5, values[i] + errors[i], i + 0.5, ) ] # line representing the parameter error + self.lines[-1].SetLineColor(ROOT.kBlack) + self.lines[-1].SetLineWidth(2) + self.lines[-1].Draw() + + self.left_ticks += [ ROOT.TLine(-3, i + 0.5, -2.95, i + 0.5) ] # left tick, parallel to the parameter error line + self.left_ticks[-1].SetLineColor(ROOT.kBlack) + self.left_ticks[-1].SetLineWidth(1) + self.left_ticks[-1].Draw() + + self.right_ticks += [ ROOT.TLine(2.95, i + 0.5, 3, i + 0.5) ] # right tick, parallel to the parameter error line + self.right_ticks[-1].SetLineColor(ROOT.kBlack) + self.right_ticks[-1].SetLineWidth(1) + self.right_ticks[-1].Draw() + + self.tl = ROOT.TLine(0, 0, 0, num) + self.tl.SetLineStyle(2) + self.tl.Draw() + + self.graph.SetMarkerStyle(20) + self.graph.SetMarkerSize(1) + self.graph.SetMarkerColor(ROOT.kBlack) + + self.graph.Draw("P same") + + self.frame.SetStats(0) + self.axis.SetLabelSize(0.0) + self.axis.Draw() + + ROOT.gPad.RedrawAxis() + + self.c1.SaveAs(filename) + self.c1.Draw() + + def DrawCorrelationMatrix(self, filename, result): + final_parameters = result.floatParsFinal() + corr_matrix_before = result.correlationMatrix() + + number_of_inter_params = 0 + + for i in range(len(final_parameters)): + par = final_parameters.at(i) + if "gamma" in par.GetName(): # skip the per-bin stat-error parameters + continue + + number_of_inter_params += 1 + + name = "CorrelationMatrix for fit results" + + n = corr_matrix_before.GetNcols() + + self.hh = ROOT.TH2D( name, name, number_of_inter_params, 0, number_of_inter_params, number_of_inter_params, 0, number_of_inter_params, ) + + internal_index = 0 + for i in range(n): + + par = final_parameters.at(i) + if "gamma" in par.GetName(): + continue + + internal__internal_index = 0 + for j in range(n): + par = final_parameters.at(j) + if "gamma" in par.GetName(): + continue + self.hh.Fill( internal_index + 0.5, number_of_inter_params - internal__internal_index - 0.5, corr_matrix_before[i][j], ) + internal__internal_index += 1 + + self.hh.GetXaxis().SetBinLabel( internal_index + 1, final_parameters[i].GetName().split("alpha_")[-1], ) + self.hh.GetYaxis().SetBinLabel( number_of_inter_params - internal_index, final_parameters[i].GetName().split("alpha_")[-1], ) + internal_index += 1 + + self.hh.SetMinimum(-1) + self.hh.SetMaximum(+1) + + self.c = ROOT.TCanvas( "self.c", "Canvas", number_of_inter_params * 100, number_of_inter_params * 60, ) + self.hh.Draw("COLZ") + self.hh.SetStats(0) + + ROOT.gStyle.SetPalette(87) + palette = self.hh.GetListOfFunctions().FindObject("palette") + if palette: + palette.SetX1NDC(0.1) # Adjust palette position + palette.SetX2NDC(0.3) # Adjust palette position + + self.c.SaveAs(filename) + self.c.Draw() + + +class DrawModel: + + predefined_colors = [ + ROOT.TColor.GetColor( "#3F90DA" ), # check the chosen colors here: https://arxiv.org/abs/2107.02270 + ROOT.TColor.GetColor("#FFA90E"), + ROOT.TColor.GetColor("#BD1F01"), + ROOT.TColor.GetColor("#94A4A2"), + ROOT.TColor.GetColor("#832DB6"), + ROOT.TColor.GetColor("#A96B59"), + ROOT.TColor.GetColor("#E76300"), + ROOT.TColor.GetColor("#B9AC70"), + ROOT.TColor.GetColor("#717581"), + ROOT.TColor.GetColor("#92DADD"), + ROOT.TColor.GetColor("#D0E1F9"), # Light Azure
ROOT.TColor.GetColor("#E3D6F1"), # Light Violet + ROOT.TColor.GetColor("#000000"), # Black (for contrast) + ROOT.TColor.GetColor("#C0C0C0"), # Gray + ROOT.TColor.GetColor("#F4A3A0"), # Soft Red + ROOT.TColor.GetColor("#9AB9F5"), # Soft Blue + ROOT.TColor.GetColor("#B5E5B0"), # Soft Green + ROOT.TColor.GetColor("#F0A1B2"), # Soft Magenta + ROOT.TColor.GetColor("#B0D6D5"), # Soft Cyan + ROOT.TColor.GetColor("#F3F9A6"), # Soft Yellow + ] + + def GetChannelPrefitGraph(self, number): + return self.hs_stacks[number] + + def GetChannelPostfitGraph(self, number): + return self.second_hs_stacks[number] + + def __init__(self, meas, ws): + self.meas = meas + self.ws = ws + self.cv_array = ( + [] + ) # save all ROOT Graphic objects as self to be able show picture in main notebook + self.hs_stacks = [] + self.sample_histograms = [] + self.bias_graphs = [] + self.bias_second_graphs = [] + self.normal_lines = [] + self.second_histos = [] + self.second_hs_stacks = [] + self.afterfit_histograms = [] + self.error_graphs = [] + self.error_pre_graphs = [] + + def get_yields(self, variable, observables, pdf, result, prefit=False): + + yields = np.zeros(variable.numBins()) + yields_uncert = np.zeros(variable.numBins()) + sample_values = {} + for i_bin in range(variable.numBins()): + variable.setBin(i_bin) + bin_width = variable.getBinWidth(i_bin) + + if prefit: # for prefit original parameters values + fill_array = np.zeros( + ( + pdf.funcList().size(), + len(pdf.getParameters(observables)), + ) + ) + all_params = pdf.getParameters( + observables + ) # get parameters, which required only for interested pdf + param_values = ( + result.floatParsInit() + ) # original parameters values + + for i_sample in range(pdf.funcList().size()): + sample_yield = ROOT.RooProduct( + "tmp", + "tmp", + [pdf.funcList()[i_sample], pdf.coefList()[i_sample]], + ) # get sample from pdf + + yields[i_bin] += ( + bin_width * sample_yield.getVal() + ) # get total bin content per bin + if i_sample in sample_values: + sample_values[i_sample] += [ + bin_width * sample_yield.getVal() + ] # get value for each sample per bin + else: + sample_values[i_sample] = [ + bin_width * sample_yield.getVal() + ] + + for j_parameter, par in enumerate(all_params): + + name = par.GetName() + + original_ind = param_values.index( + param_values.find(name) + ) # index of parameter from pdf in original parameters array + + postfit_cen_val = ( + par.getVal() + ) # saving postfit value to restore it after all calculations + cen_val = param_values[ + original_ind + ].getVal() # getting original value for parameter + par_err = param_values[ + original_ind + ].getError() # getting original error for parameter + + par.setVal(cen_val + par_err) + par_upper_variation = sample_yield.getVal( + observables + ) # get upper variation for sample + par.setVal(cen_val - par_err) + par_bottom_variation = sample_yield.getVal( + observables + ) # get lower variation for sample + + par.setVal(postfit_cen_val) # restore postfit value + + fill_array[i_sample, j_parameter] = ( + par_upper_variation - par_bottom_variation + ) / 2 + + total_uncertanties_per_variation = np.sum( + fill_array, axis=0 + ) # sum by 0 axis to get total uncertainty per variation + total_uncertainty = np.sum( + np.power(total_uncertanties_per_variation, 2), axis=0 + ) # for postfit covariance matrix = identity matrix, so code be bit simplified + yields_uncert[i_bin] = ( + np.sqrt(total_uncertainty) * bin_width + ) # original values is normalized -> multiply by bin width + + else: + all_pdf_params = 
all_pdf_params = pdf.getParameters(observables) + + required_params = [ i for i in all_pdf_params if i.getError() > 0.001 ] # keep only parameters with non-zero errors + + fill_array = np.zeros( (pdf.funcList().size(), len(required_params)) ) + + number_of_parameters = len(required_params) + cov_matrix = result.reducedCovarianceMatrix( required_params ) # reduced covariance matrix for the non-zero parameters only + + numpy_cov_matrix = np.zeros( (number_of_parameters, number_of_parameters) ) + + for i_index in range( number_of_parameters ): # fill and normalize the numpy covariance matrix + for j_index in range(i_index, number_of_parameters): + numpy_cov_matrix[i_index, j_index] = cov_matrix[ i_index, j_index ] / np.sqrt( cov_matrix[i_index, i_index] * cov_matrix[j_index, j_index] ) + numpy_cov_matrix[j_index, i_index] = numpy_cov_matrix[ i_index, j_index ] + + for i_sample in range(pdf.funcList().size()): + sample_yield = ROOT.RooProduct( "tmp", "tmp", [pdf.funcList()[i_sample], pdf.coefList()[i_sample]], ) # get sample from pdf + yields[i_bin] += ( bin_width * sample_yield.getVal() ) # get total bin content per bin + if i_sample in sample_values: + sample_values[i_sample] += [ bin_width * sample_yield.getVal() ] # get value for each sample per bin + else: + sample_values[i_sample] = [ bin_width * sample_yield.getVal() ] + + for j_parameter, par in enumerate(required_params): + + cen_val = ( par.getVal() ) # getting the postfit parameter value + par_err = ( par.getError() ) # getting the postfit parameter error + + par.setVal(cen_val + par_err) + par_upper_variation = sample_yield.getVal( observables ) # getting the upper variation + par.setVal(cen_val - par_err) + par_bottom_variation = sample_yield.getVal( observables ) # getting the lower variation + + par.setVal(cen_val) # restore postfit value + sample_yield.getVal(observables) + + fill_array[i_sample, j_parameter] = ( par_upper_variation - par_bottom_variation ) / 2 + + total_uncertanties_per_variation = np.sum( fill_array, axis=0 ) # sum over axis 0 to get the total uncertainty per variation + + total_uncertainty = np.dot( total_uncertanties_per_variation, np.dot(numpy_cov_matrix, total_uncertanties_per_variation), ) # F * (C * F) + + yields_uncert[i_bin] = np.sqrt(total_uncertainty) * bin_width + + return yields, yields_uncert, sample_values
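Both branches of get_yields propagate the parameter shifts to a per-bin yield uncertainty as sqrt(F * (C * F)): F holds the (up - down)/2 yield shifts per parameter, summed over samples, and C is the normalized covariance (correlation) matrix, which degenerates to the identity in the prefit case. A minimal numpy rendition with invented numbers:

```python
import numpy as np

# per-parameter yield shifts for one bin: (up - down) / 2, summed over samples
F = np.array([0.8, -0.3, 0.1])

# normalized postfit covariance (correlation) matrix of those parameters
C = np.array([[1.0, 0.2, 0.0],
              [0.2, 1.0, -0.1],
              [0.0, -0.1, 1.0]])

bin_uncertainty = np.sqrt(F @ (C @ F))  # prefit: C = identity -> sqrt(sum(F**2))
print(bin_uncertainty)
```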
+ def get_yields_no_fit( self, variable, observables, pdf, result ): # for extra workspaces / variables where no fit was performed + yields = np.zeros(variable.numBins()) + yields_uncert = np.zeros(variable.numBins()) + sample_values = {} + for i_bin in range(variable.numBins()): + variable.setBin(i_bin) + bin_width = variable.getBinWidth(i_bin) + + required_params = pdf.getParameters( observables ) # get all parameters required for the uncertainty calculation + + fit_result_params = ( result.floatParsFinal() ) # postfit parameter values + + fill_array = np.zeros( (pdf.funcList().size(), len(required_params)) ) + + number_of_parameters = len(required_params) + + parameters_indices_map = {} + parameters_for_cov_matrix = [] + + internal_index = 0 + for i_index, par in enumerate( required_params ): # loop over the new pdf's parameters and set their values to the postfit parameter values + par_index = fit_result_params.index( fit_result_params.find(par.GetName()) ) + if par_index == -1: + continue + parameters_indices_map[i_index] = ( internal_index # internal index for copying the covariance matrix ) + internal_index += 1 + parameter_to_be_copied = fit_result_params[par_index] + par.setVal(parameter_to_be_copied.getVal()) + par.setError(parameter_to_be_copied.getError()) + parameters_for_cov_matrix += [ parameter_to_be_copied ] # save the parameters to a list to get the reduced covariance matrix + + cov_matrix = result.reducedCovarianceMatrix( parameters_for_cov_matrix ) + + numpy_cov_matrix = np.zeros( (number_of_parameters, number_of_parameters) ) + + for i_index in range( number_of_parameters ): # fill the covariance matrix using values from the postfit covariance matrix + for j_index in range(i_index, number_of_parameters): + if ( i_index in parameters_indices_map and j_index in parameters_indices_map ): # if both the row and column parameters exist in the postfit results -> copy and normalize + numpy_cov_matrix[i_index, j_index] = cov_matrix[ parameters_indices_map[i_index], parameters_indices_map[j_index], ] / np.sqrt( cov_matrix[ parameters_indices_map[i_index], parameters_indices_map[i_index], ] * cov_matrix[ parameters_indices_map[j_index], parameters_indices_map[j_index], ] ) + numpy_cov_matrix[j_index, i_index] = numpy_cov_matrix[ i_index, j_index ] + else: # if at least one parameter was not present in the postfit results + numpy_cov_matrix[i_index, j_index] = int( i_index == j_index ) + + """ + expected view of the covariance matrix after copying: + var1, var2, var3 were present in the postfit results + var4, var5 are new parameters (mostly per-bin stat errors) + + var1 var2 var3 var4 var5 + var1 1 non0 non0 0 0 + var2 non0 1 non0 0 0 + var3 non0 non0 1 0 0 + var4 0 0 0 1 0 + var5 0 0 0 0 1 + + + """ + + for i_sample in range( pdf.funcList().size() ): # same as in get_yields + sample_yield = ROOT.RooProduct( "tmp", "tmp", [pdf.funcList()[i_sample], pdf.coefList()[i_sample]], ) + yields[i_bin] += bin_width * sample_yield.getVal() + if i_sample in sample_values: + sample_values[i_sample] += [ bin_width * sample_yield.getVal() ] + else: + sample_values[i_sample] = [ bin_width * sample_yield.getVal() ] + + for j_parameter, par in enumerate(required_params): + + cen_val = par.getVal() + par_err = par.getError() + + par.setVal(cen_val + par_err) + par_upper_variation = sample_yield.getVal(observables) + par.setVal(cen_val - par_err) + par_bottom_variation = sample_yield.getVal(observables) + + par.setVal(cen_val) + sample_yield.getVal(observables) + + fill_array[i_sample, j_parameter] = ( par_upper_variation - par_bottom_variation ) / 2 + + total_uncertanties_per_variation = np.sum(fill_array, axis=0) + + total_uncertainty = np.dot( total_uncertanties_per_variation, np.dot(numpy_cov_matrix, total_uncertanties_per_variation), ) + + yields_uncert[i_bin] = np.sqrt(total_uncertainty) * bin_width + + return yields, yields_uncert, sample_values + + def Draw(self, result, no_fit=False): + self.boxes = [] + self.error_boxes = [] + self.error_boxes_prefit = [] + self.data_plots = [] + for channel in self.meas.GetChannels(): + channel_pdf = self.ws[str(channel.GetName()) + "_model"] + obs_var = self.ws["obs_x_" + str(channel.GetName())] + observables = ROOT.RooArgSet(obs_var) + + divide_value = 0.3 # divide the canvas into 4 pads: prefit histogram, postfit histogram, prefit data/bin_value, postfit data/bin_value + + self.cv_array += [ ROOT.TCanvas( "canvas" + str(channel.GetName()), "canvas" + str(channel.GetName()), 1500, 600, ) ] + pad1_upper = ROOT.TPad( "pad1_upper" + str(channel.GetName()), "pad1_upper" + str(channel.GetName()), 0, divide_value, 0.5, 1, ) + pad1_upper.Draw() +
pad1_bottom = ROOT.TPad( "pad1_bottom" + str(channel.GetName()), "pad1_bottom" + str(channel.GetName()), 0, 0, 0.5, divide_value, ) + pad1_bottom.Draw() + + pad2_upper = ROOT.TPad( "pad2_upper" + str(channel.GetName()), "pad2_upper" + str(channel.GetName()), 0.5, divide_value, 1, 1, ) + pad2_upper.Draw() + pad2_bottom = ROOT.TPad( "pad2_bottom" + str(channel.GetName()), "pad2_bottom" + str(channel.GetName()), 0.5, 0, 1, divide_value, ) + pad2_bottom.Draw() + pad1_upper.cd() + + prefit_yields, prefit_unc, prefit_sample_values = self.get_yields( obs_var, observables, channel_pdf, result, prefit=True ) # get the prefit uncertainties for the histogram + + self.hs_stacks += [ ROOT.THStack( "hs" + str(channel.GetName()), "hs" + str(channel.GetName()), ) ] + sample_histograms = [] + + original_sample_bin_values = [ 0 ] * channel.GetData().GetHisto().GetNbinsX() + + for i, sample in enumerate( channel.GetSamples() ): # loop over samples to get the prefit histograms + hist = sample.GetHisto() + hist.SetFillColor(self.predefined_colors[i]) + hist.SetLineColor(ROOT.kBlack) + sample_histograms += [hist] + + for i in range(hist.GetNbinsX()): + original_sample_bin_values[i] += hist.GetBinContent(i + 1) + + self.hs_stacks[-1].Add( sample_histograms[-1] ) # add the histogram to the THStack + + channel_name = "_".join(channel.GetName().split("_")[1:]) + self.hs_stacks[-1].SetTitle(channel_name + " PREFIT") + self.hs_stacks[-1].Draw("hist") + self.hs_stacks[-1].GetXaxis().SetLabelSize(0.06) + + maximum_y_val = ( ROOT.gPad.GetUymax() ) # used for adjusting the upper limit of the Y axis + + bin_index = 1 + + for bin_index in range(1, sample_histograms[-1].GetNbinsX() + 1): + leftEdge = sample_histograms[-1].GetBinLowEdge(bin_index) + binWidth = sample_histograms[-1].GetBinWidth(bin_index) + rightEdge = leftEdge + binWidth + + central_value = original_sample_bin_values[bin_index - 1] + unc = prefit_unc[bin_index - 1] + down_value = central_value - unc + up_value = central_value + unc + + if up_value > maximum_y_val: + maximum_y_val = up_value # if a box exceeds the upper limit -> move the upper limit afterwards + + self.boxes += [ ROOT.TBox(leftEdge, down_value, rightEdge, up_value) ] # draw an uncertainty box for each bin + self.boxes[-1].SetFillStyle(3004) + self.boxes[-1].SetFillColor(ROOT.kGray + 3) + self.boxes[-1].Draw("same") + + self.hs_stacks[-1].SetMaximum( 1.1 * maximum_y_val ) # moving the upper limit if required + + self.data_histogram = ( channel.GetData().GetHisto() ) # getting the data histogram from the channel data + self.data_histogram.SetStats(0) # remove extra text over the histogram + self.data_histogram.SetMarkerStyle(3) # stylization + self.data_histogram.SetMarkerSize(0.5) + + self.data_plots += [ROOT.TGraphErrors()] + for i in range(self.data_histogram.GetNbinsX()): + self.data_plots[-1].SetPoint( i, self.data_histogram.GetBinCenter(i + 1), self.data_histogram.GetBinContent(i + 1), ) + self.data_plots[-1].SetPointError( i, 0, self.data_histogram.GetBinError(i + 1) / 2 ) + + self.data_plots[-1].SetMarkerStyle(8) + self.data_plots[-1].SetMarkerSize(0.5) + self.data_plots[-1].Draw("same p") # draw a TGraph instead of the TH1F + + pad1_bottom.cd() + number_of_bins = self.data_histogram.GetNbinsX() + self.bias_graphs += [ ROOT.TGraphErrors(number_of_bins) ] # TGraph used to store the data/bin_value ratios + self.bias_graphs[-1].SetTitle("") + self.bias_graphs[-1].SetMarkerSize(0.4) + self.bias_graphs[-1].SetMarkerStyle(8)
number_of_bins + 1):
+                original_value = original_sample_bin_values[i - 1]
+                data_value = self.data_histogram.GetBinContent(i)
+                data_error = self.data_histogram.GetBinError(i) / 2
+                self.bias_graphs[-1].SetPoint(
+                    i - 1,
+                    self.data_histogram.GetBinCenter(i),
+                    data_value / original_value,
+                )
+                self.bias_graphs[-1].SetPointError(i - 1, 0, data_error / original_value)
+
+            self.bias_graphs[-1].Draw("AP")
+            self.bias_graphs[-1].GetXaxis().SetLabelSize(0.12)
+            self.bias_graphs[-1].GetYaxis().SetLabelSize(0.08)
+
+            for i in range(1, number_of_bins + 1):
+                original_value = original_sample_bin_values[i - 1]
+                unc = prefit_unc[i - 1]
+                up_value = 1 + unc / original_value
+                down_value = 1 - unc / original_value
+
+                # clamp to the visible range: by default in ROOT a box with
+                # down_value << 0.5 or up_value >> 1.5 would not be drawn,
+                # so handle it manually
+                if down_value < 0.5:
+                    down_value = 0.5
+                if up_value > 1.5:
+                    up_value = 1.5
+
+                # left/right edge of the bin -> left/right edge of the error box
+                leftEdge = self.data_histogram.GetBinLowEdge(i)
+                binWidth = self.data_histogram.GetBinWidth(i)
+                rightEdge = leftEdge + binWidth
+
+                # draw the error boxes
+                self.error_boxes_prefit += [
+                    ROOT.TBox(leftEdge, down_value, rightEdge, up_value)
+                ]
+                self.error_boxes_prefit[-1].SetFillStyle(3004)
+                self.error_boxes_prefit[-1].SetFillColor(ROOT.kGray + 3)
+                self.error_boxes_prefit[-1].Draw("same")
+
+            minimal_bin_value = self.data_histogram.GetBinLowEdge(1)
+            maximum_bin_value = self.data_histogram.GetBinLowEdge(
+                number_of_bins
+            ) + self.data_histogram.GetBinWidth(number_of_bins)
+
+            self.bias_graphs[-1].GetYaxis().SetRangeUser(0.5, 1.5)
+            self.bias_graphs[-1].GetXaxis().SetRangeUser(
+                minimal_bin_value, maximum_bin_value
+            )
+
+            # reference line marking data/prediction = 1
+            self.normal_lines += [
+                ROOT.TLine(minimal_bin_value, 1, maximum_bin_value, 1)
+            ]
+            self.normal_lines[-1].SetLineStyle(2)
+            self.normal_lines[-1].SetLineWidth(1)
+            self.normal_lines[-1].Draw("same")
+
+            pad2_upper.cd()  # same procedure for the postfit column
+
+            if no_fit:
+                # for the extra workspaces use the no-fit variant, which handles
+                # copying all parameters from the fit result
+                (
+                    postfit_yields,
+                    postfit_yields_uncert,
+                    postfit_sample_values,
+                ) = self.get_yields_no_fit(obs_var, observables, channel_pdf, result)
+            else:
+                (
+                    postfit_yields,
+                    postfit_yields_uncert,
+                    postfit_sample_values,
+                ) = self.get_yields(
+                    obs_var, observables, channel_pdf, result, prefit=False
+                )
+
+            self.second_hs_stacks += [ROOT.THStack("fitted stack", "fitted stack")]
+
+            color_number = 0
+
+            for postfit in postfit_sample_values:
+                temp_histo = ROOT.TH1F(
+                    channel_name + "afterfit" + str(postfit),
+                    channel_name + "afterfit" + str(postfit),
+                    len(postfit_yields),
+                    minimal_bin_value,
+                    maximum_bin_value,
+                )
+                temp_histo.SetFillColor(self.predefined_colors[color_number])
+                color_number += 1
+                for bin_index, bin_value in enumerate(postfit_sample_values[postfit], start=1):
+                    temp_histo.SetBinContent(bin_index, bin_value)
+                self.second_hs_stacks[-1].Add(temp_histo)
+
+            self.second_hs_stacks[-1].SetTitle(channel_name + " POSTFIT")
+            self.second_hs_stacks[-1].Draw("hist")
+            self.second_hs_stacks[-1].GetXaxis().SetLabelSize(0.06)
+            self.second_hs_stacks[-1].SetMaximum(1.1 * maximum_y_val)
+
+            for bin_index in range(1, temp_histo.GetNbinsX() + 1):
+                leftEdge =
temp_histo.GetBinLowEdge(bin_index)
+                binWidth = temp_histo.GetBinWidth(bin_index)
+                rightEdge = leftEdge + binWidth
+
+                central_value = postfit_yields[bin_index - 1]
+                unc = postfit_yields_uncert[bin_index - 1]
+                down_value = central_value - unc
+                up_value = central_value + unc
+
+                self.boxes += [ROOT.TBox(leftEdge, down_value, rightEdge, up_value)]
+                self.boxes[-1].SetFillStyle(3004)
+                self.boxes[-1].SetFillColor(ROOT.kGray + 3)
+                self.boxes[-1].Draw("same")
+
+            self.data_plots[-1].Draw("same p")
+
+            self.bias_second_graphs += [ROOT.TGraphErrors(number_of_bins)]
+            self.bias_second_graphs[-1].SetTitle("")
+            self.bias_second_graphs[-1].SetMarkerSize(0.4)
+            self.bias_second_graphs[-1].SetMarkerStyle(8)
+
+            pad2_bottom.cd()
+
+            for i in range(1, number_of_bins + 1):
+                original_value = postfit_yields[i - 1]
+                data_value = self.data_histogram.GetBinContent(i)
+                data_error = self.data_histogram.GetBinError(i) / 2
+
+                self.bias_second_graphs[-1].SetPoint(
+                    i - 1,
+                    self.data_histogram.GetBinCenter(i),
+                    data_value / original_value,
+                )
+                self.bias_second_graphs[-1].SetPointError(
+                    i - 1, 0, data_error / original_value
+                )
+
+            self.bias_second_graphs[-1].Draw("AP")
+            self.bias_second_graphs[-1].GetXaxis().SetLabelSize(0.12)
+            self.bias_second_graphs[-1].GetYaxis().SetLabelSize(0.08)
+
+            for i in range(1, number_of_bins + 1):
+                original_value = postfit_yields[i - 1]
+                unc = postfit_yields_uncert[i - 1]
+                up_value = 1 + unc / original_value
+                down_value = 1 - unc / original_value
+
+                leftEdge = self.data_histogram.GetBinLowEdge(i)
+                binWidth = self.data_histogram.GetBinWidth(i)
+                rightEdge = leftEdge + binWidth
+
+                self.error_boxes += [
+                    ROOT.TBox(leftEdge, down_value, rightEdge, up_value)
+                ]
+                self.error_boxes[-1].SetFillStyle(3004)
+                self.error_boxes[-1].SetFillColor(ROOT.kGray + 3)
+                self.error_boxes[-1].Draw("same")
+
+            minimal_bin_value = self.data_histogram.GetBinLowEdge(1)
+            maximum_bin_value = self.data_histogram.GetBinLowEdge(
+                number_of_bins
+            ) + self.data_histogram.GetBinWidth(number_of_bins)
+
+            self.bias_second_graphs[-1].GetYaxis().SetRangeUser(0.5, 1.5)
+            self.bias_second_graphs[-1].GetXaxis().SetRangeUser(
+                minimal_bin_value, maximum_bin_value
+            )
+
+            self.normal_lines += [
+                ROOT.TLine(minimal_bin_value, 1, maximum_bin_value, 1)
+            ]
+            self.normal_lines[-1].SetLineStyle(2)
+            self.normal_lines[-1].SetLineWidth(1)
+            self.normal_lines[-1].Draw("same")
+
+            self.cv_array[-1].SaveAs(channel.GetName() + "_histo.png")
+            self.cv_array[-1].Draw()
diff --git a/analyses/cms-open-data-ttbar/statistical_inference/rebinning_tool.py b/analyses/cms-open-data-ttbar/statistical_inference/rebinning_tool.py
new file mode 100644
index 0000000..18da1ac
--- /dev/null
+++ b/analyses/cms-open-data-ttbar/statistical_inference/rebinning_tool.py
@@ -0,0 +1,149 @@
+import os
+
+import ROOT
+
+
+class RebinningTool:
+    """
+    Rebinning tool that can rebin histograms and trim their axis edges.
+    It saves the histograms to a new .root file with the same directory
+    structure as the input one.
+
+    set_xmin: set the lower edge for all histograms
+    set_xmax: set the upper edge for all histograms
+    set_rebin: set the rebinning coefficient
+
+    Example usage:
+
+    rebinning_tool = RebinningTool()
+    rebinning_tool.set_xmin(110)  # if xmax is not set -> keep the original one
+    rebinning_tool.set_rebin(2)
+    rebinning_tool.set_input_path("data/histograms.root")
+    rebinning_tool.set_output_path("data/temp_histos.root")
+    rebinning_tool.apply_rebinning()
+    """
+
+    def __init__(self):
+        self.fHistoBinLow = None
+        self.fHistoBinHigh = None
+        self.fRebin = None
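+        # fRebinning stays False until set_rebin() is called; while it is
+        # False, rebin_histogram() returns the input histogram unchanged
+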
self.fRebinning = False
+
+    def set_xmin(self, xmin):
+        self.fHistoBinLow = xmin
+
+    def set_xmax(self, xmax):
+        self.fHistoBinHigh = xmax
+
+    def set_rebin(self, rebin):
+        self.fRebinning = True
+        self.fRebin = rebin
+
+    def list_histograms(self, directory, path=""):
+        keys = directory.GetListOfKeys()
+
+        # loop over all keys
+        for key in keys:
+            # get the object associated with the key
+            obj = key.ReadObj()
+
+            # construct the full path of the object
+            obj_path = os.path.join(path, obj.GetName())
+
+            if obj.InheritsFrom("TDirectory"):
+                # recursively list histograms in subdirectories
+                self.list_histograms(obj, obj_path)
+            elif obj.InheritsFrom("TH1"):
+                # the object is a histogram
+                self.all_histograms.append(obj_path)
+
+    def is_integer(self, value):
+        return int(value) == value
+
+    def rebin_histogram(self, original):
+        if not self.fRebinning:
+            return original
+
+        left_edge = self.fHistoBinLow
+        right_edge = self.fHistoBinHigh
+
+        if left_edge is None:
+            left_edge = original.GetXaxis().GetXmin()
+
+        if right_edge is None:
+            right_edge = original.GetXaxis().GetXmax()
+
+        original_bin_width = original.GetXaxis().GetBinWidth(1)
+        assert_check_left = (left_edge - original.GetXaxis().GetXmin()) / original_bin_width
+        assert_check_right = (original.GetXaxis().GetXmax() - right_edge) / original_bin_width
+
+        if not self.is_integer(assert_check_left) or not self.is_integer(assert_check_right):
+            print("Error: left_edge and right_edge are not multiples of the original bin width")
+            return original
+
+        number_of_removed_bins = (
+            (left_edge - original.GetXaxis().GetXmin())
+            + (original.GetXaxis().GetXmax() - right_edge)
+        ) / original_bin_width
+        new_nbins = int(original.GetNbinsX() - number_of_removed_bins)
+
+        original_new = ROOT.TH1F(
+            original.GetName() + "temp_rebin_clone",
+            original.GetTitle(),
+            new_nbins,
+            left_edge,
+            right_edge,
+        )
+        skipped_bins_left = int((left_edge - original.GetXaxis().GetXmin()) / original_bin_width)
+
+        # copy contents and errors of the kept bins into the trimmed histogram
+        for i in range(1, new_nbins + 1):
+            bin_idx = i + skipped_bins_left
+            original_new.SetBinContent(i, original.GetBinContent(bin_idx))
+            original_new.SetBinError(i, original.GetBinError(bin_idx))
+
+        output = original_new.Rebin(self.fRebin)
+        output.SetDirectory(ROOT.nullptr)
+
+        return output
+
+    def mkdir_p(self, root_file, dir_path):
+        # create the (possibly nested) directory dir_path inside root_file,
+        # reusing any directories that already exist
+        current = root_file
+        for part in dir_path.split("/"):
+            next_dir = current.GetDirectory(part)
+            if not next_dir:
+                next_dir = current.mkdir(part)
+            current = next_dir
+
+    def apply_rebinning(self, input_file_path=None, output_file_path=None):
+        if input_file_path is None:
+            input_file_path = self.input_path
+        if output_file_path is None:
+            output_file_path = self.output_path
+
+        file = ROOT.TFile(input_file_path, "READ")
+
+        output_file = ROOT.TFile(output_file_path, "RECREATE")
+        output_file.Close()
+
+        self.all_histograms = []
+        self.list_histograms(file)
+
+        for hist_path in self.all_histograms:
+            hist = file.Get(hist_path)
+            hist_rebinned = self.rebin_histogram(hist)
+            output_file = ROOT.TFile(output_file_path, "UPDATE")
+
+            if "/" in hist_path:
+                dir_path = hist_path.rsplit("/", 1)[0]
+                self.mkdir_p(output_file, dir_path)
+                output_file.cd(dir_path)
+
+            hist_rebinned.Write(hist_path.split("/")[-1])
+            output_file.Close()
+
+        file.Close()
+
+    def set_input_path(self, input_path):
+        self.input_path = input_path
+
+    def set_output_path(self, output_path):
+        self.output_path = output_path
+        file = ROOT.TFile(self.output_path, "RECREATE")
+        file.Close()
diff --git a/analyses/cms-open-data-ttbar/utils.py b/analyses/cms-open-data-ttbar/utils.py
index d24cc75..3c94ca6 100644
--- a/analyses/cms-open-data-ttbar/utils.py
+++ b/analyses/cms-open-data-ttbar/utils.py
@@ -19,7
+19,9 @@ class AGCInput:
 
 @dataclass
 class AGCResult:
     histo: Union[
-        ROOT.TH1D, ROOT.RDF.RResultPtr[ROOT.TH1D], ROOT.RDF.Experimental.RResultMap[ROOT.TH1D]
+        ROOT.TH1D,
+        ROOT.RDF.RResultPtr[ROOT.TH1D],
+        ROOT.RDF.Experimental.RResultMap[ROOT.TH1D],
     ]
     region: str
     process: str
@@ -57,13 +59,23 @@ def update_to(b=1, bsize=1, tsize=None):
 
 def _cache_files(file_paths: list, cache_dir: str, remote_prefix: str):
     for url in file_paths:
-        out_path = Path(cache_dir) / url.removeprefix(remote_prefix).lstrip("/")
+        out_path = Path(cache_dir) / url.removeprefix(remote_prefix).lstrip(
+            "/"
+        )
         out_path.parent.mkdir(parents=True, exist_ok=True)
         if not out_path.exists():
             with tqdm(
-                unit="B", unit_scale=True, unit_divisor=1024, miniters=1, desc=out_path.name
+                unit="B",
+                unit_scale=True,
+                unit_divisor=1024,
+                miniters=1,
+                desc=out_path.name,
             ) as t:
-                urlretrieve(url, out_path.absolute(), reporthook=_tqdm_urlretrieve_hook(t))
+                urlretrieve(
+                    url,
+                    out_path.absolute(),
+                    reporthook=_tqdm_urlretrieve_hook(t),
+                )
 
 
 def retrieve_inputs(
@@ -87,7 +99,9 @@ def retrieve_inputs(
             continue  # skip data
 
         for variation, sample_info in process_spec.items():
-            sample_info = sample_info["files"]  # this is now a list of (filename, nevents) tuples
+            sample_info = sample_info[
+                "files"
+            ]  # this is now a list of (filename, nevents) tuples
             if max_files_per_sample is not None:
                 sample_info = sample_info[:max_files_per_sample]
@@ -98,16 +112,22 @@ def retrieve_inputs(
             if remote_data_prefix is not None:
                 assert all(f.startswith(prefix) for f in file_paths)
                 old_prefix, prefix = prefix, remote_data_prefix
-                file_paths = [f.replace(old_prefix, prefix) for f in file_paths]
+                file_paths = [
+                    f.replace(old_prefix, prefix) for f in file_paths
+                ]
 
             if data_cache is not None:
                 assert all(f.startswith(prefix) for f in file_paths)
                 _cache_files(file_paths, data_cache, prefix)
                 old_prefix, prefix = prefix, str(Path(data_cache).absolute())
-                file_paths = [f.replace(old_prefix, prefix) for f in file_paths]
+                file_paths = [
+                    f.replace(old_prefix, prefix) for f in file_paths
+                ]
 
             nevents = sum(f["nevts"] for f in sample_info)
-            input_files.append(AGCInput(file_paths, process, variation, nevents))
+            input_files.append(
+                AGCInput(file_paths, process, variation, nevents)
+            )
 
     return input_files
@@ -124,11 +144,19 @@ def postprocess_results(results: list[AGCResult]):
             # just extract the histogram from the RResultPtr
             h = res.histo.GetValue()
             new_results.append(
-                AGCResult(h, res.region, res.process, res.variation, res.nominal_histo)
+                AGCResult(
+                    h,
+                    res.region,
+                    res.process,
+                    res.variation,
+                    res.nominal_histo,
+                )
             )
         else:
            resmap = res.histo
-            assert hasattr(resmap, "GetKeys")  # RResultMap or distrdf equivalent
+            assert hasattr(
+                resmap, "GetKeys"
+            )  # RResultMap or distrdf equivalent
             # extract each histogram in the map
             for variation in resmap.GetKeys():
                 h = resmap[variation]
@@ -137,13 +165,55 @@ def postprocess_results(results: list[AGCResult]):
                 new_name = h.GetName().replace("nominal", variation_name)
                 h.SetName(new_name)
                 new_results.append(
-                    AGCResult(h, res.region, res.process, variation_name, res.nominal_histo)
+                    AGCResult(
+                        h,
+                        res.region,
+                        res.process,
+                        variation_name,
+                        res.nominal_histo,
+                    )
                 )
 
     return new_results
 
 
+def simplify_histo_name(name) -> str:
+    """Strip the nominal tag and any systematic-variation suffix from a histogram name."""
+    if "_nominal" in name:
+        name = name.replace("_nominal", "")
+    if "_Jet" in name:
+        name = name.split("_Jet")[0]
+    if "_Weights" in name:
+        name = name.split("_Weights")[0]
+    return name
+
+
 def 
save_histos(results: list[ROOT.TH1D], output_fname: str):
     with ROOT.TFile.Open(output_fname, "recreate") as out_file:
+        names_list = []
         for result in results:
-            out_file.WriteObject(result, result.GetName().replace("_nominal", ""))
+            result.SetName(simplify_histo_name(result.GetName()))
+            out_file.WriteObject(result, result.GetName())
+            names_list.append(result.GetName())
+
+        # if all required inputs are present, build pseudodata in each region as
+        # wjets + 0.5 * (ttbar PS variation) + 0.5 * (ttbar ME variation)
+        if (
+            "4j1b_ttbar_ME_var" in names_list
+            and "4j1b_ttbar_PS_var" in names_list
+            and "4j1b_wjets" in names_list
+            and "4j2b_ttbar_ME_var" in names_list
+            and "4j2b_ttbar_PS_var" in names_list
+            and "4j2b_wjets" in names_list
+        ):
+            histogram_4j1b = out_file.Get("4j1b_wjets").Clone("4j1b_pseudodata")
+            histogram_4j1b.Add(out_file.Get("4j1b_ttbar_PS_var"), 0.5)
+            histogram_4j1b.Add(out_file.Get("4j1b_ttbar_ME_var"), 0.5)
+            out_file.WriteObject(histogram_4j1b, "4j1b_pseudodata")
+
+            histogram_4j2b = out_file.Get("4j2b_wjets").Clone("4j2b_pseudodata")
+            histogram_4j2b.Add(out_file.Get("4j2b_ttbar_PS_var"), 0.5)
+            histogram_4j2b.Add(out_file.Get("4j2b_ttbar_ME_var"), 0.5)
+            out_file.WriteObject(histogram_4j2b, "4j2b_pseudodata")
diff --git a/analyses/cms-open-data-ttbar/validate b/analyses/cms-open-data-ttbar/validate
new file mode 100755
index 0000000..9b3aafd
--- /dev/null
+++ b/analyses/cms-open-data-ttbar/validate
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+# Check if a number of files is provided as an argument, otherwise default to 1
+NUM_FILES=${1:-1}
+
+# Check that the input is 1, 5, or 10
+if [[ "$NUM_FILES" -eq 1 || "$NUM_FILES" -eq 5 || "$NUM_FILES" -eq 10 ]]; then
+    echo "Valid NUM_FILES: $NUM_FILES"
+else
+    echo "Error: Invalid input. Please use one of the supported values: 1, 5, or 10."
+    exit 1
+fi
+
+# Define the remote data prefix and other paths
+REMOTE_DATA_PREFIX='root://eospublic.cern.ch//eos/root-eos/AGC'
+HISTOS_FILE="histograms.root"
+REFERENCE_HISTOS="reference/histos_${NUM_FILES}_file_per_process.json"
+
+# Check the status of the previous command and exit if it failed
+check_status() {
+    if [ $? -ne 0 ]; then
+        echo "Test failed: $1"
+        exit 1
+    fi
+}
+
+# Step 1: Run the analysis
+echo "Running analysis with ${NUM_FILES} file(s)..."
+python analysis.py --n-max-files-per-sample ${NUM_FILES} --remote-data-prefix="${REMOTE_DATA_PREFIX}" --no-fitting
+check_status "Analysis failed"
+
+# Step 2: Run the validation for histograms
+echo "Running histograms validation"
+# Run the validation and check whether grep finds the success marker
+if python validate_histograms.py --histos "${HISTOS_FILE}" --reference "${REFERENCE_HISTOS}" | grep "All good!"; then
+    echo "Histograms validation passed: Output matches expected results."
+else
+    echo "Test failed: Histograms validation output does not match expected result."
+    exit 1
+fi
+
+# Step 3: Run the fitting part of the analysis and validate it
+echo "Running fitting part of analysis"
+python analysis.py --n-max-files-per-sample ${NUM_FILES} --remote-data-prefix="${REMOTE_DATA_PREFIX}" --statistical-validation
+if python ./reference/fitResults/validate_fit_result.py --n-files-per-sample ${NUM_FILES} | grep "ERROR: Comparison failed."; then
+    echo "Test failed: fitResults validation output does not match expected result."
+    exit 1
+else
+    echo "Test passed: fitResults validation output matches expected result."
+fi
+
+echo "All tests passed successfully."
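
For reference, the pseudodata built in `save_histos` above reduces to a simple per-bin combination. A minimal cross-check sketch (not part of the patch; it assumes `histograms.root` was already produced by the analysis and uses uproot only to read the bin contents back):

```python
import uproot

# read back the histograms written by save_histos (names as written above)
with uproot.open("histograms.root") as f:
    wjets = f["4j1b_wjets"].values(flow=True)
    ps_var = f["4j1b_ttbar_PS_var"].values(flow=True)
    me_var = f["4j1b_ttbar_ME_var"].values(flow=True)

# TH1::Add with weight 0.5 is per-bin addition, so the pseudodata is simply:
pseudodata = wjets + 0.5 * ps_var + 0.5 * me_var
print(pseudodata.sum())
```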
+ diff --git a/analyses/cms-open-data-ttbar/validate_histograms.py b/analyses/cms-open-data-ttbar/validate_histograms.py index efc5712..2a49959 100644 --- a/analyses/cms-open-data-ttbar/validate_histograms.py +++ b/analyses/cms-open-data-ttbar/validate_histograms.py @@ -1,4 +1,4 @@ -# This script is originally from the IRIS-HEP analysis-grand-challenge repository +# This script is originally from the IRIS-HEP analysis-grand-challenge repository # and has been adapted to serve the purpose of the RDF-based analysis. # The original script can be found here: # https://github.com/iris-hep/analysis-grand-challenge/blob/main/analyses/cms-open-data-ttbar/validate_histograms.py @@ -7,22 +7,41 @@ # A reference file for N_FILES_MAX_PER_SAMPLE=1 is available in directory `reference/`. from __future__ import annotations + import argparse -from collections import defaultdict import json -import numpy as np import sys +from collections import defaultdict + +import numpy as np import uproot + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser() - parser.add_argument("--histos", help="ROOT file containing the output histograms. Defaults to './histograms.root'.", default="histograms.root") + parser.add_argument( + "--histos", + help="ROOT file containing the output histograms. Defaults to './histograms.root'.", + default="histograms.root", + ) group = parser.add_mutually_exclusive_group(required=True) - group.add_argument("--reference", help="JSON reference against which histogram contents should be compared") - group.add_argument("--dump-json", help="Print JSON representation of histogram contents to screen", action='store_true') - parser.add_argument("--verbose", help="Print extra information about bin mismatches", action='store_true') + group.add_argument( + "--reference", + help="JSON reference against which histogram contents should be compared", + ) + group.add_argument( + "--dump-json", + help="Print JSON representation of histogram contents to screen", + action="store_true", + ) + parser.add_argument( + "--verbose", + help="Print extra information about bin mismatches", + action="store_true", + ) return parser.parse_args() + # convert uproot file containing only TH1Ds to a corresponding JSON-compatible dict with structure: # { "histo1": { "edges": [...], "contents": [...] }, "histo2": { ... }, ... } # Only the highest namecycle for every histogram is considered, and cycles are stripped from the histogram names. 
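
As an aside, the dict produced by `as_dict` has the following shape (numbers invented for illustration; `contents` comes from `counts(flow=True)`, so it carries two more entries than there are bins, for the under- and overflow):

```python
# illustrative as_dict() output for a single 2-bin histogram
{
    "4j1b_ttbar": {
        "edges": [110.0, 120.0, 130.0],
        "contents": [0.0, 42.5, 13.1, 0.0],  # underflow, 2 bins, overflow
    },
}
```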
@@ -37,6 +56,7 @@ def as_dict(f: uproot.ReadOnlyDirectory) -> dict[str, dict]: histos[name]["contents"] = h.counts(flow=True).tolist() return histos + def find_matching_histogram(name: str, histos: dict) -> str | None: if name in histos: return name @@ -45,7 +65,10 @@ def find_matching_histogram(name: str, histos: dict) -> str | None: return hname return None -def validate(histos: dict, reference: dict, verbose=False) -> dict[str, list[str]]: + +def validate( + histos: dict, reference: dict, verbose=False +) -> dict[str, list[str]]: errors = defaultdict(list) for name, ref_h in reference.items(): matching_name = find_matching_histogram(name, histos) @@ -55,10 +78,12 @@ def validate(histos: dict, reference: dict, verbose=False) -> dict[str, list[str continue h = histos[matching_name] - if not np.allclose(h['edges'], ref_h['edges']): - errors[name].append(f"Edges do not match:\n\tgot {h['edges']}\n\texpected {ref_h['edges']}") + if not np.allclose(h["edges"], ref_h["edges"]): + errors[name].append( + f"Edges do not match:\n\tgot {h['edges']}\n\texpected {ref_h['edges']}" + ) contents_depend_on_rng = "pt_res_up" in name - is_close = np.isclose(h['contents'], ref_h['contents']) + is_close = np.isclose(h["contents"], ref_h["contents"]) if not contents_depend_on_rng and not all(is_close): where_not_close = np.where(np.invert(is_close))[0] @@ -73,30 +98,39 @@ def validate(histos: dict, reference: dict, verbose=False) -> dict[str, list[str is_error = False for group in split_values: - h_group = np.array(h['contents'])[group] - ref_group = np.array(ref_h['contents'])[group] + h_group = np.array(h["contents"])[group] + ref_group = np.array(ref_h["contents"])[group] if not np.allclose(h_group, ref_group, atol=2.0): is_error = True if verbose: print(f"In {name}: Not close enough for bin migration") - print("histogram: ", h_group, ", reference: ", ref_group) + print( + "histogram: ", h_group, ", reference: ", ref_group + ) print() elif not np.allclose(sum(h_group), sum(ref_group)): is_error = True if verbose: print(f"In {name}: Partial sums are unequal") - print("histogram: ", h_group, ", reference: ", ref_group) + print( + "histogram: ", h_group, ", reference: ", ref_group + ) print() else: if verbose: print(f"In {name}: Bin migration likely") - print("histogram: ", h_group, ", reference: ", ref_group) + print( + "histogram: ", h_group, ", reference: ", ref_group + ) print() if is_error: - errors[name].append(f"Contents do not match:\n\tgot {h['contents']}\n\texpected {ref_h['contents']}") + errors[name].append( + f"Contents do not match:\n\tgot {h['contents']}\n\texpected {ref_h['contents']}" + ) return errors + if __name__ == "__main__": args = parse_args() with uproot.open(args.histos) as f: @@ -109,13 +143,15 @@ def validate(histos: dict, reference: dict, verbose=False) -> dict[str, list[str with open(args.reference) as reference: ref_histos = json.load(reference) - print(f"Validating '{args.histos}' against reference '{args.reference}'...") + print( + f"Validating '{args.histos}' against reference '{args.reference}'..." 
+    )
     errs = validate(histos=histos, reference=ref_histos, verbose=args.verbose)
-    
+
     if len(errs) == 0:
         print("All good!")
     else:
         for hist_name, errors in errs.items():
-            errors = '\n\t'.join(errors)
+            errors = "\n\t".join(errors)
             print(f"{hist_name}\n\t{errors}")
         sys.exit(1)
diff --git a/pyproject.toml b/pyproject.toml
index 45bedcc..13307b0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.ruff]
-# ignore long lines
-ignore = ["E501"]
-select = ["E", "F", "I", "N"]
+# ignore long lines and ROOT-style naming (N802/N803/N806)
+lint.ignore = ["E501", "N806", "N802", "N803"]
+lint.select = ["E", "F", "I", "N"]
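
One detail of `validate_histograms.py` worth spelling out is its bin-migration heuristic: for histograms whose contents do not depend on the RNG, a run of adjacent mismatched bins is still accepted when the bins agree within an absolute tolerance *and* their partial sums match, i.e. events merely migrated between neighbouring bins. A stripped-down sketch of that check, with invented bin contents and the tolerance value from the script:

```python
import numpy as np

got = np.array([100.0, 24.0, 16.0, 7.0])  # invented bin contents
ref = np.array([100.0, 25.0, 15.0, 7.0])  # reference: one event moved between bins 2 and 3

group = [1, 2]  # indices of the adjacent mismatched bins
migration_ok = np.allclose(got[group], ref[group], atol=2.0) and np.allclose(
    np.sum(got[group]), np.sum(ref[group])
)
print(migration_ok)  # True: per-bin shifts are small and partial sums agree
```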