Introduce statistical inference step in CMS ttbar analysis
Also add a GitHub Action that validates the results of the analysis and of the fitting step.
Valerii Kholoimov authored and vepadulano committed Dec 5, 2024
1 parent 7073818 commit a9ad8c5
Showing 20 changed files with 2,193 additions and 68 deletions.
61 changes: 61 additions & 0 deletions .github/workflows/fitting_check.yml
@@ -0,0 +1,61 @@
name: Validate CMS Open Data ttbar analysis

on:
push:
branches:
- main
pull_request:
branches:
- main

jobs:
run-cms-open-data-ttbar-analysis:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Set up ROOT environment
run: |
sudo apt-get update
sudo apt-get install -y dpkg-dev cmake g++ gcc binutils libx11-dev libncurses5-dev libssl-dev libxpm-dev \
libxft-dev libxml2-dev libz-dev libxext-dev python3-dev git libtbb-dev libgif-dev xrootd-client python3
pip install numpy plotting distributed tqdm uproot
wget https://root.cern/download/root_v6.32.04.Linux-ubuntu22.04-x86_64-gcc11.4.tar.gz
tar -xzvf root_v6.32.04.Linux-ubuntu22.04-x86_64-gcc11.4.tar.gz
source root/bin/thisroot.sh
echo "ROOT is set up"
- name: Run Analysis
run: |
source root/bin/thisroot.sh
cd analyses/cms-open-data-ttbar/
./validate | tee output.txt
- name: Compare histograms validation output with expected
id: histograms
run: |
cd analyses/cms-open-data-ttbar/
if grep -q "Test failed: Histograms validation output does not match expected result." output.txt; then
echo "Histograms validation failed."
echo "RESULT_HISTOGRAMS=fail" >> $GITHUB_ENV
exit 1
else
echo "Histograms validation passed."
echo "RESULT_HISTOGRAMS=pass" >> $GITHUB_ENV
fi
- name: Run validation sequences for fitResults
id: fitresults
run: |
cd analyses/cms-open-data-ttbar/
if grep -q "Test failed: fitResults validation output does not match expected result." output.txt; then
echo "fitResults validation failed."
echo "RESULT_FITRESULTS=fail" >> $GITHUB_ENV
exit 1
else
echo "fitResults validation passed."
echo "RESULT_FITRESULTS=pass" >> $GITHUB_ENV
fi
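
For reference, the two grep-based checks above can be mirrored locally with a short Python sketch (not part of the commit; it only relies on the marker strings that the workflow greps for in output.txt):

from pathlib import Path

# Read the output captured from ./validate (the same file the workflow tees into).
output = Path("analyses/cms-open-data-ttbar/output.txt").read_text()

markers = {
    "histograms": "Test failed: Histograms validation output does not match expected result.",
    "fitResults": "Test failed: fitResults validation output does not match expected result.",
}
for label, marker in markers.items():
    print(f"{label} validation:", "fail" if marker in output else "pass")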
@@ -0,0 +1,2 @@
Validating 'histograms.root' against reference 'reference/histos_1_file_per_process.json'...
All good!
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,3 +7,6 @@
data.json
__pycache__
tags
analyses/cms-open-data-ttbar/*.root
analyses/cms-open-data-ttbar/*.table
analyses/cms-open-data-ttbar/statistical_data/*
87 changes: 67 additions & 20 deletions analyses/cms-open-data-ttbar/analysis.py
@@ -4,17 +4,12 @@
from time import time
from typing import Tuple

from distributed import Client, get_worker, LocalCluster, SSHCluster
import ml
from plotting import save_ml_plots, save_plots
import ROOT
from utils import (
AGCInput,
AGCResult,
postprocess_results,
retrieve_inputs,
save_histos,
)
from distributed import Client, LocalCluster, SSHCluster, get_worker
from plotting import save_ml_plots, save_plots
from statistical import fit_histograms
from utils import AGCInput, AGCResult, postprocess_results, retrieve_inputs, save_histos

# Using https://atlas-groupdata.web.cern.ch/atlas-groupdata/dev/AnalysisTop/TopDataPreparation/XSection-MC15-13TeV.data
# as a reference. Values are in pb.
@@ -90,7 +85,24 @@ def parse_args() -> argparse.Namespace:
"--hosts",
help="A comma-separated list of worker node hostnames. Only required if --scheduler=dask-ssh, ignored otherwise.",
)
p.add_argument("-v", "--verbose", help="Turn on verbose execution logs.", action="store_true")
p.add_argument(
"-v",
"--verbose",
help="Turn on verbose execution logs.",
action="store_true",
)

p.add_argument(
"--statistical-validation",
help=argparse.SUPPRESS,
action="store_true",
)

p.add_argument(
"--no-fitting",
help="Do not run statistical validation part of the analysis.",
action="store_true",
)

return p.parse_args()

@@ -109,7 +121,11 @@ def create_dask_client(scheduler: str, ncores: int, hosts: str, scheduler_addres
sshc = SSHCluster(
workers,
connect_options={"known_hosts": None},
worker_options={"nprocs": ncores, "nthreads": 1, "memory_limit": "32GB"},
worker_options={
"nprocs": ncores,
"nthreads": 1,
"memory_limit": "32GB",
},
)
return Client(sshc)

@@ -128,7 +144,10 @@ def define_trijet_mass(df: ROOT.RDataFrame) -> ROOT.RDataFrame:
df = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) > 1")

# Build four-momentum vectors for each jet
df = df.Define("Jet_p4", "ConstructP4(Jet_pt_cut, Jet_eta_cut, Jet_phi_cut, Jet_mass_cut)")
df = df.Define(
"Jet_p4",
"ConstructP4(Jet_pt_cut, Jet_eta_cut, Jet_phi_cut, Jet_mass_cut)",
)

# Build trijet combinations
df = df.Define("Trijet_idx", "Combinations(Jet_pt_cut, 3)")
@@ -186,7 +205,7 @@ def book_histos(
# pt_res_up(jet_pt) - jet resolution systematic
df = df.Vary(
"Jet_pt",
"ROOT::RVec<ROOT::RVecF>{Jet_pt*pt_scale_up(), Jet_pt*jet_pt_resolution(Jet_pt.size())}",
"ROOT::RVec<ROOT::RVecF>{Jet_pt*pt_scale_up(), Jet_pt*jet_pt_resolution(Jet_pt)}",
["pt_scale_up", "pt_res_up"],
)

@@ -240,8 +259,7 @@ def book_histos(
# Exactly one b-tagged jet is required in this region
# The observable is the total transverse momentum
# fmt: off
df4j1b = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) == 1")\
.Define("HT", "Sum(Jet_pt_cut)")
df4j1b = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) == 1").Define("HT", "Sum(Jet_pt_cut)")
# fmt: on

# Define trijet_mass observable for the 4j2b region (this one is more complicated)
@@ -251,20 +269,34 @@
results = []
for df, observable, region in zip([df4j1b, df4j2b], ["HT", "Trijet_mass"], ["4j1b", "4j2b"]):
histo_model = ROOT.RDF.TH1DModel(
name=f"{region}_{process}_{variation}", title=process, nbinsx=25, xlow=50, xup=550
name=f"{region}_{process}_{variation}",
title=process,
nbinsx=25,
xlow=50,
xup=550,
)
nominal_histo = df.Histo1D(histo_model, observable, "Weights")

if variation == "nominal":
results.append(
AGCResult(
nominal_histo, region, process, variation, nominal_histo, should_vary=True
nominal_histo,
region,
process,
variation,
nominal_histo,
should_vary=True,
)
)
else:
results.append(
AGCResult(
nominal_histo, region, process, variation, nominal_histo, should_vary=False
nominal_histo,
region,
process,
variation,
nominal_histo,
should_vary=False,
)
)
print(f"Booked histogram {histo_model.fName}")
@@ -292,7 +324,12 @@ def book_histos(
if variation == "nominal":
ml_results.append(
AGCResult(
nominal_histo, feature.name, process, variation, nominal_histo, should_vary=True
nominal_histo,
feature.name,
process,
variation,
nominal_histo,
should_vary=True,
)
)
else:
@@ -382,7 +419,10 @@ def ml_init():
with create_dask_client(args.scheduler, args.ncores, args.hosts, scheduler_address) as client:
for input in inputs:
df = ROOT.RDF.Experimental.Distributed.Dask.RDataFrame(
"Events", input.paths, daskclient=client, npartitions=args.npartitions
"Events",
input.paths,
daskclient=client,
npartitions=args.npartitions,
)
df._headnode.backend.distribute_unique_paths(
[
@@ -426,6 +466,10 @@ def main() -> None:
# To only change the verbosity in a given scope, use ROOT.Experimental.RLogScopedVerbosity.
ROOT.Detail.RDF.RDFLogChannel().SetVerbosity(ROOT.Experimental.ELogLevel.kInfo)
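# Illustrative alternative (not part of this commit): to change the verbosity only within
# a given scope, keep an RLogScopedVerbosity object alive for that scope, e.g.
#   verbosity = ROOT.Experimental.RLogScopedVerbosity(
#       ROOT.Detail.RDF.RDFLogChannel(), ROOT.Experimental.ELogLevel.kInfo
#   )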

if args.statistical_validation:
fit_histograms(filename=args.output)
return

inputs: list[AGCInput] = retrieve_inputs(
args.n_max_files_per_sample, args.remote_data_prefix, args.data_cache
)
@@ -457,6 +501,9 @@
save_histos([r.histo for r in ml_results], output_fname=output_fname)
print(f"Result histograms from ML inference step saved in file {output_fname}")

if not args.no_fitting:
fit_histograms(filename=args.output)


if __name__ == "__main__":
main()
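
For context, a minimal sketch of how the new options combine (illustrative, not part of the commit; it assumes the pre-existing --n-max-files-per-sample option referenced above):

import subprocess

# Default behaviour: run the analysis, save the histograms, then fit them.
subprocess.run(["python", "analysis.py", "--n-max-files-per-sample", "1"], check=True)

# Produce histograms only, skipping the statistical inference step.
subprocess.run(["python", "analysis.py", "--n-max-files-per-sample", "1", "--no-fitting"], check=True)

# Fit an already existing histograms file without re-running the analysis.
subprocess.run(["python", "analysis.py", "--statistical-validation"], check=True)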
23 changes: 10 additions & 13 deletions analyses/cms-open-data-ttbar/helpers.h
@@ -10,21 +10,18 @@
#include "TRandom3.h"
#include <Math/Vector4D.h>

// functions creating systematic variations
inline double random_gaus()
{
thread_local std::random_device rd{};
thread_local std::mt19937 gen{rd()};
thread_local std::normal_distribution<double> d{1, 0.05};
return d(gen);
}

inline ROOT::RVecF jet_pt_resolution(std::size_t size)
inline ROOT::RVecF jet_pt_resolution(const ROOT::RVecF &jet_pt)
{
// normal distribution with 5% variations, shape matches jets
ROOT::RVecF res(size);
std::generate(std::begin(res), std::end(res), []()
{ return random_gaus(); });
ROOT::RVecF res(jet_pt.size());
// We need some pseudo-randomness that is thread-safe and does not depend on an RNG:
// we reuse the decimal digits of each jet pt (in GeV) as quasi-random input.
// The Gaussian quantile then turns that input into a resolution factor with the given mean and sigma.
double mean = 1.;
double sigma = 0.05;
for (std::size_t i = 0; i < jet_pt.size(); ++i) {
res[i] = mean + ROOT::Math::gaussian_quantile(static_cast<double>(0.001 * (static_cast<int>(jet_pt[i] * 1000) % 1000)) + 0.0005, sigma);
}

return res;
}

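The key change in helpers.h is that jet_pt_resolution no longer draws from an RNG: the smearing factors are derived deterministically from the jet pt values themselves, so the results are reproducible and thread-safe. A minimal Python sketch of the same mapping (illustrative only; it uses the standard-library NormalDist in place of ROOT::Math::gaussian_quantile):

from statistics import NormalDist

def jet_pt_resolution(jet_pt, mean=1.0, sigma=0.05):
    """Deterministic ~5% resolution factors derived from the jet pt decimal digits."""
    factors = []
    for pt in jet_pt:
        # First three decimal digits of pt (in GeV), mapped into the open interval (0, 1).
        p = 0.001 * (int(pt * 1000) % 1000) + 0.0005
        # Quantile of a Normal(mean, sigma) distribution at probability p.
        factors.append(NormalDist(mean, sigma).inv_cdf(p))
    return factors

print(jet_pt_resolution([45.123, 37.802, 120.556]))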
1 change: 0 additions & 1 deletion analyses/cms-open-data-ttbar/ml.py
@@ -3,7 +3,6 @@
from typing import Tuple

import ROOT

from distributed import get_worker

# histogram bin lower limit to use for each ML input feature
2 changes: 2 additions & 0 deletions analyses/cms-open-data-ttbar/output_text.txt
@@ -0,0 +1,2 @@
Validating 'histograms.root' against reference 'reference/histos_1_file_per_process.json'...
All good!
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,88 @@
import argparse

import ROOT

# Create an argument parser
parser = argparse.ArgumentParser(description="Run the fitting part of the analysis.")

# Number of files per sample used for the analysis; selects the matching reference fit result
parser.add_argument('--n-files-per-sample', type=int, required=True, help="Maximum number of files per sample.")


def get_fit_result(file_path, fit_result_name):
"""Open the ROOT file and retrieve the RooFitResult object."""
file = ROOT.TFile(file_path)
fit_result = file.Get(fit_result_name)
if not fit_result:
raise ValueError(
f"Fit result '{fit_result_name}' not found in {file_path}"
)
return fit_result


def compare_fit_results(result1, result2):
"""Compare the parameter values of two RooFitResults."""
params1 = result1.floatParsFinal()
params2 = result2.floatParsFinal()

# Check for the same number of parameters
if params1.getSize() != params2.getSize():
print(
f"Number of parameters differ: {params1.getSize()} != {params2.getSize()}"
)
return

print("Comparing parameters...")

ERROR = False

# Loop over parameters in the first result and compare with the second
for i in range(params1.getSize()):
par1 = params1[i]
par2 = params2.find(
par1.GetName()
) # Find corresponding parameter by name in result2

if not par2:
print(
f"Parameter '{par1.GetName()}' not found in the second fit result."
)
ERROR = True
continue

# Compare values and print differences
if abs(par1.getVal() - par2.getVal()) < 1e-6:
print(f"Parameter '{par1.GetName()}' matches: {par1.getVal()}")
else:
print(
f"Parameter '{par1.GetName()}' differs: {par1.getVal()} != {par2.getVal()}"
)
ERROR = True

# Optionally compare errors too
if abs(par1.getError() - par2.getError()) > 1e-6:
print(
f"Parameter '{par1.GetName()}' error differs: {par1.getError()} != {par2.getError()}"
)
ERROR = True

if ERROR:
print("ERROR: Comparison failed.")


args = parser.parse_args()

number_of_files = args.n_files_per_sample

# Replace these with the paths to your .root files and fit result names
file1 = "./fitResults.root"
file2 = f"./reference/fitResults/fitResults_{number_of_files}_file.root"
fit_result_name_1 = "fitResult" # Fit result in first file
fit_result_name_2 = "fitResult" # Fit result in second file

# Load the fit results from the two files
fit_result_1 = get_fit_result(file1, fit_result_name_1)
fit_result_2 = get_fit_result(file2, fit_result_name_2)

# Compare the fit results
compare_fit_results(fit_result_1, fit_result_2)
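
A possible local invocation of the comparison script above (its file name is not shown in this diff, so compare_fit_results.py below is a placeholder):

import subprocess

# Compare the locally produced fitResults.root against the reference obtained with one input file per sample.
subprocess.run(["python", "compare_fit_results.py", "--n-files-per-sample", "1"], check=True)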
