Introduce statistical inference step in CMS ttbar analysis
Also add a GitHub Action that validates the results of the analysis and of the fitting step.
Valerii Kholoimov authored and vepadulano committed Dec 5, 2024
1 parent 7073818 commit a9ad8c5
Showing 20 changed files with 2,193 additions and 68 deletions.
61 changes: 61 additions & 0 deletions .github/workflows/fitting_check.yml
@@ -0,0 +1,61 @@
name: Validate CMS Open Data ttbar analysis

on:
push:
branches:
- main
pull_request:
branches:
- main

jobs:
run-cms-open-data-ttbar-analysis:
runs-on: ubuntu-latest

steps:
- name: Checkout repository
uses: actions/checkout@v3

- name: Set up ROOT environment
run: |
sudo apt-get update
sudo apt-get install -y dpkg-dev cmake g++ gcc binutils libx11-dev libncurses5-dev libssl-dev libxpm-dev \
libxft-dev libxml2-dev libz-dev libxext-dev python3-dev git libtbb-dev libgif-dev xrootd-client python3
pip install numpy plotting distributed tqdm uproot
wget https://root.cern/download/root_v6.32.04.Linux-ubuntu22.04-x86_64-gcc11.4.tar.gz
tar -xzvf root_v6.32.04.Linux-ubuntu22.04-x86_64-gcc11.4.tar.gz
source root/bin/thisroot.sh
echo "ROOT is set up"
- name: Run Analysis
run: |
source root/bin/thisroot.sh
cd analyses/cms-open-data-ttbar/
./validate | tee output.txt
- name: Compare histograms validation output with expected
id: histograms
run: |
cd analyses/cms-open-data-ttbar/
if grep -q "Test failed: Histograms validation output does not match expected result." output.txt; then
echo "Histograms validation failed."
echo "RESULT_HISTOGRAMS=fail" >> $GITHUB_ENV
exit 1
else
echo "Histograms validation passed."
echo "RESULT_HISTOGRAMS=pass" >> $GITHUB_ENV
fi
- name: Run validation sequences for fitResults
id: fitresults
run: |
cd analyses/cms-open-data-ttbar/
if grep -q "Test failed: fitResults validation output does not match expected result." output.txt; then
echo "fitResults validation failed."
echo "RESULT_FITRESULTS=fail" >> $GITHUB_ENV
exit 1
else
echo "fitResults validation passed."
echo "RESULT_FITRESULTS=pass" >> $GITHUB_ENV
fi
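
For reference, the two grep-based checks above can be mirrored locally with a short Python sketch (not part of the commit; it only relies on the marker strings that the workflow greps for in output.txt):

from pathlib import Path

# Read the output captured from ./validate (the same file the workflow tees into).
output = Path("analyses/cms-open-data-ttbar/output.txt").read_text()

markers = {
    "histograms": "Test failed: Histograms validation output does not match expected result.",
    "fitResults": "Test failed: fitResults validation output does not match expected result.",
}
for label, marker in markers.items():
    print(f"{label} validation:", "fail" if marker in output else "pass")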
@@ -0,0 +1,2 @@
Validating 'histograms.root' against reference 'reference/histos_1_file_per_process.json'...
All good!
3 changes: 3 additions & 0 deletions .gitignore
@@ -7,3 +7,6 @@
data.json
__pycache__
tags
analyses/cms-open-data-ttbar/*.root
analyses/cms-open-data-ttbar/*.table
analyses/cms-open-data-ttbar/statistical_data/*
87 changes: 67 additions & 20 deletions analyses/cms-open-data-ttbar/analysis.py
@@ -4,17 +4,12 @@
from time import time
from typing import Tuple

from distributed import Client, get_worker, LocalCluster, SSHCluster
import ml
from plotting import save_ml_plots, save_plots
import ROOT
from utils import (
AGCInput,
AGCResult,
postprocess_results,
retrieve_inputs,
save_histos,
)
from distributed import Client, LocalCluster, SSHCluster, get_worker
from plotting import save_ml_plots, save_plots
from statistical import fit_histograms
from utils import AGCInput, AGCResult, postprocess_results, retrieve_inputs, save_histos

# Using https://atlas-groupdata.web.cern.ch/atlas-groupdata/dev/AnalysisTop/TopDataPreparation/XSection-MC15-13TeV.data
# as a reference. Values are in pb.
@@ -90,7 +85,24 @@ def parse_args() -> argparse.Namespace:
"--hosts",
help="A comma-separated list of worker node hostnames. Only required if --scheduler=dask-ssh, ignored otherwise.",
)
p.add_argument("-v", "--verbose", help="Turn on verbose execution logs.", action="store_true")
p.add_argument(
"-v",
"--verbose",
help="Turn on verbose execution logs.",
action="store_true",
)

p.add_argument(
"--statistical-validation",
help=argparse.SUPPRESS,
action="store_true",
)

p.add_argument(
"--no-fitting",
help="Do not run statistical validation part of the analysis.",
action="store_true",
)

return p.parse_args()

@@ -109,7 +121,11 @@ def create_dask_client(scheduler: str, ncores: int, hosts: str, scheduler_addres
sshc = SSHCluster(
workers,
connect_options={"known_hosts": None},
worker_options={"nprocs": ncores, "nthreads": 1, "memory_limit": "32GB"},
worker_options={
"nprocs": ncores,
"nthreads": 1,
"memory_limit": "32GB",
},
)
return Client(sshc)

@@ -128,7 +144,10 @@ def define_trijet_mass(df: ROOT.RDataFrame) -> ROOT.RDataFrame:
df = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) > 1")

# Build four-momentum vectors for each jet
df = df.Define("Jet_p4", "ConstructP4(Jet_pt_cut, Jet_eta_cut, Jet_phi_cut, Jet_mass_cut)")
df = df.Define(
"Jet_p4",
"ConstructP4(Jet_pt_cut, Jet_eta_cut, Jet_phi_cut, Jet_mass_cut)",
)

# Build trijet combinations
df = df.Define("Trijet_idx", "Combinations(Jet_pt_cut, 3)")
@@ -186,7 +205,7 @@ def book_histos(
# pt_res_up(jet_pt) - jet resolution systematic
df = df.Vary(
"Jet_pt",
"ROOT::RVec<ROOT::RVecF>{Jet_pt*pt_scale_up(), Jet_pt*jet_pt_resolution(Jet_pt.size())}",
"ROOT::RVec<ROOT::RVecF>{Jet_pt*pt_scale_up(), Jet_pt*jet_pt_resolution(Jet_pt)}",
["pt_scale_up", "pt_res_up"],
)

@@ -240,8 +259,7 @@ def book_histos(
# Exactly one b-tagged jet is required in this region
# The observable is the total transverse momentum
# fmt: off
df4j1b = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) == 1")\
.Define("HT", "Sum(Jet_pt_cut)")
df4j1b = df.Filter("Sum(Jet_btagCSVV2_cut > 0.5) == 1").Define("HT", "Sum(Jet_pt_cut)")
# fmt: on

# Define trijet_mass observable for the 4j2b region (this one is more complicated)
@@ -251,20 +269,34 @@
results = []
for df, observable, region in zip([df4j1b, df4j2b], ["HT", "Trijet_mass"], ["4j1b", "4j2b"]):
histo_model = ROOT.RDF.TH1DModel(
name=f"{region}_{process}_{variation}", title=process, nbinsx=25, xlow=50, xup=550
name=f"{region}_{process}_{variation}",
title=process,
nbinsx=25,
xlow=50,
xup=550,
)
nominal_histo = df.Histo1D(histo_model, observable, "Weights")

if variation == "nominal":
results.append(
AGCResult(
nominal_histo, region, process, variation, nominal_histo, should_vary=True
nominal_histo,
region,
process,
variation,
nominal_histo,
should_vary=True,
)
)
else:
results.append(
AGCResult(
nominal_histo, region, process, variation, nominal_histo, should_vary=False
nominal_histo,
region,
process,
variation,
nominal_histo,
should_vary=False,
)
)
print(f"Booked histogram {histo_model.fName}")
@@ -292,7 +324,12 @@ def book_histos(
if variation == "nominal":
ml_results.append(
AGCResult(
nominal_histo, feature.name, process, variation, nominal_histo, should_vary=True
nominal_histo,
feature.name,
process,
variation,
nominal_histo,
should_vary=True,
)
)
else:
@@ -382,7 +419,10 @@ def ml_init():
with create_dask_client(args.scheduler, args.ncores, args.hosts, scheduler_address) as client:
for input in inputs:
df = ROOT.RDF.Experimental.Distributed.Dask.RDataFrame(
"Events", input.paths, daskclient=client, npartitions=args.npartitions
"Events",
input.paths,
daskclient=client,
npartitions=args.npartitions,
)
df._headnode.backend.distribute_unique_paths(
[
@@ -426,6 +466,10 @@ def main() -> None:
# To only change the verbosity in a given scope, use ROOT.Experimental.RLogScopedVerbosity.
ROOT.Detail.RDF.RDFLogChannel().SetVerbosity(ROOT.Experimental.ELogLevel.kInfo)
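# Illustrative alternative (not part of this commit): to change the verbosity only within
# a given scope, keep an RLogScopedVerbosity object alive for that scope, e.g.
#   verbosity = ROOT.Experimental.RLogScopedVerbosity(
#       ROOT.Detail.RDF.RDFLogChannel(), ROOT.Experimental.ELogLevel.kInfo
#   )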

if args.statistical_validation:
fit_histograms(filename=args.output)
return

inputs: list[AGCInput] = retrieve_inputs(
args.n_max_files_per_sample, args.remote_data_prefix, args.data_cache
)
@@ -457,6 +501,9 @@
save_histos([r.histo for r in ml_results], output_fname=output_fname)
print(f"Result histograms from ML inference step saved in file {output_fname}")

if not args.no_fitting:
fit_histograms(filename=args.output)


if __name__ == "__main__":
main()
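
For context, a minimal sketch of how the new options combine (illustrative, not part of the commit; it assumes the pre-existing --n-max-files-per-sample option referenced above):

import subprocess

# Default behaviour: run the analysis, save the histograms, then fit them.
subprocess.run(["python", "analysis.py", "--n-max-files-per-sample", "1"], check=True)

# Produce histograms only, skipping the statistical inference step.
subprocess.run(["python", "analysis.py", "--n-max-files-per-sample", "1", "--no-fitting"], check=True)

# Fit an already existing histograms file without re-running the analysis.
subprocess.run(["python", "analysis.py", "--statistical-validation"], check=True)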
23 changes: 10 additions & 13 deletions analyses/cms-open-data-ttbar/helpers.h
@@ -10,21 +10,18 @@
#include "TRandom3.h"
#include <Math/Vector4D.h>

// functions creating systematic variations
inline double random_gaus()
{
thread_local std::random_device rd{};
thread_local std::mt19937 gen{rd()};
thread_local std::normal_distribution<double> d{1, 0.05};
return d(gen);
}

inline ROOT::RVecF jet_pt_resolution(std::size_t size)
inline ROOT::RVecF jet_pt_resolution(const ROOT::RVecF &jet_pt)
{
// normal distribution with 5% variations, shape matches jets
ROOT::RVecF res(size);
std::generate(std::begin(res), std::end(res), []()
{ return random_gaus(); });
ROOT::RVecF res(jet_pt.size());
// We need some pseudo-randomness that is thread-safe and does not depend on an RNG:
// we reuse the decimal digits of each jet pt (in GeV) as quasi-random input.
// The Gaussian quantile then turns that input into a resolution factor with the given mean and sigma.
double mean = 1.;
double sigma = 0.05;
for (std::size_t i = 0; i < jet_pt.size(); ++i) {
res[i] = mean + ROOT::Math::gaussian_quantile(static_cast<double>(0.001 * (static_cast<int>(jet_pt[i] * 1000) % 1000)) + 0.0005, sigma);
}

return res;
}

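The key change in helpers.h is that jet_pt_resolution no longer draws from an RNG: the smearing factors are derived deterministically from the jet pt values themselves, so the results are reproducible and thread-safe. A minimal Python sketch of the same mapping (illustrative only; it uses the standard-library NormalDist in place of ROOT::Math::gaussian_quantile):

from statistics import NormalDist

def jet_pt_resolution(jet_pt, mean=1.0, sigma=0.05):
    """Deterministic ~5% resolution factors derived from the jet pt decimal digits."""
    factors = []
    for pt in jet_pt:
        # First three decimal digits of pt (in GeV), mapped into the open interval (0, 1).
        p = 0.001 * (int(pt * 1000) % 1000) + 0.0005
        # Quantile of a Normal(mean, sigma) distribution at probability p.
        factors.append(NormalDist(mean, sigma).inv_cdf(p))
    return factors

print(jet_pt_resolution([45.123, 37.802, 120.556]))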
1 change: 0 additions & 1 deletion analyses/cms-open-data-ttbar/ml.py
@@ -3,7 +3,6 @@
from typing import Tuple

import ROOT

from distributed import get_worker

# histogram bin lower limit to use for each ML input feature
2 changes: 2 additions & 0 deletions analyses/cms-open-data-ttbar/output_text.txt
@@ -0,0 +1,2 @@
Validating 'histograms.root' against reference 'reference/histos_1_file_per_process.json'...
All good!
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,88 @@
import argparse

import ROOT

# Create an argument parser
parser = argparse.ArgumentParser(description="Run the fitting part of the analysis.")

# Number of files per sample used for the analysis; selects the matching reference fit result
parser.add_argument('--n-files-per-sample', type=int, required=True, help="Maximum number of files per sample.")


def get_fit_result(file_path, fit_result_name):
"""Open the ROOT file and retrieve the RooFitResult object."""
file = ROOT.TFile(file_path)
fit_result = file.Get(fit_result_name)
if not fit_result:
raise ValueError(
f"Fit result '{fit_result_name}' not found in {file_path}"
)
return fit_result


def compare_fit_results(result1, result2):
"""Compare the parameter values of two RooFitResults."""
params1 = result1.floatParsFinal()
params2 = result2.floatParsFinal()

# Check for the same number of parameters
if params1.getSize() != params2.getSize():
print(
f"Number of parameters differ: {params1.getSize()} != {params2.getSize()}"
)
return

print("Comparing parameters...")

ERROR = False

# Loop over parameters in the first result and compare with the second
for i in range(params1.getSize()):
par1 = params1[i]
par2 = params2.find(
par1.GetName()
) # Find corresponding parameter by name in result2

if not par2:
print(
f"Parameter '{par1.GetName()}' not found in the second fit result."
)
ERROR = True
continue

# Compare values and print differences
if abs(par1.getVal() - par2.getVal()) < 1e-6:
print(f"Parameter '{par1.GetName()}' matches: {par1.getVal()}")
else:
print(
f"Parameter '{par1.GetName()}' differs: {par1.getVal()} != {par2.getVal()}"
)
ERROR = True

# Optionally compare errors too
if abs(par1.getError() - par2.getError()) > 1e-6:
print(
f"Parameter '{par1.GetName()}' error differs: {par1.getError()} != {par2.getError()}"
)
ERROR = True

if ERROR:
print("ERROR: Comparison failed.")


args = parser.parse_args()

number_of_files = args.n_files_per_sample

# Replace these with the paths to your .root files and fit result names
file1 = "./fitResults.root"
file2 = f"./reference/fitResults/fitResults_{number_of_files}_file.root"
fit_result_name_1 = "fitResult" # Fit result in first file
fit_result_name_2 = "fitResult" # Fit result in second file

# Load the fit results from the two files
fit_result_1 = get_fit_result(file1, fit_result_name_1)
fit_result_2 = get_fit_result(file2, fit_result_name_2)

# Compare the fit results
compare_fit_results(fit_result_1, fit_result_2)
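
A possible local invocation of the comparison script above (its file name is not shown in this diff, so compare_fit_results.py below is a placeholder):

import subprocess

# Compare the locally produced fitResults.root against the reference obtained with one input file per sample.
subprocess.run(["python", "compare_fit_results.py", "--n-files-per-sample", "1"], check=True)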
