Added validation script

Added short comment about script origin and adaptation
root-project · Jul 9, 2024 · 288a59c · 288a59c
1 parent f1d4df2
commit 288a59c
Showing 1 changed file with 119 additions and 0 deletions.
diff --git a/analyses/cms-open-data-ttbar/validate_histograms.py b/analyses/cms-open-data-ttbar/validate_histograms.py
@@ -0,0 +1,119 @@
+# This script is original of the IRIS-HEP analysis-grand-challenge repository 
+# and has been adapted to serve the purpose of the RDF-based analysis.
+
+# Compare the content of histograms produced by ttbar_analysis_pipeline with a reference file.
+# A reference file for N_FILES_MAX_PER_SAMPLE=1 is available in directory `reference/`.
+
+from __future__ import annotations
+import argparse
+from collections import defaultdict
+import json
+import numpy as np
+import sys
+import uproot
+
+def parse_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--histos", help="ROOT file containing the output histograms. Defaults to './histograms.root'.", default="histograms.root")
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("--reference", help="JSON reference against which histogram contents should be compared")
+    group.add_argument("--dump-json", help="Print JSON representation of histogram contents to screen", action='store_true')
+    parser.add_argument("--verbose", help="Print extra information about bin mismatches", action='store_true')
+    return parser.parse_args()
+
+# convert uproot file containing only TH1Ds to a corresponding JSON-compatible dict with structure:
+# { "histo1": { "edges": [...], "contents": [...] }, "histo2": { ... }, ... }
+# Only the highest namecycle for every histogram is considered, and cycles are stripped from the histogram names.
+def as_dict(f: uproot.ReadOnlyDirectory) -> dict[str, dict]:
+    histos = defaultdict(dict)
+    # this assumes that the rightmost ";" (if any) comes before a namecycle
+    names = set(k.rsplit(";", 1)[0] for k in f)
+    for name in names:
+        h = f[name]
+        assert isinstance(h, uproot.behaviors.TH1.Histogram)
+        histos[name]["edges"] = h.axis().edges().tolist()
+        histos[name]["contents"] = h.counts(flow=True).tolist()
+    return histos
+
+def find_matching_histogram(name: str, histos: dict) -> str | None:
+    if name in histos:
+        return name
+    for hname in histos.keys():
+        if hname.startswith(name):
+            return hname
+    return None
+
+def validate(histos: dict, reference: dict, verbose=False) -> dict[str, list[str]]:
+    errors = defaultdict(list)
+    for name, ref_h in reference.items():
+        matching_name = find_matching_histogram(name, histos)
+        if matching_name is None:
+            if name != "4j1b_pseudodata" and name != "4j2b_pseudodata":
+                errors[name].append("Histogram not found.")
+            continue
+
+        h = histos[matching_name]
+        if not np.allclose(h['edges'], ref_h['edges']):
+            errors[name].append(f"Edges do not match:\n\tgot      {h['edges']}\n\texpected {ref_h['edges']}")
+        contents_depend_on_rng = "pt_res_up" in name
+        is_close = np.isclose(h['contents'], ref_h['contents'])
+
+        if not contents_depend_on_rng and not all(is_close):
+            where_not_close = np.where(np.invert(is_close))[0]
+            diff_values = np.diff(where_not_close)
+            split_indices = np.argwhere(np.abs(diff_values) > 1).reshape(-1)
+
+            if len(split_indices) == 0:
+                split_values = [where_not_close]
+            else:
+                split_indices += 1
+                split_values = np.split(where_not_close, split_indices)
+
+            is_error = False
+            for group in split_values:
+                h_group = np.array(h['contents'])[group]
+                ref_group = np.array(ref_h['contents'])[group]
+                if not np.allclose(h_group, ref_group, atol=2.0):
+                    is_error = True
+                    if verbose:
+                        print(f"In {name}: Not close enough for bin migration")
+                        print("histogram: ", h_group, ", reference: ", ref_group)
+                        print()
+                elif not np.allclose(sum(h_group), sum(ref_group)):
+                    is_error = True
+                    if verbose:
+                        print(f"In {name}: Partial sums are unequal")
+                        print("histogram: ", h_group, ", reference: ", ref_group)
+                        print()
+                else:
+                    if verbose:
+                        print(f"In {name}: Bin migration likely")
+                        print("histogram: ", h_group, ", reference: ", ref_group)
+                        print()
+            if is_error:
+                errors[name].append(f"Contents do not match:\n\tgot      {h['contents']}\n\texpected {ref_h['contents']}")
+
+    return errors
+
+if __name__ == "__main__":
+    args = parse_args()
+    with uproot.open(args.histos) as f:
+        histos = as_dict(f)
+
+    if args.dump_json:
+        print(json.dumps(histos, indent=2, sort_keys=True))
+        sys.exit(0)
+
+    with open(args.reference) as reference:
+        ref_histos = json.load(reference)
+
+    print(f"Validating '{args.histos}' against reference '{args.reference}'...")
+    errs = validate(histos=histos, reference=ref_histos, verbose=args.verbose)
+
+    if len(errs) == 0:
+        print("All good!")
+    else:
+        for hist_name, errors in errs.items():
+            errors = '\n\t'.join(errors)
+            print(f"{hist_name}\n\t{errors}")
+        sys.exit(1)