Merge pull request #377 from davidwalter2/240110_AppendHistmakerSupport

Add support for appending histmaker analysis output to existing file

davidwalter2 authored Jan 11, 2024
2 parents 20dbb76 + a219a1c commit 2b2950d
Showing 14 changed files with 96 additions and 23 deletions.
2 changes: 1 addition & 1 deletion scripts/histmakers/mw_lowPU.py
@@ -475,4 +475,4 @@ def build_graph_cutFlow(df, dataset):
 scale_to_data(resultdict)
 aggregate_groups(datasets, resultdict, groups_to_aggregate)

-output_tools.write_analysis_output(resultdict, f"mw_lowPU_{flavor}.hdf5", args, update_name=not args.forceDefaultName)
+output_tools.write_analysis_output(resultdict, f"mw_lowPU_{flavor}.hdf5", args)
2 changes: 1 addition & 1 deletion scripts/histmakers/mw_with_mu_eta_pt.py
@@ -539,4 +539,4 @@ def build_graph(df, dataset):
 scale_to_data(resultdict)
 aggregate_groups(datasets, resultdict, groups_to_aggregate)

-output_tools.write_analysis_output(resultdict, f"{os.path.basename(__file__).replace('py', 'hdf5')}", args, update_name=not args.forceDefaultName)
+output_tools.write_analysis_output(resultdict, f"{os.path.basename(__file__).replace('py', 'hdf5')}", args)
2 changes: 1 addition & 1 deletion scripts/histmakers/mz_dilepton.py
@@ -406,4 +406,4 @@ def build_graph(df, dataset):
 scale_to_data(resultdict)
 aggregate_groups(datasets, resultdict, args.aggregateGroups)

-output_tools.write_analysis_output(resultdict, f"{os.path.basename(__file__).replace('py', 'hdf5')}", args, update_name=not args.forceDefaultName)
+output_tools.write_analysis_output(resultdict, f"{os.path.basename(__file__).replace('py', 'hdf5')}", args)
2 changes: 1 addition & 1 deletion scripts/histmakers/mz_lowPU.py
@@ -352,4 +352,4 @@ def build_graph(df, dataset):
 scale_to_data(resultdict)
 aggregate_groups(datasets, resultdict, args.aggregateGroups)

-output_tools.write_analysis_output(resultdict, f"mz_lowPU_{flavor}.hdf5", args, update_name=not args.forceDefaultName)
+output_tools.write_analysis_output(resultdict, f"mz_lowPU_{flavor}.hdf5", args)
2 changes: 1 addition & 1 deletion scripts/histmakers/mz_wlike_with_mu_eta_pt.py
@@ -332,4 +332,4 @@ def build_graph(df, dataset):
 scale_to_data(resultdict)
 aggregate_groups(datasets, resultdict, args.aggregateGroups)

-output_tools.write_analysis_output(resultdict, f"{os.path.basename(__file__).replace('py', 'hdf5')}", args, update_name=not args.forceDefaultName)
+output_tools.write_analysis_output(resultdict, f"{os.path.basename(__file__).replace('py', 'hdf5')}", args)
4 changes: 2 additions & 2 deletions scripts/histmakers/w_z_gen_dists.py
@@ -264,7 +264,7 @@ def build_graph(df, dataset):
     return results, weightsum

 resultdict = narf.build_and_run(datasets, build_graph)
-output_tools.write_analysis_output(resultdict, f"{os.path.basename(__file__).replace('py', 'hdf5')}", args, update_name=not args.forceDefaultName)
+output_tools.write_analysis_output(resultdict, f"{os.path.basename(__file__).replace('py', 'hdf5')}", args)

 logger.info("computing angular coefficients")
 z_moments = None
@@ -308,4 +308,4 @@ def build_graph(df, dataset):
 if args.useTheoryAgnosticBinning:
     outfname += "_theoryAgnosticBinning"
 outfname += ".hdf5"
-output_tools.write_analysis_output(moments_out, outfname, args, update_name=not args.forceDefaultName)
+output_tools.write_analysis_output(moments_out, outfname, args)
2 changes: 1 addition & 1 deletion scripts/histmakers/w_z_muonresponse.py
@@ -180,5 +180,5 @@ def build_graph(df, dataset):

 resultdict = narf.build_and_run(datasets, build_graph)

-output_tools.write_analysis_output(resultdict, f"{os.path.basename(__file__).replace('py', 'hdf5')}", args, update_name=not args.forceDefaultName)
+output_tools.write_analysis_output(resultdict, f"{os.path.basename(__file__).replace('py', 'hdf5')}", args)

45 changes: 45 additions & 0 deletions scripts/utilities/run_histmakers.sh
@@ -0,0 +1,45 @@
+# Due to memory constraints, run the histmaker multiple times, gradually appending to the output file.
+# Run e.g.: source scripts/utilities/run_histmakers.sh wmass /scratch/$USER/results_histmaker/ nominal --unfolding --genVars ptGen absEtaGen --genBins 32 24 --pt 32 25 57 --noAuxiliaryHistograms
+
+if [[ $# -lt 3 ]]; then
+    echo "Requires at least three arguments: run_histmakers.sh <MODE> <OUTPUT_DIR> <POSTFIX> (<OPTIONAL OPTS>)"
+    exit 1
+fi
+
+MODE=$1
+OUTPUT_DIR=$2
+POSTFIX=$3
+shift
+shift
+shift
+
+if [ "$MODE" == "wmass" ]; then
+    HISTMAKER="mw_with_mu_eta_pt"
+    separateProcs=("WminusmunuPostVFP" "WplusmunuPostVFP" "WminustaunuPostVFP" "WplustaunuPostVFP")
+elif [ "$MODE" == "wlike" ]; then
+    HISTMAKER="mz_wlike_with_mu_eta_pt"
+    separateProcs=("ZmumuPostVFP" "ZtautauPostVFP")
+elif [ "$MODE" == "dilepton" ]; then
+    HISTMAKER="mz_dilepton"
+    separateProcs=("ZmumuPostVFP" "ZtautauPostVFP")
+fi
+
+OUTPUT_FILE=$OUTPUT_DIR/${HISTMAKER}_${POSTFIX}.hdf5
+
+OPTS="--forceDefaultName --postfix $POSTFIX $@"
+
+CMD="python ./scripts/histmakers/${HISTMAKER}.py \
+    -o $OUTPUT_DIR $OPTS --excludeProcs ${separateProcs[@]}"
+# echo $CMD
+eval $CMD
+
+# Processes that are run individually, appending to the common output file
+for proc in "${separateProcs[@]}"; do
+    CMD="python ./scripts/histmakers/${HISTMAKER}.py \
+        --appendOutputFile $OUTPUT_FILE $OPTS --filterProcs $proc"
+    # echo $CMD
+    eval $CMD
+done

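Note: the shell driver above can equivalently be expressed in Python. The sketch below mirrors the wmass branch of the script (same histmaker, options, and process list); the output directory and postfix are placeholders, and the subprocess-based driver itself is illustrative rather than part of this commit.

    # Minimal Python sketch of the incremental histmaker workflow above (wmass mode).
    # Paths and process names are taken from the shell script; this driver is illustrative.
    import subprocess

    histmaker = "./scripts/histmakers/mw_with_mu_eta_pt.py"
    output_dir = "/scratch/results_histmaker"                     # hypothetical output directory
    output_file = f"{output_dir}/mw_with_mu_eta_pt_nominal.hdf5"  # file the later runs append to
    common_opts = ["--forceDefaultName", "--postfix", "nominal"]
    separate_procs = ["WminusmunuPostVFP", "WplusmunuPostVFP",
                      "WminustaunuPostVFP", "WplustaunuPostVFP"]

    # First pass: everything except the memory-heavy processes
    subprocess.run(["python", histmaker, "-o", output_dir, *common_opts,
                    "--excludeProcs", *separate_procs], check=True)

    # One appending pass per memory-heavy process
    for proc in separate_procs:
        subprocess.run(["python", histmaker, "--appendOutputFile", output_file,
                        *common_opts, "--filterProcs", proc], check=True)
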
1 change: 1 addition & 0 deletions utilities/common.py
@@ -178,6 +178,7 @@ def __call__(self, parser, namespace, values, option_string=None):
     parser.add_argument("--onlyMainHistograms", action='store_true', help="Only produce some histograms, skipping (most) systematics to run faster when those are not needed")
     parser.add_argument("--met", type=str, choices=["DeepMETReso", "RawPFMET"], help="MET (DeepMETReso or RawPFMET)", default="DeepMETReso")
     parser.add_argument("-o", "--outfolder", type=str, default="", help="Output folder")
+    parser.add_argument("--appendOutputFile", type=str, default="", help="Append analysis output to specified output file")
     parser.add_argument("-e", "--era", type=str, choices=["2016PreVFP","2016PostVFP", "2017", "2018"], help="Data set to process", default="2016PostVFP")
     parser.add_argument("--nonClosureScheme", type=str, default = "A-only", choices=["none", "A-M-separated", "A-M-combined", "binned", "binned-plus-M", "A-only", "M-only"], help = "source of the Z non-closure nuisances")
     parser.add_argument("--correlatedNonClosureNP", action="store_false", help="disable the de-correlation of Z non-closure nuisance parameters after the jpsi massfit")
20 changes: 14 additions & 6 deletions utilities/io_tools/input_tools.py
@@ -16,6 +16,12 @@

 scetlib_tnp_match_expr = ["^gamma_.*[+|-]\d+", "^b_.*[+|-]\d+", "^s[+|-]\d+", "^h_.*\d+"]

+def load_results_h5py(h5file):
+    if "results" in h5file.keys():
+        return ioutils.pickle_load_h5py(h5file["results"])
+    else:
+        return {k: ioutils.pickle_load_h5py(v) for k,v in h5file.items()}
+
 def read_and_scale_pkllz4(fname, proc, histname, calculate_lumi=False, scale=1):
     with lz4.frame.open(fname) as f:
         results = pickle.load(f)
@@ -24,29 +30,29 @@ def read_and_scale_pkllz4(fname, proc, histname, calculate_lumi=False, scale=1):

 def read_hist_names(fname, proc):
     with h5py.File(fname, "r") as h5file:
-        results = ioutils.pickle_load_h5py(h5file["results"])
+        results = load_results_h5py(h5file)
         if proc not in results:
             raise ValueError(f"Invalid process {proc}! No output found in file {fname}")
         return results[proc]["output"].keys()

 def read_keys(fname):
     with h5py.File(fname, "r") as h5file:
-        results = ioutils.pickle_load_h5py(h5file["results"])
+        results = load_results_h5py(h5file)
         return results.keys()

 def read_xsec(fname, proc):
     with h5py.File(fname, "r") as h5file:
-        results = ioutils.pickle_load_h5py(h5file["results"])
+        results = load_results_h5py(h5file)
         return results[proc]["dataset"]["xsec"]

 def read_sumw(fname, proc):
     with h5py.File(fname, "r") as h5file:
-        results = ioutils.pickle_load_h5py(h5file["results"])
+        results = load_results_h5py(h5file)
         return results[proc]["weight_sum"]

 def read_and_scale(fname, proc, histname, calculate_lumi=False, scale=1, apply_xsec=True):
     with h5py.File(fname, "r") as h5file:
-        results = ioutils.pickle_load_h5py(h5file["results"])
+        results = load_results_h5py(h5file)

     return load_and_scale(results, proc, histname, calculate_lumi, scale, apply_xsec)

@@ -68,7 +74,7 @@ def load_and_scale(res_dict, proc, histname, calculate_lumi=False, scale=1., app

 def read_all_and_scale(fname, procs, histnames, lumi=False):
     h5file = h5py.File(fname, "r")
-    results = ioutils.pickle_load_h5py(h5file["results"])
+    results = load_results_h5py(h5file)

     hists = []
     for histname in histnames:
@@ -417,6 +423,8 @@ def get_metadata(infile):
         results = pickle.load(f)
     elif infile.endswith(".hdf5"):
         h5file = h5py.File(infile, "r")
+        if "meta_info" in h5file.keys():
+            return ioutils.pickle_load_h5py(h5file["meta_info"])
         meta = h5file.get("results", h5file.get("meta", None))
         results = ioutils.pickle_load_h5py(meta) if meta else None
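The new load_results_h5py helper is what keeps old and new files interchangeable for every reader above. A minimal usage sketch, assuming the repo environment (h5py plus narf's ioutils pickling helpers); the file names are hypothetical:

    # Hedged sketch: load_results_h5py on both file layouts.
    import h5py
    from utilities.io_tools import input_tools

    # Old layout: a single top-level "results" entry holding the whole pickled dict.
    with h5py.File("old.hdf5", "r") as f:
        results = input_tools.load_results_h5py(f)  # unpickles f["results"] wholesale

    # New layout: one top-level entry per key (process results, meta_info, ...),
    # written key by key so that later runs can append without rewriting the file.
    with h5py.File("new.hdf5", "r") as f:
        results = input_tools.load_results_h5py(f)  # unpickles each key separately

    # Either way the caller sees the same dict-like object, e.g.
    # results["ZmumuPostVFP"]["output"], results["meta_info"], ...
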
27 changes: 22 additions & 5 deletions utilities/io_tools/output_tools.py
@@ -49,9 +49,8 @@ def writeMetaInfoToRootFile(rtfile, exclude_diff='notebooks', args=None):
         out = ROOT.TNamed(str(key), str(value))
         out.Write()

-def write_analysis_output(results, outfile, args, update_name=True):
+def write_analysis_output(results, outfile, args):
     analysis_debug_output(results)
-    results.update({"meta_info" : narf.ioutils.make_meta_info_dict(args=args, wd=common.base_dir)})

     to_append = []
     if args.theoryCorr and not args.theoryCorrAltOnly:
@@ -63,7 +62,7 @@ def write_analysis_output(results, outfile, args, update_name=True):
     if hasattr(args, "ptqVgen") and args.ptqVgen:
         to_append.append("vars_qtbyQ")

-    if to_append and update_name:
+    if to_append and not args.forceDefaultName:
         outfile = outfile.replace(".hdf5", f"_{'_'.join(to_append)}.hdf5")

     if args.postfix:
@@ -75,9 +74,27 @@ def write_analysis_output(results, outfile, args, update_name=True):
         os.makedirs(args.outfolder)
     outfile = os.path.join(args.outfolder, outfile)

+    if args.appendOutputFile:
+        outfile = args.appendOutputFile
+        if os.path.isfile(outfile):
+            logger.info(f"Analysis output will be appended to file {outfile}")
+            open_as="a"
+        else:
+            logger.warning(f"Analysis output requested to be appended to file {outfile}, but the file does not exist yet, it will be created instead")
+            open_as="w"
+    else:
+        if os.path.isfile(outfile):
+            logger.warning(f"Output file {outfile} exists already, it will be overwritten")
+        open_as="w"
+
     time0 = time.time()
-    with h5py.File(outfile, 'w') as f:
-        narf.ioutils.pickle_dump_h5py("results", results, f)
+    with h5py.File(outfile, open_as) as f:
+        for k, v in results.items():
+            logger.debug(f"Pickle and dump {k}")
+            narf.ioutils.pickle_dump_h5py(k, v, f)
+
+        if "meta_info" not in f.keys():
+            narf.ioutils.pickle_dump_h5py("meta_info", narf.ioutils.make_meta_info_dict(args=args, wd=common.base_dir), f)

     logger.info(f"Writing output: {time.time()-time0}")
     logger.info(f"Output saved in {outfile}")
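The rewritten writing loop stores each top-level result under its own HDF5 key and writes meta_info only when it is not already present, which is what makes the second and later runs pure appends. A minimal sketch of the resulting behavior, using the same h5py and narf.ioutils calls as the committed code, but with illustrative payloads and a hypothetical file name (the real code builds meta_info via narf.ioutils.make_meta_info_dict):

    # Hedged sketch of the append semantics across two runs.
    import h5py
    import narf.ioutils

    results_run1 = {"WplusmunuPostVFP": {"weight_sum": 1.0}}   # illustrative payloads
    results_run2 = {"ZmumuPostVFP": {"weight_sum": 2.0}}

    with h5py.File("out.hdf5", "w") as f:                      # first run: fresh file
        for k, v in results_run1.items():
            narf.ioutils.pickle_dump_h5py(k, v, f)
        if "meta_info" not in f.keys():
            narf.ioutils.pickle_dump_h5py("meta_info", {"command": "run1"}, f)

    with h5py.File("out.hdf5", "a") as f:                      # second run: append
        for k, v in results_run2.items():
            narf.ioutils.pickle_dump_h5py(k, v, f)
        if "meta_info" not in f.keys():                        # already present -> skipped
            narf.ioutils.pickle_dump_h5py("meta_info", {"command": "run2"}, f)

    # out.hdf5 now holds WplusmunuPostVFP, ZmumuPostVFP, and the first run's meta_info.
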
3 changes: 2 additions & 1 deletion wremnants/datasets/datagroups.py
@@ -1,4 +1,5 @@
 from utilities import boostHistHelpers as hh,common,logging
+from utilities.io_tools import input_tools
 import lz4.frame
 import pickle
 import h5py
@@ -29,7 +30,7 @@ def __init__(self, infile, mode=None, **kwargs):
         elif infile.endswith(".hdf5"):
             logger.info("Load input file")
             self.h5file = h5py.File(infile, "r")
-            self.results = narf.ioutils.pickle_load_h5py(self.h5file["results"])
+            self.results = input_tools.load_results_h5py(self.h5file)
         else:
             raise ValueError(f"{infile} has unsupported file type")
3 changes: 2 additions & 1 deletion wremnants/helicity_utils.py
@@ -11,6 +11,7 @@
 from .theory_tools import moments_to_angular_coeffs
 from utilities import common, logging
 from utilities import boostHistHelpers as hh
+from utilities.io_tools import input_tools
 import numpy as np
 import h5py
 import hdf5plugin
@@ -31,7 +32,7 @@ def makehelicityWeightHelper(is_w_like = False, filename=None):
     if filename is None:
         filename = f"{common.data_dir}/angularCoefficients/w_z_moments_theoryAgnosticBinning.hdf5"
     with h5py.File(filename, "r") as ff:
-        out = narf.ioutils.pickle_load_h5py(ff["results"])
+        out = input_tools.load_results_h5py(ff)

     moments = out["Z"] if is_w_like else out["W"]
4 changes: 2 additions & 2 deletions wremnants/theory_corrections.py
@@ -261,7 +261,7 @@ def make_qcd_uncertainty_helper_by_helicity(is_w_like = False, filename=None):

     # load moments from file
     with h5py.File(filename, "r") as h5file:
-        results = narf.ioutils.pickle_load_h5py(h5file["results"])
+        results = input_tools.load_results_h5py(h5file)
         moments = results["Z"] if is_w_like else results["W"]

     moments_nom = moments[{"muRfact" : 1.j, "muFfact" : 1.j}].values()
@@ -316,7 +316,7 @@ def make_helicity_test_corrector(is_w_like = False, filename = None):

     # load moments from file
     with h5py.File(filename, "r") as h5file:
-        results = narf.ioutils.pickle_load_h5py(h5file["results"])
+        results = input_tools.load_results_h5py(h5file)
         moments = results["Z"] if is_w_like else results["W"]

     coeffs = theory_tools.moments_to_angular_coeffs(moments)
