Isolate two execution modes in main function #50

Merged
merged 1 commit on Jul 26, 2024

153 changes: 85 additions & 68 deletions analyses/cms-open-data-ttbar/analysis.py
@@ -1,8 +1,8 @@
import argparse
import multiprocessing
from pathlib import Path
from time import time
from typing import Optional, Tuple

Check failure on line 5 in analyses/cms-open-data-ttbar/analysis.py (GitHub Actions / linter):
Ruff (F401): analyses/cms-open-data-ttbar/analysis.py:5:20: `typing.Optional` imported but unused
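
A straightforward fix, assuming `Optional` is not referenced anywhere else in the module now that `make_rdf` is removed below, would be to trim the import on line 5:

```python
from typing import Tuple  # Optional dropped; it was only needed by the removed make_rdf
```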

from distributed import Client, get_worker, LocalCluster, SSHCluster
import ml
@@ -113,26 +113,6 @@
f"Unexpected scheduling mode '{scheduler}'. Valid modes are ['dask-local', 'dask-ssh']."
)
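
For orientation, here is a minimal sketch of what a `create_dask_client` dispatcher of this shape might look like; the cluster options and the host-list format are assumptions, only the two mode names come from the error message above:

```python
from distributed import Client, LocalCluster, SSHCluster


def create_dask_client_sketch(scheduler: str, ncores: int, hosts: list[str]) -> Client:
    """Illustrative only: return a Dask client for the requested scheduling mode."""
    if scheduler == "dask-local":
        # One single-threaded worker per requested core on the local machine.
        cluster = LocalCluster(n_workers=ncores, threads_per_worker=1, processes=True)
        return Client(cluster)
    if scheduler == "dask-ssh":
        # The first entry runs the Dask scheduler, every entry runs workers.
        cluster = SSHCluster([hosts[0]] + hosts)
        return Client(cluster)
    raise ValueError(
        f"Unexpected scheduling mode '{scheduler}'. Valid modes are ['dask-local', 'dask-ssh']."
    )
```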


def make_rdf(
    files: list[str], client: Optional[Client], npartitions: Optional[int]
) -> ROOT.RDataFrame:
    """Construct and return a dataframe or, if a dask client is present, a distributed dataframe."""
    if client is not None:
        d = ROOT.RDF.Experimental.Distributed.Dask.RDataFrame(
            "Events", files, daskclient=client, npartitions=npartitions
        )
        d._headnode.backend.distribute_unique_paths(
            [
                "helpers.h",
                "ml_helpers.cpp",
            ]
        )
        return d

    return ROOT.RDataFrame("Events", files)


def define_trijet_mass(df: ROOT.RDataFrame) -> ROOT.RDataFrame:
"""Add the trijet_mass observable to the dataframe after applying the appropriate selections."""

@@ -322,6 +302,88 @@
    ROOT.gSystem.CompileMacro(str(cpp_source), "kO")


def run_mt(
        program_start: float,
        args: argparse.Namespace,
        inputs: list[AGCInput],
        results: list[AGCResult],
        ml_results: list[AGCResult]) -> None:
    ROOT.EnableImplicitMT(args.ncores)
    print(f"Number of threads: {ROOT.GetThreadPoolSize()}")
    load_cpp()
    if args.inference:
        ml.load_cpp()

    for input in inputs:
        df = ROOT.RDataFrame("Events", input.paths)
        hist_list, ml_hist_list = book_histos(
            df, input.process, input.variation, input.nevents, inference=args.inference
        )
        results += hist_list
        ml_results += ml_hist_list

    for r in results + ml_results:
        if r.should_vary:
            r.histo = ROOT.RDF.Experimental.VariationsFor(r.histo)

    print(
        f"Building the computation graphs took {time() - program_start:.2f} seconds")

    # Run the event loops for all processes and variations here
    run_graphs_start = time()
    ROOT.RDF.RunGraphs([r.nominal_histo for r in results + ml_results])

    print(
        f"Executing the computation graphs took {time() - run_graphs_start:.2f} seconds")


def run_distributed(
        program_start: float,
        args: argparse.Namespace,
        inputs: list[AGCInput],
        results: list[AGCResult],
        ml_results: list[AGCResult]) -> None:
    if args.inference:
        ROOT.RDF.Experimental.Distributed.initialize(load_cpp)
    if args.inference:
        ROOT.RDF.Experimental.Distributed.initialize(ml.load_cpp)
    else:
        ROOT.RDF.Experimental.Distributed.initialize(load_cpp)

    with create_dask_client(args.scheduler, args.ncores, args.hosts) as client:
        for input in inputs:
            df = ROOT.RDF.Experimental.Distributed.Dask.RDataFrame(
                "Events", input.paths, daskclient=client, npartitions=args.npartitions
            )
            df._headnode.backend.distribute_unique_paths(
                [
                    "helpers.h",
                    "ml_helpers.cpp",
                ]
            )
            hist_list, ml_hist_list = book_histos(
                df, input.process, input.variation, input.nevents, inference=args.inference
            )
            results += hist_list
            ml_results += ml_hist_list

        for r in results + ml_results:
            if r.should_vary:
                r.histo = ROOT.RDF.Experimental.Distributed.VariationsFor(
                    r.histo)

        print(
            f"Building the computation graphs took {time() - program_start:.2f} seconds")

        # Run the event loops for all processes and variations here
        run_graphs_start = time()
        ROOT.RDF.Experimental.Distributed.RunGraphs(
            [r.nominal_histo for r in results + ml_results])

        print(
            f"Executing the computation graphs took {time() - run_graphs_start:.2f} seconds")


def main() -> None:
    program_start = time()
    args = parse_args()
@@ -336,61 +398,16 @@
    # To only change the verbosity in a given scope, use ROOT.Experimental.RLogScopedVerbosity.
    ROOT.Detail.RDF.RDFLogChannel().SetVerbosity(ROOT.Experimental.ELogLevel.kInfo)

    if args.scheduler == "mt":
        # Setup for local, multi-thread RDataFrame
        ROOT.EnableImplicitMT(args.ncores)
        print(f"Number of threads: {ROOT.GetThreadPoolSize()}")
        client = None
        load_cpp()
        if args.inference:
            ml.load_cpp()

        run_graphs = ROOT.RDF.RunGraphs
    else:
        # Setup for distributed RDataFrame
        client = create_dask_client(args.scheduler, args.ncores, args.hosts)
        if args.inference:
            ROOT.RDF.Experimental.Distributed.initialize(load_cpp)
        if args.inference:
            ROOT.RDF.Experimental.Distributed.initialize(ml.load_cpp)

        else:
            ROOT.RDF.Experimental.Distributed.initialize(load_cpp)
        run_graphs = ROOT.RDF.Experimental.Distributed.RunGraphs

    # Book RDataFrame results
    inputs: list[AGCInput] = retrieve_inputs(
        args.n_max_files_per_sample, args.remote_data_prefix, args.data_cache
    )
    results: list[AGCResult] = []
    ml_results: list[AGCResult] = []

    for input in inputs:
        df = make_rdf(input.paths, client, args.npartitions)
        hist_list, ml_hist_list = book_histos(
            df, input.process, input.variation, input.nevents, inference=args.inference
        )
        results += hist_list
        ml_results += ml_hist_list

    # Select the right VariationsFor function depending on RDF or DistRDF
    if type(df).__module__ == "DistRDF.DataFrame":
        variationsfor_func = ROOT.RDF.Experimental.Distributed.VariationsFor
    if args.scheduler == "mt":
        run_mt(program_start, args, inputs, results, ml_results)
    else:
        variationsfor_func = ROOT.RDF.Experimental.VariationsFor
    for r in results + ml_results:
        if r.should_vary:
            r.histo = variationsfor_func(r.histo)

print(f"Building the computation graphs took {time() - program_start:.2f} seconds")

# Run the event loops for all processes and variations here
run_graphs_start = time()
run_graphs([r.nominal_histo for r in results + ml_results])

print(f"Executing the computation graphs took {time() - run_graphs_start:.2f} seconds")
if client is not None:
client.close()
run_distributed(program_start, args, inputs, results, ml_results)

    results = postprocess_results(results)
    save_plots(results)