Skip to content

Commit

Permalink
maybe handle remote paths in merge_parquet
Browse files Browse the repository at this point in the history
  • Loading branch information
ikrommyd committed Sep 9, 2024
1 parent c1efa1e commit f691c4a
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 7 deletions.
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -121,3 +121,4 @@ isort.required-imports = ["from __future__ import annotations"]
[tool.ruff.lint.per-file-ignores]
"**.ipynb" = ["B008", "T20", "I002", "E402", "E703", "B018"]
"src/egamma_tnp/nanoaod_efficiency.py" = ["PLW2901"]
"src/egamma_tnp/__init__.py" = ["E402"]
23 changes: 16 additions & 7 deletions scripts/merge_parquet.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
from __future__ import annotations

import argparse
import logging
import os

import awkward as ak
import fsspec

# Standard-library logger named after this module.
logger = logging.getLogger(__name__)
from egamma_tnp.utils.logger_utils import setup_logger

# Rebinds `logger` to whatever the project's setup_logger helper returns,
# requested with level "INFO"; this binding is the one used from here on.
logger = setup_logger(level="INFO")


def generate(files):
Expand All @@ -18,14 +19,22 @@ def generate(files):

def main():
    """Merge a set of parquet files into a single parquet file.

    Command-line arguments:
        --source: A folder, a single file, or a wildcard pattern. The value
            is resolved through ``fsspec.get_fs_token_paths``, so the
            filesystem is chosen by fsspec from the path given.
        --target: Location of the merged parquet file.

    Raises:
        FileNotFoundError: If ``--source`` resolves to no files at all, so
            that an empty merge is reported instead of passed to the writer.
    """
    parser = argparse.ArgumentParser(description="Simple utility script to merge all parquet files in one folder.")
    parser.add_argument("--source", type=str, required=True, help="Source folder or path of files with wildcards containing parquet files.")
    parser.add_argument("--target", type=str, required=True, help="Target parquet file location.")
    args = parser.parse_args()

    logger.info(f"Merging parquet files from {args.source} to {args.target}")

    # fsspec picks the filesystem and expands any wildcard in the source.
    fs, _, paths = fsspec.get_fs_token_paths(args.source)

    if len(paths) == 1 and not fs.isfile(paths[0]):
        # A single path that is not a file is treated as a folder: take the
        # parquet files directly inside it.
        files = fs.glob(f"{paths[0]}/*.parquet")
    else:
        # Either one explicit file or an already-expanded list of paths.
        files = paths

    if not files:
        raise FileNotFoundError(f"No parquet files found at {args.source}")

    # Paths come back without their protocol prefix; restore it so that the
    # reader opens them on the same filesystem they were discovered on.
    final_files = [fs.unstrip_protocol(f) for f in files]
    ak.to_parquet_row_groups(generate(final_files), args.target)
    logger.info("Done.")


Expand Down
5 changes: 5 additions & 0 deletions src/egamma_tnp/__init__.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
from __future__ import annotations

import warnings

# Ignore FutureWarning messages attributed to modules matching "coffea.*".
# NOTE(review): registered ahead of the package imports below, presumably so
# the filter is already active while those imports execute -- confirm.
warnings.filterwarnings("ignore", category=FutureWarning, module="coffea.*")

from egamma_tnp.config import binning
from egamma_tnp.nanoaod_efficiency import ElectronTagNProbeFromNanoAOD, PhotonTagNProbeFromNanoAOD
from egamma_tnp.ntuple_efficiency import ElectronTagNProbeFromNTuples, PhotonTagNProbeFromNTuples

from . import _version

# Same filter call as the one above, issued again after the imports.
warnings.filterwarnings("ignore", category=FutureWarning, module="coffea.*")
# Expose the version string taken from the _version module.
__version__ = _version.__version__
# Names exported by `from egamma_tnp import *`.
__all__ = ("binning", "ElectronTagNProbeFromNTuples", "ElectronTagNProbeFromNanoAOD", "PhotonTagNProbeFromNTuples", "PhotonTagNProbeFromNanoAOD")

Expand Down

0 comments on commit f691c4a

Please sign in to comment.