Skip to content
This repository has been archived by the owner on Jun 21, 2022. It is now read-only.

Commit

Permalink
Merge pull request #264 from scikit-hep/issue-263
Browse files Browse the repository at this point in the history
Fix iteration over DataFrames and provide more interfaces
  • Loading branch information
jpivarski authored Mar 27, 2019
2 parents 3cd056f + 922cd1c commit adba8a5
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 12 deletions.
4 changes: 3 additions & 1 deletion uproot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,8 @@
from uproot.interp.objects import STLString
asdebug = asjagged(asdtype("u1"))

from uproot import pandas

# put help strings on everything (they're long, too disruptive to intersperse
# in the code, and are built programmatically to avoid duplication; Python's
# inline docstring method doesn't accept non-literals)
Expand All @@ -175,4 +177,4 @@
# don't expose uproot.uproot; it's ugly
del uproot

__all__ = ["open", "xrootd", "http", "iterate", "numentries", "lazyarray", "lazyarrays", "daskarray", "daskarrays", "daskframe", "create", "recreate", "update", "MemmapSource", "FileSource", "XRootDSource", "HTTPSource", "interpret", "asdtype", "asarray", "asdouble32", "asstlbitset", "asjagged", "astable", "asobj", "asgenobj", "asstring", "asdebug", "SimpleArray", "STLVector", "STLMap", "STLString", "__version__"]
__all__ = ["open", "xrootd", "http", "iterate", "numentries", "lazyarray", "lazyarrays", "daskarray", "daskarrays", "daskframe", "create", "recreate", "update", "MemmapSource", "FileSource", "XRootDSource", "HTTPSource", "interpret", "asdtype", "asarray", "asdouble32", "asstlbitset", "asjagged", "astable", "asobj", "asgenobj", "asstring", "asdebug", "SimpleArray", "STLVector", "STLMap", "STLString", "pandas", "__version__"]
20 changes: 16 additions & 4 deletions uproot/_connect/to_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,19 +36,28 @@

import awkward as awkwardbase

import uproot.tree
import uproot.interp.numerical
from uproot.interp.jagged import asjagged
from uproot.interp.numerical import asdtype
from uproot.interp.objects import asobj
from uproot.interp.objects import astable

from uproot.source.memmap import MemmapSource
from uproot.source.xrootd import XRootDSource
from uproot.source.http import HTTPSource

class TTreeMethods_pandas(object):
def __init__(self, tree):
self._tree = tree

def df(self, branches=None, namedecode="utf-8", entrystart=None, entrystop=None, flatten=True, flatname=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True):
def df(self, branches=None, namedecode="utf-8", entrystart=None, entrystop=None, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True):
import pandas
return self._tree.arrays(branches=branches, outputtype=pandas.DataFrame, namedecode=namedecode, entrystart=entrystart, entrystop=entrystop, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking)

def iterate(self, branches=None, entrysteps=None, namedecode="utf-8", entrystart=None, entrystop=None, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True):
import pandas
return self._tree.arrays(branches=branches, outputtype=pandas.DataFrame, namedecode=namedecode, entrystart=entrystart, entrystop=entrystop, flatten=flatten, flatname=flatname, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking)
return self._tree.iterate(branches=branches, entrysteps=entrysteps, outputtype=pandas.DataFrame, namedecode=namedecode, reportentries=False, entrystart=entrystart, entrystop=entrystop, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking)

def default_flatname(branchname, fieldname, index):
out = branchname
Expand Down Expand Up @@ -131,7 +140,7 @@ def futures2df(futures, outputtype, entrystart, entrystop, flatten, flatname, aw
interpretation = interpretation.content

# justifies the assumption that array.content == array.flatten() and array.stops.max() == array.stops[-1]
assert array._canuseoffset() and len(array.starts) > 0 and array.starts[0] == 0
assert array._canuseoffset() and (len(array.starts) == 0 or array.starts[0] == 0)

if starts is None:
starts = array.starts
Expand All @@ -141,7 +150,10 @@ def futures2df(futures, outputtype, entrystart, entrystop, flatten, flatname, aw
if starts is not array.starts and not awkward.numpy.array_equal(starts, array.starts):
raise ValueError("cannot use flatten=True on branches with different jagged structure, such as electrons and muons (different, variable number of each per event); either explicitly select compatible branches, such as [\"MET_*\", \"Muon_*\"] (scalar and variable per event is okay), or set flatten=False")

array = array.content
if len(array.starts) == 0:
array = array.content[0:0]
else:
array = array.content
needbroadcasts.append(False)

else:
Expand Down
40 changes: 40 additions & 0 deletions uproot/pandas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
#!/usr/bin/env python

# Copyright (c) 2019, IRIS-HEP
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
# * Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# * Redistributions in binary form must reproduce the above copyright notice,
# this list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
#
# * Neither the name of the copyright holder nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""Top-level functions for Pandas."""

import uproot.tree
from uproot.source.memmap import MemmapSource
from uproot.source.xrootd import XRootDSource
from uproot.source.http import HTTPSource

def iterate(path, treepath, branches=None, entrysteps=None, namedecode="utf-8", reportpath=False, reportfile=False, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True, localsource=MemmapSource.defaults, xrootdsource=XRootDSource.defaults, httpsource=HTTPSource.defaults, **options):
    """Iterate over ``treepath`` in the file(s) at ``path`` as Pandas DataFrames.

    This is a thin wrapper around ``uproot.tree.iterate``: it fixes
    ``outputtype`` to ``pandas.DataFrame`` and ``reportentries`` to ``False``
    and forwards every other argument (including ``**options``) unchanged.
    The return value is whatever ``uproot.tree.iterate`` returns for those
    settings.

    ``pandas`` is imported when the function is called, not when this module
    is imported.
    """
    import pandas

    frame_type = pandas.DataFrame

    # Explicit keywords are kept separate from **options so that a duplicate
    # key in ``options`` still raises TypeError, exactly as a direct call would.
    return uproot.tree.iterate(
        path,
        treepath,
        branches=branches,
        entrysteps=entrysteps,
        outputtype=frame_type,
        namedecode=namedecode,
        reportpath=reportpath,
        reportfile=reportfile,
        reportentries=False,
        flatten=flatten,
        flatname=flatname,
        awkwardlib=awkwardlib,
        cache=cache,
        basketcache=basketcache,
        keycache=keycache,
        executor=executor,
        blocking=blocking,
        localsource=localsource,
        xrootdsource=xrootdsource,
        httpsource=httpsource,
        **options
    )
2 changes: 1 addition & 1 deletion uproot/rootio.py
Original file line number Diff line number Diff line change
Expand Up @@ -666,7 +666,7 @@ def _defineclasses(streamerinfos, classes):

if isinstance(streamerinfo, TStreamerInfo) and pyclassname not in builtin_classes and (pyclassname not in classes or hasattr(classes[pyclassname], "_versions")):
code = [" @classmethod",
" def _readinto(cls, self, source, cursor, context, parent):",
" def _readinto(cls, self, source, cursor, context, parent, asclass=None):",
" start, cnt, classversion = _startcheck(source, cursor)",
" if cls._classversion != classversion:",
" cursor.index = start",
Expand Down
27 changes: 22 additions & 5 deletions uproot/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,8 +115,15 @@ def iterate(path, treepath, branches=None, entrysteps=None, outputtype=dict, nam
for tree, newbranches, globalentrystart, thispath, thisfile in _iterate(path, treepath, branches, awkward, localsource, xrootdsource, httpsource, **options):
for start, stop, arrays in tree.iterate(branches=newbranches, entrysteps=entrysteps, outputtype=outputtype, namedecode=namedecode, reportentries=True, entrystart=0, entrystop=tree.numentries, flatten=flatten, flatname=flatname, awkwardlib=awkward, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking):
if getattr(outputtype, "__name__", None) == "DataFrame" and getattr(outputtype, "__module__", None) == "pandas.core.frame":
index = awkward.numpy.frombuffer(arrays.index.data, dtype=arrays.index.dtype)
awkward.numpy.add(index, globalentrystart, index)
if type(arrays.index).__name__ == "MultiIndex":
index = arrays.index.levels[0].to_numpy()
awkward.numpy.add(index, globalentrystart, out=index)
elif type(arrays.index).__name__ == "RangeIndex":
arrays.index._start += globalentrystart
arrays.index._stop += globalentrystart
else:
index = arrays.index.to_numpy()
awkward.numpy.add(index, globalentrystart, out=index)
out = (arrays,)
if reportentries:
out = (globalentrystart + start, globalentrystart + stop) + out
Expand Down Expand Up @@ -519,6 +526,9 @@ def lazyarrays(self, branches=None, outputtype=dict, namedecode=None, limitbytes
def iterate(self, branches=None, entrysteps=None, outputtype=dict, namedecode=None, reportentries=False, entrystart=None, entrystop=None, flatten=False, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True):
entrystart, entrystop = self._normalize_entrystartstop(entrystart, entrystop)

# for the case of outputtype == pandas.DataFrame, do some preparation to fill DataFrames efficiently
ispandas = getattr(outputtype, "__name__", None) == "DataFrame" and getattr(outputtype, "__module__", None) == "pandas.core.frame"

if entrysteps is None:
entrysteps = self.clusters(branches, entrystart=entrystart, entrystop=entrystop, strict=False)

Expand Down Expand Up @@ -561,7 +571,7 @@ def evaluate(branch, interpretation, future, past, cachekey, pythonize):
if cache is not None:
cache[cachekey] = out
if flatten and isinstance(interpretation, asjagged):
return out.content
return out.flatten()
elif pythonize:
return list(out)
else:
Expand All @@ -571,15 +581,22 @@ def evaluate(branch, interpretation, future, past, cachekey, pythonize):
outputtype = namedtuple("Arrays", [codecs.ascii_decode(branch.name, "replace")[0] if namedecode is None else branch.name.decode(namedecode) for branch, interpretation in branches])
def wrap_for_python_scope(futures, start, stop):
return lambda: outputtype(*[evaluate(branch, interpretation, future, past, cachekey, False) for branch, interpretation, future, past, cachekey in futures])
elif getattr(outputtype, "__name__", None) == "DataFrame" and getattr(outputtype, "__module__", None) == "pandas.core.frame":

elif ispandas:
import uproot._connect.to_pandas
def wrap_for_python_scope(futures, start, stop):
return lambda: outputtype(data=OrderedDict((branch.name if namedecode is None else branch.name.decode(namedecode), evaluate(branch, interpretation, future, past, cachekey, isinstance(interpretation, asjagged))) for branch, interpretation, future, past, cachekey in futures), index=awkward.numpy.arange(start, stop))
def wrap_again(branch, interpretation, future):
return lambda: interpretation.finalize(future(), branch)
return lambda: uproot._connect.to_pandas.futures2df([(branch.name, interpretation, wrap_again(branch, interpretation, future)) for branch, interpretation, future, past, cachekey in futures], outputtype, start, stop, flatten, flatname, awkward)

elif isinstance(outputtype, type) and issubclass(outputtype, dict):
def wrap_for_python_scope(futures, start, stop):
return lambda: outputtype((branch.name if namedecode is None else branch.name.decode(namedecode), evaluate(branch, interpretation, future, past, cachekey, False)) for branch, interpretation, future, past, cachekey in futures)

elif isinstance(outputtype, type) and issubclass(outputtype, (list, tuple)):
def wrap_for_python_scope(futures, start, stop):
return lambda: outputtype(evaluate(branch, interpretation, future, past, cachekey, False) for branch, interpretation, future, past, cachekey in futures)

else:
def wrap_for_python_scope(futures, start, stop):
return lambda: outputtype(*[evaluate(branch, interpretation, future, past, cachekey, False) for branch, interpretation, future, past, cachekey in futures])
Expand Down
2 changes: 1 addition & 1 deletion uproot/version.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

import re

__version__ = "3.4.15"
__version__ = "3.4.16"
version = __version__
version_info = tuple(re.split(r"[-\.]", __version__))

Expand Down

0 comments on commit adba8a5

Please sign in to comment.