Merge pull request #264 from scikit-hep/issue-263

Fix iteration over DataFrames and provide more interfaces
scikit-hep · Mar 27, 2019 · adba8a5
2 parents 3cd056f + 922cd1c
commit adba8a5
Show file tree

Hide file tree

Showing 6 changed files with 83 additions and 12 deletions.
diff --git a/uproot/__init__.py b/uproot/__init__.py
@@ -164,6 +164,8 @@
 from uproot.interp.objects import STLString
 asdebug = asjagged(asdtype("u1"))
 
+from uproot import pandas
+
 # put help strings on everything (they're long, too disruptive to intersperse
 # in the code, and are built programmatically to avoid duplication; Python's
 # inline docstring method doesn't accept non-literals)
@@ -175,4 +177,4 @@
 # don't expose uproot.uproot; it's ugly
 del uproot
 
-__all__ = ["open", "xrootd", "http", "iterate", "numentries", "lazyarray", "lazyarrays", "daskarray", "daskarrays", "daskframe", "create", "recreate", "update", "MemmapSource", "FileSource", "XRootDSource", "HTTPSource", "interpret", "asdtype", "asarray", "asdouble32", "asstlbitset", "asjagged", "astable", "asobj", "asgenobj", "asstring", "asdebug", "SimpleArray", "STLVector", "STLMap", "STLString", "__version__"]
+__all__ = ["open", "xrootd", "http", "iterate", "numentries", "lazyarray", "lazyarrays", "daskarray", "daskarrays", "daskframe", "create", "recreate", "update", "MemmapSource", "FileSource", "XRootDSource", "HTTPSource", "interpret", "asdtype", "asarray", "asdouble32", "asstlbitset", "asjagged", "astable", "asobj", "asgenobj", "asstring", "asdebug", "SimpleArray", "STLVector", "STLMap", "STLString", "pandas", "__version__"]
diff --git a/uproot/_connect/to_pandas.py b/uproot/_connect/to_pandas.py
@@ -36,19 +36,28 @@
 
 import awkward as awkwardbase
 
+import uproot.tree
 import uproot.interp.numerical
 from uproot.interp.jagged import asjagged
 from uproot.interp.numerical import asdtype
 from uproot.interp.objects import asobj
 from uproot.interp.objects import astable
 
+from uproot.source.memmap import MemmapSource
+from uproot.source.xrootd import XRootDSource
+from uproot.source.http import HTTPSource
+
 class TTreeMethods_pandas(object):
  def __init__(self, tree):
  self._tree = tree
 
- def df(self, branches=None, namedecode="utf-8", entrystart=None, entrystop=None, flatten=True, flatname=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True):
+ def df(self, branches=None, namedecode="utf-8", entrystart=None, entrystop=None, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True):
+ import pandas
+ return self._tree.arrays(branches=branches, outputtype=pandas.DataFrame, namedecode=namedecode, entrystart=entrystart, entrystop=entrystop, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking)
+
+ def iterate(self, branches=None, entrysteps=None, namedecode="utf-8", entrystart=None, entrystop=None, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True):
  import pandas
- return self._tree.arrays(branches=branches, outputtype=pandas.DataFrame, namedecode=namedecode, entrystart=entrystart, entrystop=entrystop, flatten=flatten, flatname=flatname, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking)
+ return self._tree.iterate(branches=branches, entrysteps=entrysteps, outputtype=pandas.DataFrame, namedecode=namedecode, reportentries=False, entrystart=entrystart, entrystop=entrystop, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking)
 
 def default_flatname(branchname, fieldname, index):
  out = branchname
@@ -131,7 +140,7 @@ def futures2df(futures, outputtype, entrystart, entrystop, flatten, flatname, aw
  interpretation = interpretation.content
 
  # justifies the assumption that array.content == array.flatten() and array.stops.max() == array.stops[-1]
- assert array._canuseoffset() and len(array.starts) > 0 and array.starts[0] == 0
+ assert array._canuseoffset() and (len(array.starts) == 0 or array.starts[0] == 0)
 
  if starts is None:
  starts = array.starts
@@ -141,7 +150,10 @@ def futures2df(futures, outputtype, entrystart, entrystop, flatten, flatname, aw
  if starts is not array.starts and not awkward.numpy.array_equal(starts, array.starts):
  raise ValueError("cannot use flatten=True on branches with different jagged structure, such as electrons and muons (different, variable number of each per event); either explicitly select compatible branches, such as [\"MET_*\", \"Muon_*\"] (scalar and variable per event is okay), or set flatten=False")
 
- array = array.content
+ if len(array.starts) == 0:
+ array = array.content[0:0]
+ else:
+ array = array.content
  needbroadcasts.append(False)
 
  else:

diff --git a/uproot/pandas.py b/uproot/pandas.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python
+
+# Copyright (c) 2019, IRIS-HEP
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are met:
+#
+# * Redistributions of source code must retain the above copyright notice, this
+# list of conditions and the following disclaimer.
+#
+# * Redistributions in binary form must reproduce the above copyright notice,
+# this list of conditions and the following disclaimer in the documentation
+# and/or other materials provided with the distribution.
+#
+# * Neither the name of the copyright holder nor the names of its
+# contributors may be used to endorse or promote products derived from
+# this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Top-level functions for Pandas."""
+
+import uproot.tree
+from uproot.source.memmap import MemmapSource
+from uproot.source.xrootd import XRootDSource
+from uproot.source.http import HTTPSource
+
+def iterate(path, treepath, branches=None, entrysteps=None, namedecode="utf-8", reportpath=False, reportfile=False, flatten=True, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True, localsource=MemmapSource.defaults, xrootdsource=XRootDSource.defaults, httpsource=HTTPSource.defaults, **options):
+ import pandas
+ return uproot.tree.iterate(path, treepath, branches=branches, entrysteps=entrysteps, outputtype=pandas.DataFrame, namedecode=namedecode, reportpath=reportpath, reportfile=reportfile, reportentries=False, flatten=flatten, flatname=flatname, awkwardlib=awkwardlib, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking, localsource=localsource, xrootdsource=xrootdsource, httpsource=httpsource, **options)
diff --git a/uproot/rootio.py b/uproot/rootio.py
@@ -666,7 +666,7 @@ def _defineclasses(streamerinfos, classes):
 
  if isinstance(streamerinfo, TStreamerInfo) and pyclassname not in builtin_classes and (pyclassname not in classes or hasattr(classes[pyclassname], "_versions")):
  code = [" @classmethod",
- " def _readinto(cls, self, source, cursor, context, parent):",
+ " def _readinto(cls, self, source, cursor, context, parent, asclass=None):",
  " start, cnt, classversion = _startcheck(source, cursor)",
  " if cls._classversion != classversion:",
  " cursor.index = start",

diff --git a/uproot/tree.py b/uproot/tree.py
@@ -115,8 +115,15 @@ def iterate(path, treepath, branches=None, entrysteps=None, outputtype=dict, nam
  for tree, newbranches, globalentrystart, thispath, thisfile in _iterate(path, treepath, branches, awkward, localsource, xrootdsource, httpsource, **options):
  for start, stop, arrays in tree.iterate(branches=newbranches, entrysteps=entrysteps, outputtype=outputtype, namedecode=namedecode, reportentries=True, entrystart=0, entrystop=tree.numentries, flatten=flatten, flatname=flatname, awkwardlib=awkward, cache=cache, basketcache=basketcache, keycache=keycache, executor=executor, blocking=blocking):
  if getattr(outputtype, "__name__", None) == "DataFrame" and getattr(outputtype, "__module__", None) == "pandas.core.frame":
- index = awkward.numpy.frombuffer(arrays.index.data, dtype=arrays.index.dtype)
- awkward.numpy.add(index, globalentrystart, index)
+ if type(arrays.index).__name__ == "MultiIndex":
+ index = arrays.index.levels[0].to_numpy()
+ awkward.numpy.add(index, globalentrystart, out=index)
+ elif type(arrays.index).__name__ == "RangeIndex":
+ arrays.index._start += globalentrystart
+ arrays.index._stop += globalentrystart
+ else:
+ index = arrays.index.to_numpy()
+ awkward.numpy.add(index, globalentrystart, out=index)
  out = (arrays,)
  if reportentries:
  out = (globalentrystart + start, globalentrystart + stop) + out
@@ -519,6 +526,9 @@ def lazyarrays(self, branches=None, outputtype=dict, namedecode=None, limitbytes
  def iterate(self, branches=None, entrysteps=None, outputtype=dict, namedecode=None, reportentries=False, entrystart=None, entrystop=None, flatten=False, flatname=None, awkwardlib=None, cache=None, basketcache=None, keycache=None, executor=None, blocking=True):
  entrystart, entrystop = self._normalize_entrystartstop(entrystart, entrystop)
 
+ # for the case of outputtype == pandas.DataFrame, do some preparation to fill DataFrames efficiently
+ ispandas = getattr(outputtype, "__name__", None) == "DataFrame" and getattr(outputtype, "__module__", None) == "pandas.core.frame"
+
  if entrysteps is None:
  entrysteps = self.clusters(branches, entrystart=entrystart, entrystop=entrystop, strict=False)
 
@@ -561,7 +571,7 @@ def evaluate(branch, interpretation, future, past, cachekey, pythonize):
  if cache is not None:
  cache[cachekey] = out
  if flatten and isinstance(interpretation, asjagged):
- return out.content
+ return out.flatten()
  elif pythonize:
  return list(out)
  else:
@@ -571,15 +581,22 @@ def evaluate(branch, interpretation, future, past, cachekey, pythonize):
  outputtype = namedtuple("Arrays", [codecs.ascii_decode(branch.name, "replace")[0] if namedecode is None else branch.name.decode(namedecode) for branch, interpretation in branches])
  def wrap_for_python_scope(futures, start, stop):
  return lambda: outputtype(*[evaluate(branch, interpretation, future, past, cachekey, False) for branch, interpretation, future, past, cachekey in futures])
- elif getattr(outputtype, "__name__", None) == "DataFrame" and getattr(outputtype, "__module__", None) == "pandas.core.frame":
+
+ elif ispandas:
+ import uproot._connect.to_pandas
  def wrap_for_python_scope(futures, start, stop):
- return lambda: outputtype(data=OrderedDict((branch.name if namedecode is None else branch.name.decode(namedecode), evaluate(branch, interpretation, future, past, cachekey, isinstance(interpretation, asjagged))) for branch, interpretation, future, past, cachekey in futures), index=awkward.numpy.arange(start, stop))
+ def wrap_again(branch, interpretation, future):
+ return lambda: interpretation.finalize(future(), branch)
+ return lambda: uproot._connect.to_pandas.futures2df([(branch.name, interpretation, wrap_again(branch, interpretation, future)) for branch, interpretation, future, past, cachekey in futures], outputtype, start, stop, flatten, flatname, awkward)
+
  elif isinstance(outputtype, type) and issubclass(outputtype, dict):
  def wrap_for_python_scope(futures, start, stop):
  return lambda: outputtype((branch.name if namedecode is None else branch.name.decode(namedecode), evaluate(branch, interpretation, future, past, cachekey, False)) for branch, interpretation, future, past, cachekey in futures)
+
  elif isinstance(outputtype, type) and issubclass(outputtype, (list, tuple)):
  def wrap_for_python_scope(futures, start, stop):
  return lambda: outputtype(evaluate(branch, interpretation, future, past, cachekey, False) for branch, interpretation, future, past, cachekey in futures)
+
  else:
  def wrap_for_python_scope(futures, start, stop):
  return lambda: outputtype(*[evaluate(branch, interpretation, future, past, cachekey, False) for branch, interpretation, future, past, cachekey in futures])

diff --git a/uproot/version.py b/uproot/version.py
@@ -30,7 +30,7 @@
 
 import re
 
-__version__ = "3.4.15"
+__version__ = "3.4.16"
 version = __version__
 version_info = tuple(re.split(r"[-\.]", __version__))