Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add known forth for ATLAS #1282

Open
wants to merge 14 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 9 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions src/uproot/interpretation/known_forth/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE

"""
This module defines known forth code for types that are known a priori.
"""
from __future__ import annotations

from uproot.interpretation.known_forth.atlas.element_link import VectorVectorElementLink

# Registry of C++ typenames that have hand-written Forth code. Each key is a
# typename string; each value is the class that ``known_forth_of`` instantiates
# (with the typename as its only argument) for a model of that type.
KNOWN_FORTH_DICT = {
"std::vector<std::vector<ElementLink<DataVector<xAOD::CaloCluster_v1>>>>": VectorVectorElementLink,
"std::vector<std::vector<ElementLink<DataVector<xAOD::Vertex_v1>>>>": VectorVectorElementLink,
"std::vector<std::vector<ElementLink<DataVector<xAOD::TruthParticle_v1>>>>": VectorVectorElementLink,
"std::vector<std::vector<ElementLink<DataVector<xAOD::TrackParticle_v1>>>>": VectorVectorElementLink,
"std::vector<std::vector<ElementLink<DataVector<xAOD::TauTrack_v1>>>>": VectorVectorElementLink,
"std::vector<std::vector<ElementLink<DataVector<xAOD::IParticle>>>>": VectorVectorElementLink,
"std::vector<std::vector<ElementLink<DataVector<xAOD::NeutralParticle_v1>>>>": VectorVectorElementLink,
}


def known_forth_of(model):
    """
    Look up the hand-written Forth code registered for a model's type.

    Args:
        model: Any object. Only its ``typename`` attribute is consulted, and
            only if it has one.

    Returns:
        An instance of the class registered in ``KNOWN_FORTH_DICT`` for
        ``model.typename``, constructed with that typename as its only
        argument, or ``None`` if the model has no ``typename`` or the
        typename is not registered.
    """
    # Read the attribute once; a missing attribute and a ``None`` typename
    # both mean "nothing known" (``None`` is never a registered key).
    typename = getattr(model, "typename", None)
    if typename is None:
        return None

    # One dictionary lookup instead of a membership test followed by indexing.
    known_forth_cls = KNOWN_FORTH_DICT.get(typename)
    if known_forth_cls is None:
        return None

    return known_forth_cls(typename)
5 changes: 5 additions & 0 deletions src/uproot/interpretation/known_forth/atlas/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE

"""
This module defines ATLAS-specific known forth code.
"""
82 changes: 82 additions & 0 deletions src/uproot/interpretation/known_forth/atlas/element_link.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/uproot5/blob/main/LICENSE

"""
This module defines known forth code for some ElementLink data types in ATLAS (D)AODs
"""

from __future__ import annotations

import re


class VectorVectorElementLink:
    """
    Known Forth code and Awkward form for doubly nested vectors of
    ``ElementLink`` objects (``std::vector<std::vector<ElementLink<...>>>``).

    Args:
        typename (str): Full C++ typename of the branch. The type nested
            inside the two ``std::vector`` layers is extracted from it and
            stored as ``inner_typename``; a typename that does not contain
            the nested-vector pattern is stored unchanged.

    Attributes:
        forth_code (str): AwkwardForth source shared by all instances. It
            declares the inputs ``stream``, ``byteoffsets`` and ``bytestops``
            and four outputs: ``node1-offsets`` and ``node2-offsets`` (int64)
            plus ``node3-data`` and ``node4-data`` (uint32). Both offsets
            outputs start with a ``0``. For every entry the code seeks to the
            entry's byte offset, skips 6 bytes, reads the outer vector length
            and accumulates it into ``node1-offsets``; for every inner vector
            it reads the length and accumulates it into ``node2-offsets``;
            for every element it skips 20 bytes and reads two uint32 values
            into ``node3-data`` and ``node4-data``.
            NOTE(review): the skipped 6 and 20 bytes are presumably
            serialization headers -- confirm against the ROOT streamer
            layout. The outermost ``0 do`` takes its upper bound from the
            stack, so the caller presumably pushes the number of entries
            before running this code -- TODO confirm.
        typename (str): The typename passed to the constructor.
        inner_typename (str): The typename with the two enclosing
            ``std::vector`` layers removed.
    """

    # Captures whatever is nested inside two ``std::vector`` layers. Compiled
    # once for the class instead of on every construction.
    _NESTED_VECTOR_PATTERN = re.compile(r"std::vector<std::vector<(.*)>>")

    forth_code = """
input stream
input byteoffsets
input bytestops
output node1-offsets int64
output node2-offsets int64
output node3-data uint32
output node4-data uint32

0 node1-offsets <- stack
0 node2-offsets <- stack

0 do
byteoffsets I-> stack
stream seek
6 stream skip
stream !I-> stack
dup node1-offsets +<- stack
0 do
stream !I-> stack
dup node2-offsets +<- stack
0 do
20 stream skip
stream !I-> node3-data
stream !I-> node4-data
loop
loop
loop
"""

    def __init__(self, typename):
        self.typename = typename
        # Replace the nested-vector wrapper by its captured content.
        self.inner_typename = self._NESTED_VECTOR_PATTERN.sub(r"\1", self.typename)

    @staticmethod
    def _uint32_leaf(form_key):
        """Return the form dict of a flat uint32 ``NumpyArray`` with the given key."""
        return {
            "class": "NumpyArray",
            "primitive": "uint32",
            "inner_shape": [],
            "parameters": {},
            "form_key": form_key,
        }

    @property
    def awkward_form(self):
        """
        The Awkward form, as a Python dict, matching the outputs of
        ``forth_code``.

        Two nested ``ListOffsetArray`` layers (form keys ``node1`` and
        ``node2``) wrap a ``RecordArray`` with the uint32 fields
        ``m_persKey`` (``node3``) and ``m_persIndex`` (``node4``). The record
        is named after ``inner_typename``. A new dict is built on every
        access.
        """
        record = {
            "class": "RecordArray",
            "fields": ["m_persKey", "m_persIndex"],
            "contents": [
                self._uint32_leaf("node3"),
                self._uint32_leaf("node4"),
            ],
            # ``inner_typename`` is already a str, so no f-string is needed.
            "parameters": {"__record__": self.inner_typename},
        }
        inner_lists = {
            "class": "ListOffsetArray",
            "offsets": "i64",
            "form_key": "node2",
            "content": record,
        }
        return {
            "class": "ListOffsetArray",
            "offsets": "i64",
            "form_key": "node1",
            "content": inner_lists,
        }
19 changes: 17 additions & 2 deletions src/uproot/interpretation/objects.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

import uproot
import uproot._awkwardforth
from uproot.interpretation.known_forth import known_forth_of


class AsObjects(uproot.interpretation.Interpretation):
Expand All @@ -45,14 +46,21 @@ class AsObjects(uproot.interpretation.Interpretation):
:ref:`uproot.interpretation.objects.AsObjects.simplify` attempts to
replace this interpretation with a faster-to-read equivalent, but not all
data types can be simplified.

# TODO: known_forth can define forth code and forms for special cases that will be picked up here as well
"""

def __init__(self, model, branch=None):
    self._model = model
    self._branch = branch
    self._forth = True

    # Defaults: no form and no complete Forth code known yet.
    self._form = None
    self._complete_forth_code = None

    # If hand-written Forth code is registered for this model's type, start
    # with its code and its form instead of the empty defaults.
    known_forth = known_forth_of(self._model)
    if known_forth is not None:
        self._complete_forth_code = known_forth.forth_code
        self._form = known_forth.awkward_form

    self._forth_lock = threading.Lock()

@property
Expand Down Expand Up @@ -122,6 +130,13 @@ def awkward_form(
tobject_header=False,
breadcrumbs=(),
):
awkward = uproot.extras.awkward()
if self._form is not None: # TODO: is this really fine?
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jpivarski I'm a bit uncomfortable with the piece of code I added here. The _form attribute already existed before, but was never directly returned here. The tests seem to pass, but do you know of any potential problems that could be caused by directly returning _form when it is not None?

One thing that I had to do (and also don't feel quite comfortable with) was to convert the form from a dict representation, which it seems to be in sometimes (but I'm a bit unsure where, and why not always).

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm still trying to understand your first paragraph, but about the second paragraph: there are three ways that a Form can be represented,

  • as an ak.forms.Form object, which is immutable and therefore needs to be entirely known or at least built from the leaves up to the root
  • as a Python dict (JSON represented as dicts and lists), which isn't type-safe, but it can be modified in place
  • as a JSON string, which doesn't even ensure that pairs of square brackets are closed.

Thus, they have decreasing levels of safety and we prefer the safer ones when possible. The problem is that the process of discovering the type involves starting at the root and walking down toward the leaves, filling in a ListOffsetArray's content if we see a non-empty example of it. So the ideal, ak.forms.Form, isn't possible and the Form starts its life as a Python dict. When it gets converted into an object, that's final. The Forth is fully discovered at that point.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have now realized that the self._form attribute will always be in the Python dict form (whether it comes from generated or known forth code). So at the place where it is returned in AsObjects.awkward_form(...), it has to be converted to the final ak.forms.Form object. In principle, the form from the known forth path could have been provided directly as an ak.forms.Form, but to be consistent with the one created by the ForthGenerator it is now also a dict.

if isinstance(self._form, dict):
# TODO don't know when and why the form sometimes is a dict
# (and if it causes problems to convert it here)
self._form = awkward.forms.from_dict(self._form)
return self._form
context = self._make_context(
context, index_format, header, tobject_header, breadcrumbs
)
Expand Down
48 changes: 48 additions & 0 deletions tests/test_1282_add_known_forth_for_atlas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/env python3

import awkward
import pytest
import skhep_testdata
import uproot

# Branch names that the tests below look up in the "CollectionTree" tree of
# the "uproot-issue-123a.root" test file. The first test asserts that each of
# them gets known Forth code and a form.
VECTOR_VECTOR_ELEMENTLINK_BRANCHES = [
"AnalysisHLT_e12_lhloose_nod0_2mu10AuxDyn.TrigMatchedObjects",
"AnalysisElectronsAuxDyn.caloClusterLinks",
"AnalysisPhotonsAuxDyn.vertexLinks",
"TruthMuonsAuxDyn.childLinks",
"AnalysisElectronsAuxDyn.trackParticleLinks",
"PrimaryVerticesAuxDyn.neutralParticleLinks",
"AnalysisTauJetsAuxDyn.tauTrackLinks",
]


@pytest.mark.parametrize("key", VECTOR_VECTOR_ELEMENTLINK_BRANCHES)
def test_pickup_vector_vector_elementlink(key):
    """The interpretation of each listed branch has Forth code and a form set."""
    root_file = skhep_testdata.data_path("uproot-issue-123a.root")
    with uproot.open({root_file: "CollectionTree"}) as tree:
        interpretation = tree[key].interpretation
        assert interpretation._complete_forth_code is not None
        assert interpretation._form is not None


def test_consistent_library_np_vector_vector_elementlink():
    """
    Reading each listed branch with ``library="np"`` and converting the result
    to Awkward gives the same field values as reading it with the default
    library.
    """
    root_file = skhep_testdata.data_path("uproot-issue-123a.root")

    # The file is opened separately for each library, as two independent reads.
    with uproot.open({root_file: "CollectionTree"}) as tree:
        numpy_reads = {
            key: tree[key].array(library="np")
            for key in VECTOR_VECTOR_ELEMENTLINK_BRANCHES
        }

    with uproot.open({root_file: "CollectionTree"}) as tree:
        awkward_reads = {
            key: tree[key].array() for key in VECTOR_VECTOR_ELEMENTLINK_BRANCHES
        }

    for key, from_numpy in numpy_reads.items():
        from_awkward = awkward_reads[key]
        # Convert the NumPy object array using the form of the Awkward result.
        converted = uproot.interpretation.library._object_to_awkward_array(
            awkward, from_awkward.layout.form.to_dict(), from_numpy
        )
        for field in from_awkward.fields:
            assert awkward.all(converted[field] == from_awkward[field])
Loading