Skip to content

Commit

Permalink
add programmatic usage example
Browse files Browse the repository at this point in the history
  • Loading branch information
ceteri committed Oct 2, 2022
1 parent 888211f commit eae20d1
Show file tree
Hide file tree
Showing 4 changed files with 216 additions and 48 deletions.
48 changes: 31 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,13 @@ The Parquet file formats depend on Arrow 5.0.x or later.
For the Python dependencies, see the `requirements.txt` file.


## Getting started
## Set up

To install via `pip`:

```
python3 -m pip install -U pynock
```

To set up this library locally:

Expand All @@ -52,7 +58,9 @@ python3 -m pip install -U pip wheel
python3 -m pip install -r requirements.txt
```

Then to run examples from CLI:
## Usage via CLI

To run examples from CLI:

```
python3 example.py load-parq --file dat/recipes.parq --debug
Expand All @@ -68,22 +76,11 @@ For further information:
python3 example.py --help
```

## Programmatic usage in Python

## Package Release

First, verify that `setup.py` will run correctly for the package
release process:

```
python3 -m pip install -e .
python3 -m pytest tests/
python3 -m pip uninstall pynock
```


## Why the name?

A `nock` is the English word for the end of an arrow opposite its point.
To construct a partition file programmatically, see the sample code
[`tiny.py`](https://github.com/DerwenAI/pynock/blob/main/tiny.py)
which builds the minimal recipe example as an RDF graph.


## Background
Expand All @@ -97,3 +94,20 @@ For more details about using Arrow and Parquet see:
["Apache Arrow: Read DataFrame With Zero Memory"](https://towardsdatascience.com/apache-arrow-read-dataframe-with-zero-memory-69634092b1a)
Dejan Simic
_Towards Data Science_ (2020-06-25)


## Why the name?

A `nock` is the English word for the end of an arrow opposite its point.


## Package Release

First, verify that `setup.py` will run correctly for the package
release process:

```
python3 -m pip install -e .
python3 -m pytest tests/
python3 -m pip uninstall pynock
```
6 changes: 3 additions & 3 deletions dat/tiny.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ https://www.food.com/recipe/327593,0,http://purl.org/heals/food/uses_ingredient,
https://www.food.com/recipe/327593,1,http://purl.org/heals/food/uses_ingredient,http://purl.org/heals/ingredient/CowMilk,1.0,-1,True,,null
https://www.food.com/recipe/327593,2,http://purl.org/heals/food/uses_ingredient,http://purl.org/heals/ingredient/WholeWheatFlour,1.0,-1,True,,null
https://www.food.com/recipe/327593,3,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://purl.org/heals/food/Recipe,1.0,-1,True,,null
http://purl.org/heals/ingredient/ChickenEgg,-1,,,1.0,-1,True,Ingredient,"null"
http://purl.org/heals/ingredient/CowMilk,-1,,,1.0,-1,True,Ingredient,"null"
http://purl.org/heals/ingredient/ChickenEgg,-1,,,1.0,-1,True,Ingredient,null
http://purl.org/heals/ingredient/CowMilk,-1,,,1.0,-1,True,Ingredient,null
http://purl.org/heals/ingredient/WholeWheatFlour,-1,,,1.0,-1,True,Ingredient,"{""vegan"":true}"
http://purl.org/heals/food/Recipe,-1,,,1.0,-1,True,top-level,"null"
http://purl.org/heals/food/Recipe,-1,,,1.0,-1,True,top-level,null
84 changes: 56 additions & 28 deletions pynock/pynock.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,21 @@
NOT_FOUND: int = -1


######################################################################
## edges

class Edge (BaseModel): # pylint: disable=R0903
    """
Representing an edge (arc) in the graph.

An edge is stored on its source node, so it only records the relation,
the destination node, and its own attributes.
    """
    # default value for `rel` when no relation has been assigned
    BLANK_RELATION: typing.ClassVar[int] = 0

    # integer index of the relation; callers obtain it from
    # `get_edge_rel()` and nodes group their edges by this value
    rel: int = BLANK_RELATION
    # `node_id` of the destination node; `NOT_FOUND` until assigned
    node_id: int = NOT_FOUND
    # NOTE(review): looks like a confidence/weight for the edge,
    # defaulting to 1.0 -- confirm the intended range
    truth: float = 1.0
    # properties attached to this edge
    # NOTE(review): mutable default; presumably copied per instance by
    # the `BaseModel` base class -- confirm
    prop_map: PropMap = {}


######################################################################
## nodes

Expand All @@ -44,26 +59,26 @@ class Node (BaseModel): # pylint: disable=R0903
node_id: int = NOT_FOUND
name: str = ""
shadow: int = BASED_LOCAL
is_rdf: bool = True
is_rdf: bool = False
label_set: typing.Set[str] = set()
truth: float = 1.0
prop_map: PropMap = {}
edge_map: typing.Dict[int, list] = {}


######################################################################
## edges

class Edge (BaseModel): # pylint: disable=R0903
"""
Representing an edge (arc) in the graph.
"""
BLANK_RELATION: typing.ClassVar[int] = 0
def add_edge (
self,
edge: Edge,
*,
debug: bool = False, # pylint: disable=W0613
) -> None:
"""
Add the given edge to its src node.
"""
if edge.rel not in self.edge_map:
self.edge_map[edge.rel] = []

rel: int = BLANK_RELATION
node_id: int = NOT_FOUND
truth: float = 1.0
prop_map: PropMap = {}
self.edge_map[edge.rel].append(edge)


######################################################################
Expand Down Expand Up @@ -152,12 +167,24 @@ def save_props (

if len(prop_map) > 0:
props = json.dumps(prop_map)
props = props.replace("\": \"", "\":\"")
props = props.replace("\", \"", "\",\"")
props = props.replace("\": ", "\":")
props = props.replace(", \"", ",\"")

return props


def add_node (
    self,
    node: Node,
    *,
    debug: bool = False,  # pylint: disable=W0613
    ) -> None:
    """
Register the given node in this partition, keyed by its `node_id`.

A node previously stored under the same `node_id` is replaced.
The `debug` flag is accepted but not used here.
    """
    node_key = node.node_id
    self.nodes[node_key] = node


def populate_node (
self,
row: GraphRow,
Expand All @@ -169,6 +196,7 @@ def populate_node (
"""
# create a src node
node: Node = Node(
node_id = self.create_node_name(row["src_name"]),
name = row["src_name"],
truth = row["truth"],
is_rdf = row["is_rdf"],
Expand All @@ -177,10 +205,7 @@ def populate_node (
prop_map = self.load_props(row["props"]),
)

node.node_id = self.create_node_name(node.name)

# add this node to the global list
self.nodes[node.node_id] = node
self.add_node(node)

return node

Expand All @@ -196,7 +221,10 @@ def get_edge_rel (
Lookup the integer index for the named edge relation.
"""
if rel_name not in self.edge_rels:
self.edge_rels.append(rel_name)
if create:
self.edge_rels.append(rel_name)
else:
return NOT_FOUND

return self.edge_rels.index(rel_name)

Expand All @@ -223,7 +251,7 @@ def populate_edge (
is_rdf = row["is_rdf"],
)

self.nodes[dst_node.node_id] = dst_node
self.add_node(dst_node)

# create the edge
edge: Edge = Edge(
Expand All @@ -233,11 +261,7 @@ def populate_edge (
prop_map = self.load_props(row["props"]),
)

# add this edge to its src node
if edge.rel not in node.edge_map:
node.edge_map[edge.rel] = []

node.edge_map[edge.rel].append(edge)
node.add_edge(edge)

return edge

Expand Down Expand Up @@ -290,7 +314,7 @@ def iter_load_csv (
row_num: int = 0

with open(csv_path) as fp:
reader = csv.reader(fp, delimiter=",")
reader = csv.reader(fp, delimiter=",", quotechar='"')
header = next(reader)

for row_val in reader:
Expand Down Expand Up @@ -411,8 +435,10 @@ def iter_gen_rows (
"props": self.save_props(node.prop_map),
}

edge_id: int = 0

for _, edge_list in node.edge_map.items():
for edge_id, edge in enumerate(edge_list):
for edge in edge_list:
yield {
"src_name": node.name,
"edge_id": edge_id,
Expand All @@ -425,6 +451,8 @@ def iter_gen_rows (
"props": self.save_props(edge.prop_map),
}

edge_id += 1


def save_file_parquet (
self,
Expand Down
126 changes: 126 additions & 0 deletions tiny.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
A minimal example using `pynock` to construct a partition
programmatically, based on the graph described in `dat/tiny.rdf`
"""

import cloudpathlib

from pynock import Partition, Node, Edge


if __name__ == "__main__":
    # names of the two relations used by the edges in this example
    USES_INGREDIENT = "http://purl.org/heals/food/uses_ingredient"
    RDF_TYPE = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"

    # initialize a partition
    part: Partition = Partition(part_id=0)

    # define the src node for the recipe; every edge below starts here
    # NB: this node has properties, which RDF cannot query
    src_name: str = "https://www.food.com/recipe/327593"

    src_node: Node = Node(
        node_id=part.create_node_name(src_name),
        name=src_name,
        is_rdf=True,
        label_set={"Recipe"},
        prop_map={
            "minutes": 8,
            "name": "anytime crepes",
        },
    )

    part.add_node(src_node)

    # one entry per dst node, in the order the nodes and edges are added:
    #   (relation for the src => dst edge, dst node name, extra node fields)
    targets = [
        # the "Egg" ingredient
        (
            USES_INGREDIENT,
            "http://purl.org/heals/ingredient/ChickenEgg",
            {"label_set": {"Ingredient"}},
        ),
        # the "Milk" ingredient
        (
            USES_INGREDIENT,
            "http://purl.org/heals/ingredient/CowMilk",
            {"label_set": {"Ingredient"}},
        ),
        # the "Flour" ingredient
        # NB: this node has properties, which RDF cannot query
        (
            USES_INGREDIENT,
            "http://purl.org/heals/ingredient/WholeWheatFlour",
            {"label_set": {"Ingredient"}, "prop_map": {"vegan": True}},
        ),
        # the "wtm:Recipe" parent, linked by inheritance
        (
            RDF_TYPE,
            "http://purl.org/heals/food/Recipe",
            {"label_set": {"top-level"}},
        ),
    ]

    for rel_name, dst_name, extra_fields in targets:
        # define the dst node and add it to the partition
        dst_node = Node(
            node_id=part.create_node_name(dst_name),
            name=dst_name,
            is_rdf=True,
            **extra_fields,
        )

        part.add_node(dst_node)

        # define an edge connecting src => dst, creating the relation
        # on first use
        edge = Edge(
            rel=part.get_edge_rel(rel_name, create=True),
            node_id=dst_node.node_id,
        )

        src_node.add_edge(edge)

    # serialize this partition to multiple formats
    part.save_file_parquet(cloudpathlib.AnyPath("foo.parq"))
    part.save_file_csv(cloudpathlib.AnyPath("foo.csv"))
    part.save_file_rdf(cloudpathlib.AnyPath("foo.rdf"), "ttl")

    # check the files "foo.*" to see what was constructed programmatically

0 comments on commit eae20d1

Please sign in to comment.