From 8757eda2b3bc272649ca261414a590e926138523 Mon Sep 17 00:00:00 2001
From: Paco Nathan
Date: Sun, 2 Oct 2022 15:48:37 -0700
Subject: [PATCH] proposed standard; rename cli.py

---
 README.md            | 80 +++++++++++++++++++++++++++++++++++++-------
 example.py => cli.py |  0
 pynock/pynock.py     | 15 ++++++---
 setup.py             | 17 +++++++---
 tiny.py              |  5 +++
 5 files changed, 96 insertions(+), 21 deletions(-)
 rename example.py => cli.py (100%)

diff --git a/README.md b/README.md
index 2c77d4b..8f7b5a7 100644
--- a/README.md
+++ b/README.md
@@ -1,22 +1,33 @@
 # pynock
 
+The following describes `NOCK`, a proposed standard for a Parquet
+format that supports efficient distributed serialization of multiple
+kinds of graph technologies.
+
 This library `pynock` provides Examples for working with low-level
 Parquet read/write efficiently in Python.
 
-Our intent is to serialize graphs which align the data representations
-required for multiple areas of popular graph technologies:
+Our intent is to serialize graphs in a way that aligns the data
+representations required for popular graph technologies and related
+data sources:
 
-  * semantic graphs (e.g., W3C)
+  * semantic graphs (e.g., the W3C formats: RDF, TTL, JSON-LD, etc.)
   * labeled property graphs (e.g., openCypher)
   * probabilistic graphs (e.g., PSL)
-  * edge lists (e.g., NetworkX)
+  * spreadsheet import/export (e.g., CSV)
+  * dataframes (e.g., Pandas, Dask, Spark, etc.)
+  * edge lists (e.g., NetworkX, cuGraph, etc.)
 
-This approach also supports distributed partitions based on Parquet
-which can scale to very large (+1 T node) graphs.
+This approach also supports efficient distributed partitions based on
+Parquet, which can scale on a cluster to very large (1T+ node) graphs.
 
-For details about the formatting required in Parquet files, see the
+For details about the proposed format in Parquet files, see the
 [`FORMAT.md`](https://github.com/DerwenAI/pynock/blob/main/FORMAT.md)
-page.
+file.
+
+If you have questions, suggestions, or bug reports, please open
+[an issue](https://github.com/DerwenAI/pynock/issues)
+on our public GitHub repo.
 
 
 ## Caveats
@@ -37,7 +48,9 @@
 no guarantees regarding correct behaviors on other versions.
 
 The Parquet file formats depend on Arrow 5.0.x or later.
 
-For the Python dependencies, see the `requirements.txt` file.
+For the Python dependencies, the library version info is listed in the
+[`requirements.txt`](https://github.com/DerwenAI/pynock/blob/main/requirements.txt)
+file.
 
 ## Set up
@@ -63,17 +76,17 @@
 python3 -m pip install -r requirements.txt
 ```
 
 To run examples from CLI:
 
 ```
-python3 example.py load-parq --file dat/recipes.parq --debug
+python3 cli.py load-parq --file dat/recipes.parq --debug
 ```
 
 ```
-python3 example.py load-rdf --file dat/tiny.ttl --save-cvs foo.cvs
+python3 cli.py load-rdf --file dat/tiny.ttl --save-csv foo.csv
 ```
 
 For further information:
 
 ```
-python3 example.py --help
+python3 cli.py --help
 ```
 
 ## Usage programmatically in Python
@@ -100,8 +113,31 @@ _Towards Data Science_ (2020-06-25)
 
 A `nock` is the English word for the end of an arrow opposite its point.
 
+If you must have an acronym, the proposed standard `NOCK` stands for
+**N**etwork **O**bjects for **C**onsistent **K**nowledge.
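+
+As a quick aside, here is a minimal sketch of the `Partition.to_df()`
+accessor this patch adds (see `pynock/pynock.py` below); the
+`Partition(part_id=0)` constructor call is an assumption here (see
+`tiny.py` for a full programmatic build), and an empty partition
+simply yields an empty dataframe:
+
+```
+from pynock import Partition
+
+# assumed constructor; in practice, build up nodes/edges first,
+# as `tiny.py` does
+part = Partition(part_id=0)
+
+# one dataframe row per record, generated by iter_gen_rows()
+df = part.to_df()
+print(df.head())
+```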
+
+Also, the library name had minimal namespace collisions on GitHub and
+PyPI :)
+
+
+## Developer updates
+
+To set up the build environment locally, also run:
+```
+python3 -m pip install -U pip setuptools wheel
+python3 -m pip install -r requirements-dev.txt
+```
+
+Note that we require the use of [`pre-commit` hooks](https://pre-commit.com/);
+to configure them locally, run:
+
+```
+pre-commit install
+git config --local core.hooksPath .git/hooks/
+```
+
 
-## Package Release
+## Package releases
 
 First, verify that `setup.py` will run correctly for the package
 release process:
@@ -111,3 +147,21 @@ python3 -m pip install -e .
 python3 -m pytest tests/
 python3 -m pip uninstall pynock
 ```
+
+Next, update the semantic version number in `setup.py`, create a
+release on GitHub, and make sure to update the local repo:
+
+```
+git stash
+git checkout main
+git pull
+```
+
+Make sure that you have set up your 2FA authentication for generating
+an API token on PyPI.
+
+Then run our PyPI push script:
+
+```
+./bin/push_pypi.sh
+```
diff --git a/example.py b/cli.py
similarity index 100%
rename from example.py
rename to cli.py
diff --git a/pynock/pynock.py b/pynock/pynock.py
index 93b6326..b5b7317 100644
--- a/pynock/pynock.py
+++ b/pynock/pynock.py
@@ -454,6 +454,15 @@ def iter_gen_rows (
 
         edge_id += 1
 
+    def to_df (
+        self,
+        ) -> pd.DataFrame:
+        """
+Represent the partition as a DataFrame.
+        """
+        return pd.DataFrame([row for row in self.iter_gen_rows()])
+
+
     def save_file_parquet (
         self,
         save_parq: cloudpathlib.AnyPath,
@@ -463,8 +472,7 @@
         """
 Save a partition to a Parquet file.
         """
-        df = pd.DataFrame([row for row in self.iter_gen_rows()])
-        table = pa.Table.from_pandas(df)
+        table = pa.Table.from_pandas(self.to_df())
         writer = pq.ParquetWriter(save_parq.as_posix(), table.schema)
         writer.write_table(table)
         writer.close()
@@ -479,8 +487,7 @@ def save_file_csv (
         """
 Save a partition to a CSV file.
         """
-        df = pd.DataFrame([row for row in self.iter_gen_rows()])
-        df.to_csv(save_csv.as_posix(), index=False)
+        self.to_df().to_csv(save_csv.as_posix(), index=False)
 
 
     def save_file_rdf (
diff --git a/setup.py b/setup.py
index 77cc534..6ddffb7 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
 """
-Package set up, used for CI testing.
+Package set up.
 """
 
 import pathlib
@@ -9,13 +9,21 @@
 
 DESCRIP = """
-Examples for low-level Parquet read/write in Python
+A proposed standard `NOCK` for a Parquet format that supports efficient
+distributed serialization of multiple kinds of graph technologies.
""".strip() KEYWORDS = [ + "CSV", + "Parquet", + "RDF", + "dataframe", + "graph data science", "knowledge graph", - "parquet", + "openCypher", "serialization", + "spreadsheet", + "open standard", ] @@ -40,12 +48,13 @@ def parse_requirements_file (filename: str) -> typing.List[ str ]: if __name__ == "__main__": setuptools.setup( name = "pynock", - version = "1.0.0", + version = "1.0.1", license = "MIT", python_requires = ">=3.8", install_requires = parse_requirements_file("requirements.txt"), packages = setuptools.find_packages(exclude=[ + "bin", "dat", "tests", "venv", diff --git a/tiny.py b/tiny.py index 16670fd..ff0c1c4 100644 --- a/tiny.py +++ b/tiny.py @@ -6,6 +6,7 @@ programmatically, based on the graph described in `dat/tiny.rdf` """ +from icecream import ic import cloudpathlib from pynock import Partition, Node, Edge @@ -124,3 +125,7 @@ part.save_file_rdf(cloudpathlib.AnyPath("foo.rdf"), "ttl") # check the files "foo.*" to see what was constructed programmatically + # also, here's a dataframe representation + df = part.to_df() + ic(df.head()) +