From 8757eda2b3bc272649ca261414a590e926138523 Mon Sep 17 00:00:00 2001
From: Paco Nathan
Date: Sun, 2 Oct 2022 15:48:37 -0700
Subject: [PATCH] proposed standard; rename cli.py

---
 README.md            | 80 +++++++++++++++++++++++++++++++++++++-------
 example.py => cli.py |  0
 pynock/pynock.py     | 15 ++++++---
 setup.py             | 17 +++++++---
 tiny.py              |  5 +++
 5 files changed, 96 insertions(+), 21 deletions(-)
 rename example.py => cli.py (100%)

diff --git a/README.md b/README.md
index 2c77d4b..8f7b5a7 100644
--- a/README.md
+++ b/README.md
@@ -1,22 +1,33 @@
 # pynock
 
+The following describes `NOCK`, a proposed standard for a Parquet
+format that supports efficient distributed serialization of multiple
+kinds of graph technologies.
+
 This library `pynock` provides Examples for working with low-level
 Parquet read/write efficiently in Python.
 
-Our intent is to serialize graphs which align the data representations
-required for multiple areas of popular graph technologies:
+Our intent is to serialize graphs in a way that aligns the data
+representations required for popular graph technologies and related
+data sources:
 
-  * semantic graphs (e.g., W3C)
+  * semantic graphs (e.g., the W3C formats: RDF, TTL, JSON-LD, etc.)
   * labeled property graphs (e.g., openCypher)
   * probabilistic graphs (e.g., PSL)
-  * edge lists (e.g., NetworkX)
+  * spreadsheet import/export (e.g., CSV)
+  * dataframes (e.g., Pandas, Dask, Spark, etc.)
+  * edge lists (e.g., NetworkX, cuGraph, etc.)
 
-This approach also supports distributed partitions based on Parquet
-which can scale to very large (+1 T node) graphs.
+This approach also supports efficient distributed partitions based on
+Parquet, which can scale on a cluster to very large (1T+ node) graphs.
 
-For details about the formatting required in Parquet files, see the
+For details about the proposed format in Parquet files, see the
 [`FORMAT.md`](https://github.com/DerwenAI/pynock/blob/main/FORMAT.md)
-page.
+file.
+
+If you have questions, suggestions, or bug reports, please open
+[an issue](https://github.com/DerwenAI/pynock/issues)
+on our public GitHub repo.
 
 
 ## Caveats
@@ -37,7 +48,9 @@
 no guarantees regarding correct behaviors on other versions.
 
 The Parquet file formats depend on Arrow 5.0.x or later.
 
-For the Python dependencies, see the `requirements.txt` file.
+For the Python dependencies, the library version info is listed in the
+[`requirements.txt`](https://github.com/DerwenAI/pynock/blob/main/requirements.txt)
+file.
 
 ## Set up
@@ -63,17 +76,17 @@
 python3 -m pip install -r requirements.txt
 ```
 
 To run examples from CLI:
 
 ```
-python3 example.py load-parq --file dat/recipes.parq --debug
+python3 cli.py load-parq --file dat/recipes.parq --debug
 ```
 
 ```
-python3 example.py load-rdf --file dat/tiny.ttl --save-cvs foo.cvs
+python3 cli.py load-rdf --file dat/tiny.ttl --save-csv foo.csv
 ```
 
 For further information:
 
 ```
-python3 example.py --help
+python3 cli.py --help
 ```
 
 ## Usage programmatically in Python
@@ -100,8 +113,31 @@ _Towards Data Science_ (2020-06-25)
 
 A `nock` is the English word for the end of an arrow opposite its point.
 
+If you must have an acronym, the proposed standard `NOCK` stands for
+**N**etwork **O**bjects for **C**onsistent **K**nowledge.
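+
+As a quick aside, here is a minimal sketch of the `Partition.to_df()`
+accessor this patch adds (see `pynock/pynock.py` below); the
+`Partition(part_id=0)` constructor call is an assumption here (see
+`tiny.py` for a full programmatic build), and an empty partition
+simply yields an empty dataframe:
+
+```
+from pynock import Partition
+
+# assumed constructor; in practice, build up nodes/edges first,
+# as `tiny.py` does
+part = Partition(part_id=0)
+
+# one dataframe row per record, generated by iter_gen_rows()
+df = part.to_df()
+print(df.head())
+```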
+
+Also, the library name had minimal namespace collisions on GitHub and
+PyPI :)
+
+
+## Developer updates
+
+To set up the build environment locally, also run:
+```
+python3 -m pip install -U pip setuptools wheel
+python3 -m pip install -r requirements-dev.txt
+```
+
+Note that we require the use of [`pre-commit` hooks](https://pre-commit.com/);
+to configure them locally, run:
+
+```
+pre-commit install
+git config --local core.hooksPath .git/hooks/
+```
+
 
-## Package Release
+## Package releases
 
 First, verify that `setup.py` will run correctly for the package
 release process:
@@ -111,3 +147,21 @@ python3 -m pip install -e .
 python3 -m pytest tests/
 python3 -m pip uninstall pynock
 ```
+
+Next, update the semantic version number in `setup.py`, create a
+release on GitHub, and make sure to update the local repo:
+
+```
+git stash
+git checkout main
+git pull
+```
+
+Make sure that you have set up your 2FA authentication for generating
+an API token on PyPI.
+
+Then run our PyPI push script:
+
+```
+./bin/push_pypi.sh
+```
diff --git a/example.py b/cli.py
similarity index 100%
rename from example.py
rename to cli.py
diff --git a/pynock/pynock.py b/pynock/pynock.py
index 93b6326..b5b7317 100644
--- a/pynock/pynock.py
+++ b/pynock/pynock.py
@@ -454,6 +454,15 @@ def iter_gen_rows (
 
         edge_id += 1
 
+    def to_df (
+        self,
+        ) -> pd.DataFrame:
+        """
+Represent the partition as a DataFrame.
+        """
+        return pd.DataFrame([row for row in self.iter_gen_rows()])
+
+
     def save_file_parquet (
         self,
         save_parq: cloudpathlib.AnyPath,
@@ -463,8 +472,7 @@
         """
 Save a partition to a Parquet file.
         """
-        df = pd.DataFrame([row for row in self.iter_gen_rows()])
-        table = pa.Table.from_pandas(df)
+        table = pa.Table.from_pandas(self.to_df())
         writer = pq.ParquetWriter(save_parq.as_posix(), table.schema)
         writer.write_table(table)
         writer.close()
@@ -479,8 +487,7 @@ def save_file_csv (
         """
 Save a partition to a CSV file.
         """
-        df = pd.DataFrame([row for row in self.iter_gen_rows()])
-        df.to_csv(save_csv.as_posix(), index=False)
+        self.to_df().to_csv(save_csv.as_posix(), index=False)
 
 
     def save_file_rdf (
diff --git a/setup.py b/setup.py
index 77cc534..6ddffb7 100644
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,5 @@
 """
-Package set up, used for CI testing.
+Package set up.
 """
 
 import pathlib
@@ -9,13 +9,21 @@
 
 DESCRIP = """
-Examples for low-level Parquet read/write in Python
+A proposed standard `NOCK` for a Parquet format that supports efficient
+distributed serialization of multiple kinds of graph technologies.
""".strip() KEYWORDS = [ + "CSV", + "Parquet", + "RDF", + "dataframe", + "graph data science", "knowledge graph", - "parquet", + "openCypher", "serialization", + "spreadsheet", + "open standard", ] @@ -40,12 +48,13 @@ def parse_requirements_file (filename: str) -> typing.List[ str ]: if __name__ == "__main__": setuptools.setup( name = "pynock", - version = "1.0.0", + version = "1.0.1", license = "MIT", python_requires = ">=3.8", install_requires = parse_requirements_file("requirements.txt"), packages = setuptools.find_packages(exclude=[ + "bin", "dat", "tests", "venv", diff --git a/tiny.py b/tiny.py index 16670fd..ff0c1c4 100644 --- a/tiny.py +++ b/tiny.py @@ -6,6 +6,7 @@ programmatically, based on the graph described in `dat/tiny.rdf` """ +from icecream import ic import cloudpathlib from pynock import Partition, Node, Edge @@ -124,3 +125,7 @@ part.save_file_rdf(cloudpathlib.AnyPath("foo.rdf"), "ttl") # check the files "foo.*" to see what was constructed programmatically + # also, here's a dataframe representation + df = part.to_df() + ic(df.head()) +