From 5e001d6509b2c82bb85613cca59256258f581bd3 Mon Sep 17 00:00:00 2001 From: Chris Holmes Date: Wed, 13 Sep 2023 07:44:50 -0700 Subject: [PATCH] removed validator, as discussed in #184 --- validator/.gitignore | 160 ------------------ validator/README.md | 7 - validator/python/README.md | 40 ----- .../python/geoparquet_validator/__init__.py | 92 ---------- .../python/geoparquet_validator/schema.json | 1 - validator/python/setup.py | 28 --- 6 files changed, 328 deletions(-) delete mode 100644 validator/.gitignore delete mode 100644 validator/README.md delete mode 100644 validator/python/README.md delete mode 100755 validator/python/geoparquet_validator/__init__.py delete mode 120000 validator/python/geoparquet_validator/schema.json delete mode 100644 validator/python/setup.py diff --git a/validator/.gitignore b/validator/.gitignore deleted file mode 100644 index 68bc17f..0000000 --- a/validator/.gitignore +++ /dev/null @@ -1,160 +0,0 @@ -# Byte-compiled / optimized / DLL files -__pycache__/ -*.py[cod] -*$py.class - -# C extensions -*.so - -# Distribution / packaging -.Python -build/ -develop-eggs/ -dist/ -downloads/ -eggs/ -.eggs/ -lib/ -lib64/ -parts/ -sdist/ -var/ -wheels/ -share/python-wheels/ -*.egg-info/ -.installed.cfg -*.egg -MANIFEST - -# PyInstaller -# Usually these files are written by a python script from a template -# before PyInstaller builds the exe, so as to inject date/other infos into it. -*.manifest -*.spec - -# Installer logs -pip-log.txt -pip-delete-this-directory.txt - -# Unit test / coverage reports -htmlcov/ -.tox/ -.nox/ -.coverage -.coverage.* -.cache -nosetests.xml -coverage.xml -*.cover -*.py,cover -.hypothesis/ -.pytest_cache/ -cover/ - -# Translations -*.mo -*.pot - -# Django stuff: -*.log -local_settings.py -db.sqlite3 -db.sqlite3-journal - -# Flask stuff: -instance/ -.webassets-cache - -# Scrapy stuff: -.scrapy - -# Sphinx documentation -docs/_build/ - -# PyBuilder -.pybuilder/ -target/ - -# Jupyter Notebook -.ipynb_checkpoints - -# IPython -profile_default/ -ipython_config.py - -# pyenv -# For a library or package, you might want to ignore these files since the code is -# intended to run in multiple environments; otherwise, check them in: -# .python-version - -# pipenv -# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. -# However, in case of collaboration, if having platform-specific dependencies or dependencies -# having no cross-platform support, pipenv may install dependencies that don't work, or not -# install all needed dependencies. -#Pipfile.lock - -# poetry -# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. -# This is especially recommended for binary packages to ensure reproducibility, and is more -# commonly ignored for libraries. -# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control -#poetry.lock - -# pdm -# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. -#pdm.lock -# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it -# in version control. -# https://pdm.fming.dev/#use-with-ide -.pdm.toml - -# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm -__pypackages__/ - -# Celery stuff -celerybeat-schedule -celerybeat.pid - -# SageMath parsed files -*.sage.py - -# Environments -.env -.venv -env/ -venv/ -ENV/ -env.bak/ -venv.bak/ - -# Spyder project settings -.spyderproject -.spyproject - -# Rope project settings -.ropeproject - -# mkdocs documentation -/site - -# mypy -.mypy_cache/ -.dmypy.json -dmypy.json - -# Pyre type checker -.pyre/ - -# pytype static type analyzer -.pytype/ - -# Cython debug symbols -cython_debug/ - -# PyCharm -# JetBrains specific template is maintained in a separate JetBrains.gitignore that can -# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore -# and can be added to the global gitignore or merged into this file. For a more nuclear -# option (not recommended) you can uncomment the following to ignore the entire idea folder. -#.idea/ diff --git a/validator/README.md b/validator/README.md deleted file mode 100644 index 092746d..0000000 --- a/validator/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# GeoParquet validator - -Command-line tools to validate a GeoParquet file. Using [JSON Schema](https://json-schema.org/). - -## Flavors - -- [GeoParquet validator - Python](./python) diff --git a/validator/python/README.md b/validator/python/README.md deleted file mode 100644 index 7a40cc6..0000000 --- a/validator/python/README.md +++ /dev/null @@ -1,40 +0,0 @@ -# GeoParquet validator - Python - -Command-line tool to validate a GeoParquet file. Written in Python. Using [JSON Schema](https://json-schema.org/). - -## Installation - -``` -pip install --no-binary geoparquet_validator . -``` - -**Update** - -``` -pip install --no-binary geoparquet_validator -U . -``` - -**Development** - -``` -pip install -e . -``` - -**Uninstall** - -``` -pip uninstall geoparquet_validator -``` - -## Usage - -``` -geoparquet_validator ../../examples/example.parquet -geoparquet_validator https://storage.googleapis.com/open-geodata/linz-examples/nz-buildings-outlines.parquet -``` - -The validator also supports remote files. - -- `http://` or `https://`: no further configuration is needed. -- `s3://`: `s3fs` needs to be installed (run `pip install .[s3]`) and you may need to set environment variables. Refer [here](https://s3fs.readthedocs.io/en/latest/#credentials) for how to define credentials. -- `gs://`: `gcsfs` needs to be installed (run `pip install .[gcs]`). By default, `gcsfs` will attempt to use your default gcloud credentials or, attempt to get credentials from the google metadata service, or fall back to anonymous access. diff --git a/validator/python/geoparquet_validator/__init__.py b/validator/python/geoparquet_validator/__init__.py deleted file mode 100755 index b19103d..0000000 --- a/validator/python/geoparquet_validator/__init__.py +++ /dev/null @@ -1,92 +0,0 @@ -import json -import click -import pyarrow.parquet as pq - -from pprint import pprint -from urllib.parse import urlparse -from importlib_resources import files -from jsonschema.validators import Draft7Validator -from pyarrow.fs import FSSpecHandler, PyFileSystem -from fsspec import AbstractFileSystem -from fsspec.implementations.http import HTTPFileSystem -from fsspec.implementations.local import LocalFileSystem - - -def choose_fsspec_fs(url_or_path: str) -> AbstractFileSystem: - """Choose fsspec filesystem by sniffing input url""" - parsed = urlparse(url_or_path) - - if parsed.scheme.startswith("http"): - return HTTPFileSystem() - - if parsed.scheme == "s3": - from s3fs import S3FileSystem - - return S3FileSystem() - - if parsed.scheme == "gs": - from gcsfs import GCSFileSystem - - return GCSFileSystem() - - # TODO: Add Azure - return LocalFileSystem() - - -def load_parquet_schema(url_or_path: str) -> pq.ParquetSchema: - """Load schema from local or remote Parquet file""" - fsspec_fs = choose_fsspec_fs(url_or_path) - pyarrow_fs = PyFileSystem(FSSpecHandler(fsspec_fs)) - return pq.read_schema(pyarrow_fs.open_input_file(url_or_path)) - - -def log(text: str, status="info"): - status_color = { - "info": "white", - "warning": "yellow", - "error": "red", - "success": "green"} - click.echo(click.style(text, fg=status_color[status])) - - -@click.command() -@click.argument("input_file") -def main(input_file): - schema_source = files("geoparquet_validator").joinpath("schema.json") - schema = json.loads(schema_source.read_text()) - - parquet_schema = load_parquet_schema(input_file) - - if b"geo" not in parquet_schema.metadata: - log("Parquet file schema does not have 'geo' key", "error") - exit(1) - - metadata = json.loads(parquet_schema.metadata[b"geo"]) - log("Metadata loaded from file:") - pprint(metadata) - - valid = True - log("Validating file...") - - errors = Draft7Validator(schema).iter_errors(metadata) - - for error in errors: - valid = False - log(f" - {error.json_path}: {error.message}", "warning") - if "description" in error.schema: - log(f" \"{error.schema['description']}\"", "warning") - - # Extra errors - if (metadata["primary_column"] not in metadata["columns"]): - valid = False - log("- $.primary_column: must be in $.columns", "warning") - - if valid: - log("This is a valid GeoParquet file.\n", "success") - else: - log("This is an invalid GeoParquet file.\n", "error") - exit(1) - - -if __name__ == "__main__": - main() diff --git a/validator/python/geoparquet_validator/schema.json b/validator/python/geoparquet_validator/schema.json deleted file mode 120000 index b667c23..0000000 --- a/validator/python/geoparquet_validator/schema.json +++ /dev/null @@ -1 +0,0 @@ -../../../format-specs/schema.json \ No newline at end of file diff --git a/validator/python/setup.py b/validator/python/setup.py deleted file mode 100644 index 330a16e..0000000 --- a/validator/python/setup.py +++ /dev/null @@ -1,28 +0,0 @@ -from setuptools import setup, find_packages - -setup( - name="geoparquet_validator", - version="0.0.1", - install_requires=[ - "jsonschema>=4.4", - "pyarrow>=7.0", - "fsspec>=2022.3", - "requests>=2.27", - "aiohttp>=3.8", - "click>=8.1", - "colorama>=0.4" - ], - extras_require={ - "s3": ["s3fs"], - "gcs": ["gcsfs"] - }, - packages=find_packages(), - package_data={ - "geoparquet_validator": ["schema.json"] - }, - entry_points={ - "console_scripts": [ - "geoparquet_validator=geoparquet_validator:main" - ] - } -)