diff --git a/.github/workflows/scripts.yml b/.github/workflows/scripts.yml
index e8649f2..c694152 100644
--- a/.github/workflows/scripts.yml
+++ b/.github/workflows/scripts.yml
@@ -7,27 +7,6 @@ on:
   pull_request:
 
 jobs:
-  validate-examples:
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: Set up Python 3.8
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.8
-
-      - name: Install validator
-        run: |
-          cd validator/python
-          python -m pip install --no-binary geoparquet_validator .
-
-      - name: Run validator
-        run: |
-          for example in $(ls examples/*.parquet); do
-            echo $example;
-            geoparquet_validator $example || exit 1;
-          done
-
   test-json-metadata:
     runs-on: ubuntu-latest
diff --git a/README.md b/README.md
index c67862b..fb85195 100644
--- a/README.md
+++ b/README.md
@@ -2,9 +2,7 @@
 
 ## About
 
-This repository defines a [specification](https://geoparquet.org/releases/) for how to store geospatial [vector data](https://gisgeography.com/spatial-data-types-vector-raster/) (point, lines, polygons) in [Apache Parquet](https://parquet.apache.org/), a popular columnar storage format for tabular data - see [this vendor explanation](https://databricks.com/glossary/what-is-parquet) for more on what that means. Our goal is to standardize how geospatial data is represented in Parquet to further geospatial interoperability among tools using Parquet today, and hopefully help push forward what's possible with 'cloud-native geospatial' workflows. There are now more than 10 different tools and libraries in 6 different languages that support GeoParquet, you can learn more at [geoparquet.org](https://geoparquet.org).
-
-**Note:** This specification is currently in 1.0 'release candidate' status, which means the community is proposing the current version to be 1.0.0, and if no blocking negative feedback is made until end of August 2023 then it will become 1.0.0. This means breaking changes are still possible, but quite unlikely - see the [versioning](#versioning) section below for more info.
+This repository defines a [specification](https://geoparquet.org/releases/) for how to store geospatial [vector data](https://gisgeography.com/spatial-data-types-vector-raster/) (points, lines, polygons) in [Apache Parquet](https://parquet.apache.org/), a popular columnar storage format for tabular data - see [this vendor explanation](https://databricks.com/glossary/what-is-parquet) for more on what that means. Our goal is to standardize how geospatial data is represented in Parquet to further geospatial interoperability among tools using Parquet today, and hopefully help push forward what's possible with 'cloud-native geospatial' workflows. There are now more than 20 different tools and libraries in 6 different languages that support GeoParquet; you can learn more at [geoparquet.org](https://geoparquet.org).
 
 Early contributors include developers from GeoPandas, GeoTrellis, OpenLayers, Vis.gl, Voltron Data, Microsoft, Carto, Azavea, Planet & Unfolded. Anyone is welcome to join the project, by building implementations, trying it out, giving feedback through issues and contributing to the spec via pull requests.
 
@@ -12,10 +10,21 @@ Initial work started in the [geo-arrow-spec](https://github.com/geoarrow/geoarro
 Arrow work in a compatible way, with this specification focused solely on Parquet. We are in the process of becoming an [OGC](https://ogc.org) official [Standards Working Group](https://portal.ogc.org/files/103450) and are on the path to be a full OGC standard.
-- [**Specification**](format-specs/geoparquet.md)
+The latest [stable specification](https://geoparquet.org/releases/v1.0.0/) and [JSON schema](https://geoparquet.org/releases/v1.0.0/schema.json) are published at [geoparquet.org/releases/](https://geoparquet.org/releases/).
+
+The 'dev' versions of the spec are available in this repo:
+
+- [**Specification**](format-specs/geoparquet.md) (dev version - not stable; see [geoparquet.org/releases/](https://geoparquet.org/releases/) for the latest stable version)
 - [JSON Schema](format-specs/schema.json)
 - [Examples](examples/)
 
+## Validating GeoParquet
+
+Two tools validate both the metadata and the actual data. It is recommended to use one of them to ensure any GeoParquet you produce or receive is fully valid according to the specification:
+
+* **[GPQ](https://github.com/planetlabs/gpq)** - the `validate` command generates a validation report, e.g. `gpq validate example.parquet`.
+* **[GDAL/OGR Validation Script](https://gdal.org/drivers/vector/parquet.html#validation-script)** - a Python script that checks compliance, e.g. `python3 validate_geoparquet.py --check-data my_geo.parquet`.
+
 ## Goals
 
 There are a few core goals driving the initial development.
 
@@ -53,16 +62,14 @@ will work much better if it is backing a system that is constantly updating the
 
 ## Roadmap
 
-Our aim is to get to a 1.0.0 final by the end of August 2023. The goal of 1.0.0 is to establish a baseline of interoperability for geospatial information in Parquet. For 1.0.0
-the only geometry encoding option is Well Known Binary, but we made it an option to allow other encodings. The main goal of 1.1.0 will be to incorporate a more columnar-oriented
+The goal of 1.0.0 was to establish a baseline of interoperability for geospatial information in Parquet. For 1.0.0
+the only geometry encoding option is Well Known Binary, but there is an option to allow other encodings. The main goal of 1.1.0 will be to incorporate a more columnar-oriented
 geometry format, which is currently being worked on as part of the [GeoArrow spec](https://github.com/geoarrow/geoarrow). Once that gets finalized we will add the option to
-GeoParquet. In general 1.1.0 will further explore spatial optimization, spatial indices and spatial partitioning to improve GeoParquet's performance.
+GeoParquet. In general 1.1.0 will further explore spatial optimization, spatial indices and spatial partitioning to improve performance when reading spatial subsets.
 
 ## Versioning
 
-After we reach version 1.0 we will follow [SemVer](https://semver.org/), so at that point any breaking change will require the spec to go to 2.0.0.
-Currently implementors should expect breaking changes, though at some point, hopefully relatively soon (0.4?), we will declare that we don't *think* there
-will be any more potential breaking changes. Though the full commitment to that won't be made until 1.0.0.
+As of version 1.0 the specification follows [Semantic Versioning](https://semver.org/), so any breaking change will require the spec to go to 2.0.0.
 
 ## Current Implementations & Examples
 
diff --git a/examples/example.parquet b/examples/example.parquet
index 6550284..481e66b 100644
Binary files a/examples/example.parquet and b/examples/example.parquet differ
diff --git a/examples/example_metadata.json b/examples/example_metadata.json
index 92204dc..ed69b7c 100644
--- a/examples/example_metadata.json
+++ b/examples/example_metadata.json
@@ -115,6 +115,6 @@
             }
         },
         "primary_column": "geometry",
-        "version": "1.0.0-dev"
+        "version": "1.1.0-dev"
     }
 }
\ No newline at end of file
diff --git a/format-specs/geoparquet.md b/format-specs/geoparquet.md
index 3837ddd..6821c1c 100644
--- a/format-specs/geoparquet.md
+++ b/format-specs/geoparquet.md
@@ -8,7 +8,7 @@ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD", "S
 
 ## Version and schema
 
-This is version 1.0.0-dev of the GeoParquet specification. See the [JSON Schema](schema.json) to validate metadata for this version.
+This is version 1.1.0-dev of the GeoParquet specification. See the [JSON Schema](schema.json) to validate metadata for this version.
 
 ## Geometry columns
 
diff --git a/format-specs/schema.json b/format-specs/schema.json
index 8133a46..ae31ee0 100644
--- a/format-specs/schema.json
+++ b/format-specs/schema.json
@@ -7,7 +7,7 @@
   "properties": {
     "version": {
       "type": "string",
-      "const": "1.0.0-dev"
+      "const": "1.1.0-dev"
     },
     "primary_column": {
       "type": "string",
diff --git a/validator/.gitignore b/validator/.gitignore
deleted file mode 100644
index 68bc17f..0000000
--- a/validator/.gitignore
+++ /dev/null
@@ -1,160 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-# Usually these files are written by a python script from a template
-# before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-# For a library or package, you might want to ignore these files since the code is
-# intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-# However, in case of collaboration, if having platform-specific dependencies or dependencies
-# having no cross-platform support, pipenv may install dependencies that don't work, or not
-# install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-# This is especially recommended for binary packages to ensure reproducibility, and is more
-# commonly ignored for libraries.
-# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-# in version control.
-# https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-# and can be added to the global gitignore or merged into this file. For a more nuclear
-# option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
diff --git a/validator/README.md b/validator/README.md
deleted file mode 100644
index 092746d..0000000
--- a/validator/README.md
+++ /dev/null
@@ -1,7 +0,0 @@
-# GeoParquet validator
-
-Command-line tools to validate a GeoParquet file. Using [JSON Schema](https://json-schema.org/).
-
-## Flavors
-
-- [GeoParquet validator - Python](./python)
diff --git a/validator/python/README.md b/validator/python/README.md
deleted file mode 100644
index 7a40cc6..0000000
--- a/validator/python/README.md
+++ /dev/null
@@ -1,40 +0,0 @@
-# GeoParquet validator - Python
-
-Command-line tool to validate a GeoParquet file. Written in Python. Using [JSON Schema](https://json-schema.org/).
-
-## Installation
-
-```
-pip install --no-binary geoparquet_validator .
-```
-
-**Update**
-
-```
-pip install --no-binary geoparquet_validator -U .
-```
-
-**Development**
-
-```
-pip install -e .
-```
-
-**Uninstall**
-
-```
-pip uninstall geoparquet_validator
-```
-
-## Usage
-
-```
-geoparquet_validator ../../examples/example.parquet
-geoparquet_validator https://storage.googleapis.com/open-geodata/linz-examples/nz-buildings-outlines.parquet
-```
-
-The validator also supports remote files.
-
-- `http://` or `https://`: no further configuration is needed.
-- `s3://`: `s3fs` needs to be installed (run `pip install .[s3]`) and you may need to set environment variables. Refer [here](https://s3fs.readthedocs.io/en/latest/#credentials) for how to define credentials.
-- `gs://`: `gcsfs` needs to be installed (run `pip install .[gcs]`). By default, `gcsfs` will attempt to use your default gcloud credentials or, attempt to get credentials from the google metadata service, or fall back to anonymous access.
diff --git a/validator/python/geoparquet_validator/__init__.py b/validator/python/geoparquet_validator/__init__.py
deleted file mode 100755
index b19103d..0000000
--- a/validator/python/geoparquet_validator/__init__.py
+++ /dev/null
@@ -1,92 +0,0 @@
-import json
-import click
-import pyarrow.parquet as pq
-
-from pprint import pprint
-from urllib.parse import urlparse
-from importlib_resources import files
-from jsonschema.validators import Draft7Validator
-from pyarrow.fs import FSSpecHandler, PyFileSystem
-from fsspec import AbstractFileSystem
-from fsspec.implementations.http import HTTPFileSystem
-from fsspec.implementations.local import LocalFileSystem
-
-
-def choose_fsspec_fs(url_or_path: str) -> AbstractFileSystem:
-    """Choose fsspec filesystem by sniffing input url"""
-    parsed = urlparse(url_or_path)
-
-    if parsed.scheme.startswith("http"):
-        return HTTPFileSystem()
-
-    if parsed.scheme == "s3":
-        from s3fs import S3FileSystem
-
-        return S3FileSystem()
-
-    if parsed.scheme == "gs":
-        from gcsfs import GCSFileSystem
-
-        return GCSFileSystem()
-
-    # TODO: Add Azure
-    return LocalFileSystem()
-
-
-def load_parquet_schema(url_or_path: str) -> pq.ParquetSchema:
-    """Load schema from local or remote Parquet file"""
-    fsspec_fs = choose_fsspec_fs(url_or_path)
-    pyarrow_fs = PyFileSystem(FSSpecHandler(fsspec_fs))
-    return pq.read_schema(pyarrow_fs.open_input_file(url_or_path))
-
-
-def log(text: str, status="info"):
-    status_color = {
-        "info": "white",
-        "warning": "yellow",
-        "error": "red",
-        "success": "green"}
-    click.echo(click.style(text, fg=status_color[status]))
-
-
-@click.command()
-@click.argument("input_file")
-def main(input_file):
-    schema_source = files("geoparquet_validator").joinpath("schema.json")
-    schema = json.loads(schema_source.read_text())
-
-    parquet_schema = load_parquet_schema(input_file)
-
-    if b"geo" not in parquet_schema.metadata:
-        log("Parquet file schema does not have 'geo' key", "error")
-        exit(1)
-
-    metadata = json.loads(parquet_schema.metadata[b"geo"])
-    log("Metadata loaded from file:")
-    pprint(metadata)
-
-    valid = True
-    log("Validating file...")
-
-    errors = Draft7Validator(schema).iter_errors(metadata)
-
-    for error in errors:
-        valid = False
-        log(f"- {error.json_path}: {error.message}", "warning")
-        if "description" in error.schema:
-            log(f" \"{error.schema['description']}\"", "warning")
-
-    # Extra errors
-    if (metadata["primary_column"] not in metadata["columns"]):
-        valid = False
-        log("- $.primary_column: must be in $.columns", "warning")
-
-    if valid:
-        log("This is a valid GeoParquet file.\n", "success")
-    else:
-        log("This is an invalid GeoParquet file.\n", "error")
-        exit(1)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/validator/python/geoparquet_validator/schema.json b/validator/python/geoparquet_validator/schema.json
deleted file mode 120000
index b667c23..0000000
--- a/validator/python/geoparquet_validator/schema.json
+++ /dev/null
@@ -1 +0,0 @@
-../../../format-specs/schema.json
\ No newline at end of file
diff --git a/validator/python/setup.py b/validator/python/setup.py
deleted file mode 100644
index 330a16e..0000000
--- a/validator/python/setup.py
+++ /dev/null
@@ -1,28 +0,0 @@
-from setuptools import setup, find_packages
-
-setup(
-    name="geoparquet_validator",
-    version="0.0.1",
-    install_requires=[
-        "jsonschema>=4.4",
-        "pyarrow>=7.0",
-        "fsspec>=2022.3",
-        "requests>=2.27",
-        "aiohttp>=3.8",
-        "click>=8.1",
-        "colorama>=0.4"
-    ],
-    extras_require={
-        "s3": ["s3fs"],
-        "gcs": ["gcsfs"]
-    },
-    packages=find_packages(),
-    package_data={
-        "geoparquet_validator": ["schema.json"]
-    },
-    entry_points={
-        "console_scripts": [
-            "geoparquet_validator=geoparquet_validator:main"
-        ]
-    }
-)
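
For reference, the version bump to `1.1.0-dev` in `examples/example.parquet` and `format-specs/schema.json` can still be checked without the removed validator package. Below is a minimal sketch that mirrors what the deleted `__init__.py` did, assuming `pyarrow` and `jsonschema` are installed and the paths are resolved from the repository root; `gpq validate` or GDAL's `validate_geoparquet.py`, as referenced in the README changes, remain the recommended tools.

```
# Minimal sketch: validate the example file's "geo" metadata against the dev schema.
# Assumes pyarrow and jsonschema are installed; paths are relative to the repo root.
import json

import pyarrow.parquet as pq
from jsonschema.validators import Draft7Validator

with open("format-specs/schema.json") as f:
    schema = json.load(f)  # now declares "const": "1.1.0-dev" for "version"

# GeoParquet stores its metadata as JSON under the "geo" key of the file metadata.
metadata = pq.read_schema("examples/example.parquet").metadata
geo = json.loads(metadata[b"geo"])

for error in Draft7Validator(schema).iter_errors(geo):
    print(f"{error.json_path}: {error.message}")

print("version:", geo["version"])  # expected to print 1.1.0-dev
```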