From 87a130be61272a31fb71a2b29d17c25179b68cf7 Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Mon, 7 Aug 2023 14:20:27 +0000 Subject: [PATCH 01/14] Add geodata processing to dataset tool --- poetry.lock | 196 +++++++++++++++++- pyproject.toml | 1 + .../dataset/resource_management.py | 52 ++++- .../{{cookiecutter.slug}}/datapackage.yaml | 3 + 4 files changed, 239 insertions(+), 13 deletions(-) diff --git a/poetry.lock b/poetry.lock index 25e48a6..2670100 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry and should not be changed by hand. +# This file is automatically @generated by Poetry 1.4.2 and should not be changed by hand. [[package]] name = "altair" @@ -616,6 +616,42 @@ files = [ [package.dependencies] colorama = {version = "*", markers = "platform_system == \"Windows\""} +[[package]] +name = "click-plugins" +version = "1.1.1" +description = "An extension module for click to enable registering CLI commands via setuptools entry-points." +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "click-plugins-1.1.1.tar.gz", hash = "sha256:46ab999744a9d831159c3411bb0c79346d94a444df9a3a3742e9ed63645f264b"}, + {file = "click_plugins-1.1.1-py2.py3-none-any.whl", hash = "sha256:5d262006d3222f5057fd81e1623d4443e41dcda5dc815c06b442aa3c02889fc8"}, +] + +[package.dependencies] +click = ">=4.0" + +[package.extras] +dev = ["coveralls", "pytest (>=3.6)", "pytest-cov", "wheel"] + +[[package]] +name = "cligj" +version = "0.7.2" +description = "Click params for commmand line interfaces to GeoJSON" +category = "main" +optional = false +python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, <4" +files = [ + {file = "cligj-0.7.2-py3-none-any.whl", hash = "sha256:c1ca117dbce1fe20a5809dc96f01e1c2840f6dcc939b3ddbb1111bf330ba82df"}, + {file = "cligj-0.7.2.tar.gz", hash = "sha256:a4bc13d623356b373c2c27c53dbd9c68cae5d526270bfa71f6c6fa69669c6b27"}, +] + +[package.dependencies] +click = ">=4.0" + +[package.extras] +test = ["pytest-cov"] + [[package]] name = "colorama" version = "0.4.6" @@ -1021,6 +1057,50 @@ files = [ [package.extras] devel = ["colorama", "json-spec", "jsonschema", "pylint", "pytest", "pytest-benchmark", "pytest-cache", "validictory"] +[[package]] +name = "fiona" +version = "1.9.4.post1" +description = "Fiona reads and writes spatial data files" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "Fiona-1.9.4.post1-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:d6483a20037db2209c8e9a0c6f1e552f807d03c8f42ed0c865ab500945a37c4d"}, + {file = "Fiona-1.9.4.post1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dbe158947099a83ad16f9acd3a21f50ff01114c64e2de67805e382e6b6e0083a"}, + {file = "Fiona-1.9.4.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2c2c7b09eecee3bb074ef8aa518cd6ab30eb663c6fdd0eff3c88d454a9746eaa"}, + {file = "Fiona-1.9.4.post1-cp310-cp310-win_amd64.whl", hash = "sha256:1da8b954f6f222c3c782bc285586ea8dd9d7e55e1bc7861da9cd772bca671660"}, + {file = "Fiona-1.9.4.post1-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:c671d8832287cda397621d79c5a635d52e4631f33a8f0e6fdc732a79a93cb96c"}, + {file = "Fiona-1.9.4.post1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b633a2e550e083805c638d2ab8059c283ca112aaea8241e170c012d2ee0aa905"}, + {file = "Fiona-1.9.4.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1faa625d5202b8403471bbc9f9c96b1bf9099cfcb0ee02a80a3641d3d02383e"}, + {file = 
"Fiona-1.9.4.post1-cp311-cp311-win_amd64.whl", hash = "sha256:39baf11ff0e4318397e2b2197de427b4eebdc49d4a9a7c1366f8a7ed682978a4"}, + {file = "Fiona-1.9.4.post1-cp37-cp37m-macosx_10_15_x86_64.whl", hash = "sha256:d93c993265f6378b23f47708c83bddb3377ca6814a1f0b5a0ae0bee9c8d72cf8"}, + {file = "Fiona-1.9.4.post1-cp37-cp37m-manylinux2014_x86_64.whl", hash = "sha256:b0387cae39e27f338fd948b3b50b6e6ce198cc4cec257fc91660849697c69dc3"}, + {file = "Fiona-1.9.4.post1-cp37-cp37m-win_amd64.whl", hash = "sha256:450561d308d3ce7c7e30294822b1de3f4f942033b703ddd4a91a7f7f5f506ca0"}, + {file = "Fiona-1.9.4.post1-cp38-cp38-macosx_10_15_x86_64.whl", hash = "sha256:71b023ef5248ebfa5524e7a875033f7db3bbfaf634b1b5c1ae36958d1eb82083"}, + {file = "Fiona-1.9.4.post1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:74511d3755695d75cea0f4ff6f5e0c6c5d5be8e0d46dafff124c6a219e99b1eb"}, + {file = "Fiona-1.9.4.post1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:285f3dd4f96aa0a3955ed469f0543375b20989731b2dddc85124453f11ac62bc"}, + {file = "Fiona-1.9.4.post1-cp38-cp38-win_amd64.whl", hash = "sha256:a670ea4262cb9140445bcfc97cbfd2f508a058be342f4a97e966b8ce7696601f"}, + {file = "Fiona-1.9.4.post1-cp39-cp39-macosx_10_15_x86_64.whl", hash = "sha256:ea7c44c15b3a653452b9b3173181490b7afc5f153b0473c145c43c0fbf90448b"}, + {file = "Fiona-1.9.4.post1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:7bfb1f49e0e53f6cd7ad64ae809d72646266b37a7b9881205977408b443a8d79"}, + {file = "Fiona-1.9.4.post1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1a585002a6385cc8ab0f66ddf3caf18711f531901906abd011a67a0cc89ab7b0"}, + {file = "Fiona-1.9.4.post1-cp39-cp39-win_amd64.whl", hash = "sha256:f5da66b723a876142937e683431bbaa5c3d81bb2ed3ec98941271bc99b7f8cd0"}, + {file = "Fiona-1.9.4.post1.tar.gz", hash = "sha256:5679d3f7e0d513035eb72e59527bb90486859af4405755dfc739138633106120"}, +] + +[package.dependencies] +attrs = ">=19.2.0" +certifi = "*" +click = ">=8.0,<9.0" +click-plugins = ">=1.0" +cligj = ">=0.5" +six = "*" + +[package.extras] +all = ["Fiona[calc,s3,test]"] +calc = ["shapely"] +s3 = ["boto3 (>=1.3.1)"] +test = ["Fiona[s3]", "pytest (>=7)", "pytest-cov", "pytz"] + [[package]] name = "flake8" version = "3.9.2" @@ -1110,6 +1190,25 @@ server = ["flask (>=1.1)", "gunicorn (>=20.0)"] spss = ["savReaderWriter (>=3.0)"] sql = ["sqlalchemy (>=1.3)"] +[[package]] +name = "geopandas" +version = "0.13.2" +description = "Geographic pandas extensions" +category = "main" +optional = false +python-versions = ">=3.8" +files = [ + {file = "geopandas-0.13.2-py3-none-any.whl", hash = "sha256:101cfd0de54bcf9e287a55b5ea17ebe0db53a5e25a28bacf100143d0507cabd9"}, + {file = "geopandas-0.13.2.tar.gz", hash = "sha256:e5b56d9c20800c77bcc0c914db3f27447a37b23b2cd892be543f5001a694a968"}, +] + +[package.dependencies] +fiona = ">=1.8.19" +packaging = "*" +pandas = ">=1.1.0" +pyproj = ">=3.0.1" +shapely = ">=1.7.1" + [[package]] name = "google-api-core" version = "2.11.0" @@ -3105,6 +3204,44 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] +[[package]] +name = "pyproj" +version = "3.6.0" +description = "Python interface to PROJ (cartographic projections and coordinate transformations library)" +category = "main" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pyproj-3.6.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e600f6a2771d3b41aeb2cc1efd96771ae9a01451013da1dd48ff272e7c6e34ef"}, + {file = "pyproj-3.6.0-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:d7f6cd045df29aae960391dfe06a575c110af598f1dea5add8be6ca42332b0f5"}, + {file = "pyproj-3.6.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:557e6592855111c84eda176ddf6b130f55d5e2b9cb1c017b8c91b69f37f474f5"}, + {file = "pyproj-3.6.0-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:de6288b6ceabdeeac01abf627c74414822d322d8f55dc8efe4d29dedd27c5719"}, + {file = "pyproj-3.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e427ccdbb1763872416549bdfa9fa1f5f169054653c4daf674e71480cc39cf11"}, + {file = "pyproj-3.6.0-cp310-cp310-win32.whl", hash = "sha256:1283d3c1960edbb74828f5f3405b27578a9a27f7766ab6a3956f4bd851f08239"}, + {file = "pyproj-3.6.0-cp310-cp310-win_amd64.whl", hash = "sha256:9de1aab71234bfd3fd648a1152519b5ee152c43113d7d8ea52590a0140129501"}, + {file = "pyproj-3.6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:00fab048596c17572fa8980014ef117dbb2a445e6f7ba3b9ddfcc683efc598e7"}, + {file = "pyproj-3.6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ba5e7c8ddd6ed5a3f9fcf95ea80ba44c931913723de2ece841c94bb38b200c4a"}, + {file = "pyproj-3.6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:08dfc5c9533c78a97afae9d53b99b810a4a8f97c3be9eb2b8f323b726c736403"}, + {file = "pyproj-3.6.0-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18a8bdb87aeb41b60a2e91d32f623227de3569fb83b4c64b174c3a7c5b0ed3ae"}, + {file = "pyproj-3.6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:dfe392dfc0eba2248dc08c976a72f52ff9da2bddfddfd9ff5dcf18e8e88200c7"}, + {file = "pyproj-3.6.0-cp311-cp311-win32.whl", hash = "sha256:78276c6b0c831255c97c56dff7313a3571f327a284d8ac63d6a56437a72ed0e0"}, + {file = "pyproj-3.6.0-cp311-cp311-win_amd64.whl", hash = "sha256:8fbac2eb9a0e425d7d6b7c6f4ebacd675cf3bdef0c59887057b8b4b0374e7c12"}, + {file = "pyproj-3.6.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:95120d65cbc5983dfd877076f28dbc18b9b329cbee38ca6e217bb7a5a043c099"}, + {file = "pyproj-3.6.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:830e6de7cfe43853967afee5ef908dfd5aa72d1ec12af9b9e3fecc179886e346"}, + {file = "pyproj-3.6.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e342b3010b2b20134671564ff9a8c476e5e512bf589477480aded1a5813af7c8"}, + {file = "pyproj-3.6.0-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23787460fab85ba2f857ee60ffb2e8e21fd9bd5db9833c51c1c05b2a6d9f0be5"}, + {file = "pyproj-3.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:595376e4d3bb72b7dceeccbce0f4c43053d47561f17a1ad0224407e9980ee849"}, + {file = "pyproj-3.6.0-cp39-cp39-win32.whl", hash = "sha256:4d8a9773503085eada59b6892c96ddf686ab8cf64cfdc18ad744d13ee76dfa6f"}, + {file = "pyproj-3.6.0-cp39-cp39-win_amd64.whl", hash = "sha256:137a07404f937f264b11b7130cd4cfa00002dbe4333b222e8056db84849c2ea4"}, + {file = "pyproj-3.6.0-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:2799499a4045e4fb73e44c31bdacab0593a253a7a4b6baae6fdd27d604cf9bc2"}, + {file = "pyproj-3.6.0-pp39-pypy39_pp73-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f04f6297c615c3b17f835df2556ac8fb9b4f51f281e960437eaf0cd80e7ae26a"}, + {file = "pyproj-3.6.0-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a4d2d438b007cb1f8d5f6f308d53d7ff9a2508cff8f9da6e2a93b76ffd98aaf"}, + {file = "pyproj-3.6.0.tar.gz", hash = "sha256:a5b111865b3f0f8b77b3983f2fbe4dd6248fc09d3730295949977c8dcd988062"}, +] + 
+[package.dependencies] +certifi = "*" + [[package]] name = "pyright" version = "1.1.291" @@ -3828,6 +3965,61 @@ docs = ["furo", "jaraco.packaging (>=9)", "jaraco.tidelift (>=1.4)", "pygments-g testing = ["build[virtualenv]", "filelock (>=3.4.0)", "flake8 (<5)", "flake8-2020", "ini2toml[lite] (>=0.9)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pip (>=19.1)", "pip-run (>=8.8)", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=1.3)", "pytest-flake8", "pytest-mypy (>=0.9.1)", "pytest-perf", "pytest-timeout", "pytest-xdist", "tomli-w (>=1.0.0)", "virtualenv (>=13.0.0)", "wheel"] testing-integration = ["build[virtualenv]", "filelock (>=3.4.0)", "jaraco.envs (>=2.2)", "jaraco.path (>=3.2.0)", "pytest", "pytest-enabler", "pytest-xdist", "tomli", "virtualenv (>=13.0.0)", "wheel"] +[[package]] +name = "shapely" +version = "2.0.1" +description = "Manipulation and analysis of geometric objects" +category = "main" +optional = false +python-versions = ">=3.7" +files = [ + {file = "shapely-2.0.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b06d031bc64149e340448fea25eee01360a58936c89985cf584134171e05863f"}, + {file = "shapely-2.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:9a6ac34c16f4d5d3c174c76c9d7614ec8fe735f8f82b6cc97a46b54f386a86bf"}, + {file = "shapely-2.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:865bc3d7cc0ea63189d11a0b1120d1307ed7a64720a8bfa5be2fde5fc6d0d33f"}, + {file = "shapely-2.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45b4833235b90bc87ee26c6537438fa77559d994d2d3be5190dd2e54d31b2820"}, + {file = "shapely-2.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ce88ec79df55430e37178a191ad8df45cae90b0f6972d46d867bf6ebbb58cc4d"}, + {file = "shapely-2.0.1-cp310-cp310-win32.whl", hash = "sha256:01224899ff692a62929ef1a3f5fe389043e262698a708ab7569f43a99a48ae82"}, + {file = "shapely-2.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:da71de5bf552d83dcc21b78cc0020e86f8d0feea43e202110973987ffa781c21"}, + {file = "shapely-2.0.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:502e0a607f1dcc6dee0125aeee886379be5242c854500ea5fd2e7ac076b9ce6d"}, + {file = "shapely-2.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:7d3bbeefd8a6a1a1017265d2d36f8ff2d79d0162d8c141aa0d37a87063525656"}, + {file = "shapely-2.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f470a130d6ddb05b810fc1776d918659407f8d025b7f56d2742a596b6dffa6c7"}, + {file = "shapely-2.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4641325e065fd3e07d55677849c9ddfd0cf3ee98f96475126942e746d55b17c8"}, + {file = "shapely-2.0.1-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:90cfa4144ff189a3c3de62e2f3669283c98fb760cfa2e82ff70df40f11cadb39"}, + {file = "shapely-2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70a18fc7d6418e5aea76ac55dce33f98e75bd413c6eb39cfed6a1ba36469d7d4"}, + {file = "shapely-2.0.1-cp311-cp311-win32.whl", hash = "sha256:09d6c7763b1bee0d0a2b84bb32a4c25c6359ad1ac582a62d8b211e89de986154"}, + {file = "shapely-2.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:d8f55f355be7821dade839df785a49dc9f16d1af363134d07eb11e9207e0b189"}, + {file = "shapely-2.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:83a8ec0ee0192b6e3feee9f6a499d1377e9c295af74d7f81ecba5a42a6b195b7"}, + {file = "shapely-2.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a529218e72a3dbdc83676198e610485fdfa31178f4be5b519a8ae12ea688db14"}, + {file = "shapely-2.0.1-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91575d97fd67391b85686573d758896ed2fc7476321c9d2e2b0c398b628b961c"}, + {file = "shapely-2.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c8b0d834b11be97d5ab2b4dceada20ae8e07bcccbc0f55d71df6729965f406ad"}, + {file = "shapely-2.0.1-cp37-cp37m-win32.whl", hash = "sha256:b4f0711cc83734c6fad94fc8d4ec30f3d52c1787b17d9dca261dc841d4731c64"}, + {file = "shapely-2.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:05c51a29336e604c084fb43ae5dbbfa2c0ef9bd6fedeae0a0d02c7b57a56ba46"}, + {file = "shapely-2.0.1-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:b519cf3726ddb6c67f6a951d1bb1d29691111eaa67ea19ddca4d454fbe35949c"}, + {file = "shapely-2.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:193a398d81c97a62fc3634a1a33798a58fd1dcf4aead254d080b273efbb7e3ff"}, + {file = "shapely-2.0.1-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e55698e0ed95a70fe9ff9a23c763acfe0bf335b02df12142f74e4543095e9a9b"}, + {file = "shapely-2.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f32a748703e7bf6e92dfa3d2936b2fbfe76f8ce5f756e24f49ef72d17d26ad02"}, + {file = "shapely-2.0.1-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1a34a23d6266ca162499e4a22b79159dc0052f4973d16f16f990baa4d29e58b6"}, + {file = "shapely-2.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d173d24e85e51510e658fb108513d5bc11e3fd2820db6b1bd0522266ddd11f51"}, + {file = "shapely-2.0.1-cp38-cp38-win32.whl", hash = "sha256:3cb256ae0c01b17f7bc68ee2ffdd45aebf42af8992484ea55c29a6151abe4386"}, + {file = "shapely-2.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:c7eed1fb3008a8a4a56425334b7eb82651a51f9e9a9c2f72844a2fb394f38a6c"}, + {file = "shapely-2.0.1-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:ac1dfc397475d1de485e76de0c3c91cc9d79bd39012a84bb0f5e8a199fc17bef"}, + {file = "shapely-2.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:33403b8896e1d98aaa3a52110d828b18985d740cc9f34f198922018b1e0f8afe"}, + {file = "shapely-2.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:2569a4b91caeef54dd5ae9091ae6f63526d8ca0b376b5bb9fd1a3195d047d7d4"}, + {file = "shapely-2.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a70a614791ff65f5e283feed747e1cc3d9e6c6ba91556e640636bbb0a1e32a71"}, + {file = "shapely-2.0.1-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c43755d2c46b75a7b74ac6226d2cc9fa2a76c3263c5ae70c195c6fb4e7b08e79"}, + {file = "shapely-2.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad81f292fffbd568ae71828e6c387da7eb5384a79db9b4fde14dd9fdeffca9a"}, + {file = "shapely-2.0.1-cp39-cp39-win32.whl", hash = "sha256:b50c401b64883e61556a90b89948297f1714dbac29243d17ed9284a47e6dd731"}, + {file = "shapely-2.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:bca57b683e3d94d0919e2f31e4d70fdfbb7059650ef1b431d9f4e045690edcd5"}, + {file = "shapely-2.0.1.tar.gz", hash = "sha256:66a6b1a3e72ece97fc85536a281476f9b7794de2e646ca8a4517e2e3c1446893"}, +] + +[package.dependencies] +numpy = ">=1.14" + +[package.extras] +docs = ["matplotlib", "numpydoc (>=1.1.0,<1.2.0)", "sphinx", "sphinx-book-theme", "sphinx-remove-toctrees"] +test = ["pytest", "pytest-cov"] + [[package]] name = "shellingham" version = "1.5.0.post1" @@ -4523,4 +4715,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.11" 
-content-hash = "34ea7766877c41c5778a7c2339615fa7da868a2fe2bbeaa683505f9ac19f979d" +content-hash = "fefa554270da664690bbd98cc6e742e75960435c4ac4d178ae32a075717ff5e3" diff --git a/pyproject.toml b/pyproject.toml index b6ecae2..7db4164 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ lxml = "^4.9.1" pyarrow = "^11.0.0" duckdb = "^0.6.1" sqlfluff = "^1.4.5" +geopandas = "^0.13.2" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/src/data_common/dataset/resource_management.py b/src/data_common/dataset/resource_management.py index 898831d..35251a6 100644 --- a/src/data_common/dataset/resource_management.py +++ b/src/data_common/dataset/resource_management.py @@ -13,6 +13,7 @@ from shutil import copyfile from typing import Any, Callable, Dict, Literal, TypedDict, TypeVar, cast from urllib.parse import urlencode +import geopandas as gpd import pandas as pd import pytest @@ -169,7 +170,6 @@ def get_df(self) -> pd.DataFrame: raise ValueError(f"Unhandled file type {self.path.suffix}") def get_resource(self, inline_data: bool = False) -> dict[str, Any]: - if self.has_resource_yaml: yaml = YAML(typ="safe") with open(self.resource_path, "r") as f: @@ -205,7 +205,7 @@ def get_schema_from_file( ) -> SchemaValidator: return update_table_schema(self.path, existing_schema) - def rebuild_yaml(self): + def rebuild_yaml(self, is_geodata: bool = False): """ Recreate yaml file from source file, preserving any custom values from previously existing yaml file """ @@ -218,6 +218,15 @@ def rebuild_yaml(self): desc["schema"] = self.get_schema_from_file(existing_desc.get("schema", None)) desc["path"] = self.path.name + # if geodata - drop geometry example from schema + if is_geodata: + new_fields = [] + for f in desc["schema"]["fields"]: + if f["name"] == "geometry": + f["example"] = "" + new_fields.append(f) + desc["schema"]["fields"] = new_fields + # ensure a blank title and description new_dict = {"title": None, "description": None, "custom": {}} @@ -337,7 +346,6 @@ def build_from_function(self): ) return None if ":" in build_module and " " not in build_module: - module, function = build_module.split(":") module = importlib.import_module(module) function = getattr(module, function) @@ -680,7 +688,6 @@ def derive_bump_rule_from_change(self) -> tuple[version_rules, str] | None: ) if current_data != previous_data: - dict_diff = diff_dicts(previous_data, current_data) rich.print(dict_diff) @@ -809,8 +816,13 @@ def rebuild_resource(self, slug: str): resource.rebuild_yaml() def rebuild_all_resources(self): + is_geodata = self.is_geodata() for resource in self.resources().values(): - resource.rebuild_yaml() + resource.rebuild_yaml(is_geodata=is_geodata) + + def is_geodata(self) -> bool: + desc = self.get_datapackage() + return desc["custom"].get("is_geodata", False) def get_datapackage(self) -> dict[str, Any]: yaml = YAML(typ="safe") @@ -897,15 +909,21 @@ def copy_resources(self): """ desc = self.get_datapackage() - csv_value = desc.get("custom", {}).get("formats", {}).get("csv", True) - parquet_value = desc.get("custom", {}).get("formats", {}).get("parquet", True) + formats = desc.get("custom", {}).get("formats", {}) + csv_value = formats.get("csv", True) + parquet_value = formats.get("parquet", True) + geojson_value = formats.get("geojson", True) + geopackage_value = formats.get("gpkg", True) csv_copy_query = """ copy (select * from {{ source }}) to {{ dest }} (format PARQUET); """ + exclude = "" + if desc["custom"].get("is_geodata", False): + exclude = "EXCLUDE geometry" parquet_copy_query 
= """ - copy (select * from {{ source }}) to {{ dest }} (HEADER, DELIMITER ','); + copy (select * {{ exclude }} from {{ source }}) to {{ dest }} (HEADER, DELIMITER ','); """ for r in self.resources().values(): @@ -916,12 +934,24 @@ def copy_resources(self): if parquet_value: parquet_file = self.build_path() / (r.path.stem + ".parquet") duck_query(csv_copy_query, source=r.path, dest=parquet_file) + if geojson_value or geopackage_value: + raise ValueError( + "Writing to geojson/geopackage from csv source not supported. Use parquet internally." + ) elif r.path.suffix == ".parquet": if parquet_value: copyfile(r.path, self.build_path() / r.path.name) if csv_value: csv_file = self.build_path() / (r.path.stem + ".csv") duck_query(parquet_copy_query, source=r.path, dest=csv_file) + if geojson_value: + geojson_path = self.build_path() / (r.path.stem + ".geojson") + gdf = gpd.read_parquet(r.path) + gdf.to_file(geojson_path, driver="GeoJSON") + if geopackage_value: + geopackage_path = self.build_path() / (r.path.stem + ".gpkg") + gdf = gpd.read_parquet(r.path) + gdf.to_file(geopackage_path, driver="GPKG") def get_datapackage_order(self) -> int: """ @@ -1134,6 +1164,9 @@ def build_excel(self): for sheet_name, df in sheets.items(): short_sheet_name = sheet_name[-31:] # only allow 31 characters + # if geometry is column - remove it + if "geometry" in df.columns: + df = df.drop(columns=["geometry"]) df.to_excel(writer, sheet_name=short_sheet_name, index=False) for column in df: @@ -1142,7 +1175,6 @@ def build_excel(self): col_idx = df.columns.get_loc(column) if column_length <= 50: - writer.sheets[short_sheet_name].set_column( col_idx, col_idx, column_length ) @@ -1236,7 +1268,6 @@ def convert_to_array_from_comma(value: t) -> list[t]: # for instance splitting comma seperated fields to arrays for resource_slug, modify_maps in composite_options["modify"].items(): for column, modify_type in modify_maps.items(): - # split specified columns to arrays and update the schema if modify_type == "comma-to-array": for resource in datapackage["resources"]: @@ -1282,7 +1313,6 @@ def build_markdown(self): ... 
def print_status(self): - resources = list(self.resources().values()) df = pd.DataFrame( diff --git a/src/data_common/resources/dataset_template/{{cookiecutter.slug}}/datapackage.yaml b/src/data_common/resources/dataset_template/{{cookiecutter.slug}}/datapackage.yaml index 94cce6d..89c9176 100644 --- a/src/data_common/resources/dataset_template/{{cookiecutter.slug}}/datapackage.yaml +++ b/src/data_common/resources/dataset_template/{{cookiecutter.slug}}/datapackage.yaml @@ -31,6 +31,9 @@ custom: formats: csv: true parquet: true + gpkg: false + geojson: false + is_geodata: false composite: xlsx: include: all From a88eba7f032a5580025cd4dc5dd76bec90350978 Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Mon, 7 Aug 2023 14:41:02 +0000 Subject: [PATCH 02/14] Update pyyaml --- poetry.lock | 77 ++++++++++++++++++++++++++++---------------------- pyproject.toml | 1 - 2 files changed, 44 insertions(+), 34 deletions(-) diff --git a/poetry.lock b/poetry.lock index 2670100..dff4c12 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3421,42 +3421,53 @@ files = [ ] [[package]] -name = "PyYAML" -version = "5.4.1" +name = "pyyaml" +version = "6.0.1" description = "YAML parser and emitter for Python" category = "main" optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*" +python-versions = ">=3.6" files = [ - {file = "PyYAML-5.4.1-cp27-cp27m-macosx_10_9_x86_64.whl", hash = "sha256:3b2b1824fe7112845700f815ff6a489360226a5609b96ec2190a45e62a9fc922"}, - {file = "PyYAML-5.4.1-cp27-cp27m-win32.whl", hash = "sha256:129def1b7c1bf22faffd67b8f3724645203b79d8f4cc81f674654d9902cb4393"}, - {file = "PyYAML-5.4.1-cp27-cp27m-win_amd64.whl", hash = "sha256:4465124ef1b18d9ace298060f4eccc64b0850899ac4ac53294547536533800c8"}, - {file = "PyYAML-5.4.1-cp27-cp27mu-manylinux1_x86_64.whl", hash = "sha256:bb4191dfc9306777bc594117aee052446b3fa88737cd13b7188d0e7aa8162185"}, - {file = "PyYAML-5.4.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:6c78645d400265a062508ae399b60b8c167bf003db364ecb26dcab2bda048253"}, - {file = "PyYAML-5.4.1-cp36-cp36m-manylinux1_x86_64.whl", hash = "sha256:4e0583d24c881e14342eaf4ec5fbc97f934b999a6828693a99157fde912540cc"}, - {file = "PyYAML-5.4.1-cp36-cp36m-manylinux2014_aarch64.whl", hash = "sha256:72a01f726a9c7851ca9bfad6fd09ca4e090a023c00945ea05ba1638c09dc3347"}, - {file = "PyYAML-5.4.1-cp36-cp36m-manylinux2014_s390x.whl", hash = "sha256:895f61ef02e8fed38159bb70f7e100e00f471eae2bc838cd0f4ebb21e28f8541"}, - {file = "PyYAML-5.4.1-cp36-cp36m-win32.whl", hash = "sha256:3bd0e463264cf257d1ffd2e40223b197271046d09dadf73a0fe82b9c1fc385a5"}, - {file = "PyYAML-5.4.1-cp36-cp36m-win_amd64.whl", hash = "sha256:e4fac90784481d221a8e4b1162afa7c47ed953be40d31ab4629ae917510051df"}, - {file = "PyYAML-5.4.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:5accb17103e43963b80e6f837831f38d314a0495500067cb25afab2e8d7a4018"}, - {file = "PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl", hash = "sha256:e1d4970ea66be07ae37a3c2e48b5ec63f7ba6804bdddfdbd3cfd954d25a82e63"}, - {file = "PyYAML-5.4.1-cp37-cp37m-manylinux2014_aarch64.whl", hash = "sha256:cb333c16912324fd5f769fff6bc5de372e9e7a202247b48870bc251ed40239aa"}, - {file = "PyYAML-5.4.1-cp37-cp37m-manylinux2014_s390x.whl", hash = "sha256:fe69978f3f768926cfa37b867e3843918e012cf83f680806599ddce33c2c68b0"}, - {file = "PyYAML-5.4.1-cp37-cp37m-win32.whl", hash = "sha256:dd5de0646207f053eb0d6c74ae45ba98c3395a571a2891858e87df7c9b9bd51b"}, - {file = "PyYAML-5.4.1-cp37-cp37m-win_amd64.whl", hash = 
"sha256:08682f6b72c722394747bddaf0aa62277e02557c0fd1c42cb853016a38f8dedf"}, - {file = "PyYAML-5.4.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:d2d9808ea7b4af864f35ea216be506ecec180628aced0704e34aca0b040ffe46"}, - {file = "PyYAML-5.4.1-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:8c1be557ee92a20f184922c7b6424e8ab6691788e6d86137c5d93c1a6ec1b8fb"}, - {file = "PyYAML-5.4.1-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:fd7f6999a8070df521b6384004ef42833b9bd62cfee11a09bda1079b4b704247"}, - {file = "PyYAML-5.4.1-cp38-cp38-manylinux2014_s390x.whl", hash = "sha256:bfb51918d4ff3d77c1c856a9699f8492c612cde32fd3bcd344af9be34999bfdc"}, - {file = "PyYAML-5.4.1-cp38-cp38-win32.whl", hash = "sha256:fa5ae20527d8e831e8230cbffd9f8fe952815b2b7dae6ffec25318803a7528fc"}, - {file = "PyYAML-5.4.1-cp38-cp38-win_amd64.whl", hash = "sha256:0f5f5786c0e09baddcd8b4b45f20a7b5d61a7e7e99846e3c799b05c7c53fa696"}, - {file = "PyYAML-5.4.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:294db365efa064d00b8d1ef65d8ea2c3426ac366c0c4368d930bf1c5fb497f77"}, - {file = "PyYAML-5.4.1-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:74c1485f7707cf707a7aef42ef6322b8f97921bd89be2ab6317fd782c2d53183"}, - {file = "PyYAML-5.4.1-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:d483ad4e639292c90170eb6f7783ad19490e7a8defb3e46f97dfe4bacae89122"}, - {file = "PyYAML-5.4.1-cp39-cp39-manylinux2014_s390x.whl", hash = "sha256:fdc842473cd33f45ff6bce46aea678a54e3d21f1b61a7750ce3c498eedfe25d6"}, - {file = "PyYAML-5.4.1-cp39-cp39-win32.whl", hash = "sha256:49d4cdd9065b9b6e206d0595fee27a96b5dd22618e7520c33204a4a3239d5b10"}, - {file = "PyYAML-5.4.1-cp39-cp39-win_amd64.whl", hash = "sha256:c20cfa2d49991c8b4147af39859b167664f2ad4561704ee74c1de03318e898db"}, - {file = "PyYAML-5.4.1.tar.gz", hash = "sha256:607774cbba28732bfa802b54baa7484215f530991055bb562efbed5b2f20a45e"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, + {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, + {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, + {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, + {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = "sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, + {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, + {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, + {file = 
"PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, + {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, + {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, + {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, + {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, + {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, + {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, + {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, + {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, + {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, + {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, + {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, + {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, + {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, + {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, + {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, + {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, + {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] [[package]] @@ -4715,4 +4726,4 @@ files = [ [metadata] lock-version = "2.0" python-versions = ">=3.10,<3.11" -content-hash = "fefa554270da664690bbd98cc6e742e75960435c4ac4d178ae32a075717ff5e3" +content-hash = "5e6c63d8cdd5908c182672a502e7200f4abed104c4e8292e9a14e5104feac4ae" diff --git a/pyproject.toml b/pyproject.toml index 7db4164..68c6b25 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,6 @@ python = ">=3.10,<3.11" numpy = "1.21.0" openpyxl = "3.0.7" pandas = "1.4.2" -PyYAML = "5.4.1" scikit-learn = "^1.0.2" unicodecsv = "0.14.1" xlrd = "2.0.1" From 8ea4cf7663d8eecd8fc6351c45f4f0653e22a61e Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Mon, 7 Aug 2023 15:25:41 +0000 Subject: [PATCH 03/14] Add typing exemptions --- src/data_common/charting/download.py | 4 ++-- src/data_common/charting/sw_theme.py | 2 +- src/data_common/charting/theme.py | 2 +- src/data_common/pandas/df_extensions/space.py | 8 +------- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/src/data_common/charting/download.py b/src/data_common/charting/download.py index 76e6852..df56f50 100644 --- a/src/data_common/charting/download.py +++ b/src/data_common/charting/download.py @@ -39,12 +39,12 @@ def json_to_chart(json_spec: str) -> alt.Chart: del di_copy["datasets"] del di_copy["width"] c = Chart.from_dict(di_copy) - chart += c + chart += c # type: ignore else: del di["width"] del di["config"]["view"] chart = Chart.from_dict(di) - return chart + return chart # type: ignore def get_chart_from_url(url: str, n: int = 0) -> alt.Chart: diff --git a/src/data_common/charting/sw_theme.py b/src/data_common/charting/sw_theme.py index bc6d292..6ca81ae 100644 --- a/src/data_common/charting/sw_theme.py +++ b/src/data_common/charting/sw_theme.py @@ -118,7 +118,7 @@ def color_scale( use_palette = palette[: len(domain)] if reverse: use_palette = use_palette[::-1] - return alt.Scale(domain=domain, range=use_palette) + return alt.Scale(domain=domain, range=use_palette) # type: ignore font = "Lato" diff --git a/src/data_common/charting/theme.py b/src/data_common/charting/theme.py index 3b2935a..2e1316a 100644 --- a/src/data_common/charting/theme.py +++ b/src/data_common/charting/theme.py @@ -131,7 +131,7 @@ def color_scale( use_palette = palette[: len(domain)] if reverse: use_palette = use_palette[::-1] - return alt.Scale(domain=domain, range=use_palette) + return alt.Scale(domain=domain, range=use_palette) # type: ignore font = "Source Sans Pro" diff --git a/src/data_common/pandas/df_extensions/space.py b/src/data_common/pandas/df_extensions/space.py index 6df9673..d7042dd 100644 --- a/src/data_common/pandas/df_extensions/space.py +++ b/src/data_common/pandas/df_extensions/space.py @@ -125,14 +125,12 @@ def t(x): self.label_df = label_df def 
set_k(self, k: int) -> "Cluster": - new = copy.deepcopy(self) new.k = k return new def get_label_name(self, n, include_short=True) -> str: - short_label = n name = self.label_names.get(self.k, {}).get(n, short_label) if include_short: @@ -205,7 +203,6 @@ def map_from_anchor(self, anchor: pd.DataFrame | Path) -> dict[int, int]: return mapping.to_dict() def get_label_options(self) -> list: - return [self.get_label_name(x) for x in range(1, self.k + 1)] def get_cluster_label_ids(self) -> pd.Series: @@ -215,7 +212,6 @@ def get_cluster_label_ids(self) -> pd.Series: return labels def get_cluster_labels(self, include_short=True) -> ArrayLike: - labels = self.get_cluster_label_ids() def f(x): @@ -227,7 +223,6 @@ def f(x): label_array = get_cluster_labels def get_cluster_descs(self) -> ArrayLike: - labels = self.get_cluster_label_ids() labels = labels.apply(lambda x: self.get_label_desc(n=x)) return np.array(labels) @@ -331,7 +326,7 @@ def _get_clusters(self, k: int): """ fetch k means results for this cluster """ - km = KMeans(n_clusters=k, random_state=self.default_seed, n_init=10) + km = KMeans(n_clusters=k, random_state=self.default_seed, n_init=10) # type: ignore return km.fit(self.df) def get_clusters(self, k: int): @@ -622,7 +617,6 @@ def join_distance(df_label_dict: Dict[str, pd.DataFrame]) -> pd.DataFrame: """ def prepare(df, label): - return ( df.set_index(list(df.columns[:2])) .rename(columns={"distance": label}) From f1f8eb10d2a63bcd9301e3ff690e40e498905fd4 Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Mon, 7 Aug 2023 16:16:56 +0000 Subject: [PATCH 04/14] Make updating run notebook optional --- src/data_common/management/run_notebook.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/data_common/management/run_notebook.py b/src/data_common/management/run_notebook.py index 27058aa..0cb12eb 100644 --- a/src/data_common/management/run_notebook.py +++ b/src/data_common/management/run_notebook.py @@ -3,7 +3,7 @@ from pathlib import Path -def run_notebook(notebook_filename: Path): +def run_notebook(notebook_filename: Path, save: bool = True): """ Run a notebook as part of another process """ @@ -12,6 +12,8 @@ def run_notebook(notebook_filename: Path): nb = nbformat.read(f, as_version=4) ep = ExecutePreprocessor(timeout=600) ep.preprocess(nb, {"metadata": {"path": "notebooks/"}}) - with open(notebook_filename, "w", encoding="utf-8") as f: - nbformat.write(nb, f) + if save: + print(f"Saving notebook: {notebook_filename}") + with open(notebook_filename, "w", encoding="utf-8") as f: + nbformat.write(nb, f) print("Done") From 472a1d74ceb9020e0dc2da055453adf50879e08e Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Tue, 8 Aug 2023 09:54:04 +0000 Subject: [PATCH 05/14] Change where notebook settings are stored --- src/data_common/management/settings.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/data_common/management/settings.py b/src/data_common/management/settings.py index 7ab85b0..41aa195 100644 --- a/src/data_common/management/settings.py +++ b/src/data_common/management/settings.py @@ -25,7 +25,11 @@ def get_settings( settings_file = Path(*top_level, toml_file) - data = toml.load(settings_file)["notebook"]["settings"] + try: + data = toml.load(settings_file)["tool"]["notebook"]["settings"] + except KeyError: + # backward compatibiiity for invalid toml + data = toml.load(settings_file)["notebook"]["settings"] env_data = {} if env_file and Path(*top_level, env_file).exists(): From d368fd5d1de3661f6b6de75cd696d2d044039036 Mon Sep 
17 00:00:00 2001 From: Alex Parsons Date: Wed, 9 Aug 2023 14:27:13 +0000 Subject: [PATCH 06/14] formatting fixes --- src/data_common/dataset/jekyll_management.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/data_common/dataset/jekyll_management.py b/src/data_common/dataset/jekyll_management.py index ec0012c..39985ab 100644 --- a/src/data_common/dataset/jekyll_management.py +++ b/src/data_common/dataset/jekyll_management.py @@ -12,7 +12,6 @@ def markdown_with_frontmatter( data: dict[str, Any], dest: Path, content: str = "", from_file: Path | None = None ): - if content and from_file: raise ValueError("Trying to use contents and from_file arguments") @@ -31,7 +30,6 @@ def markdown_with_frontmatter( def render_download_format_to_dir(items: list[dict[str, Any]], output_dir: Path): - if output_dir.exists() is False: output_dir.mkdir() # remove existing files @@ -52,7 +50,6 @@ def render_download_format_to_dir(items: list[dict[str, Any]], output_dir: Path) def render_sources_to_dir(items: list[dict[str, Any]], output_dir: Path): - if output_dir.exists() is False: output_dir.mkdir() # remove existing files @@ -103,7 +100,7 @@ def make_version_info_page(items: list[dict[str, Any]], output_dir: Path): df = pd.DataFrame(items)[["name", "title", "version", "full_version"]] for name, d in df.groupby("name"): - safe_name = name.replace("-", "_") + safe_name = str(name).replace("-", "_") data_dict = { "name": name, "title": d["title"].iloc[0], From edbef720229296ba4741fa4ca36e7b17b4774c32 Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Wed, 9 Aug 2023 14:27:59 +0000 Subject: [PATCH 07/14] Helper functions for validation tests in typing --- src/data_common/helpers/typing.py | 119 ++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 src/data_common/helpers/typing.py diff --git a/src/data_common/helpers/typing.py b/src/data_common/helpers/typing.py new file mode 100644 index 0000000..85ae348 --- /dev/null +++ b/src/data_common/helpers/typing.py @@ -0,0 +1,119 @@ +from typing import ( + get_args, + Any, + Type, + TypeVar, + Callable, + Generic, + ParamSpec, + get_type_hints, +) + +from inspect import signature + +T = TypeVar("T") +P = ParamSpec("P") + + +class ValidationTest(Generic[T]): + root_type: Type[T] + test: Callable[[T], Any] + error: Callable[[T], Exception] + + def __init__( + self, + root_type: Type[T], + test: Callable[[T], Any], + error: Callable[[T], Exception], + ): + self.root_type = root_type + self.test = test + self.error = error + + def __call__(self, *args, **kwargs): + return self.test(*args, **kwargs) + + +def inspect_function(func): + sig = signature(func) + parameters = sig.parameters + args = [] + kwargs = {} + + for param_name, param in parameters.items(): + if param.default == param.empty: + args.append(param_name) + else: + kwargs[param_name] = param.default + + return args, kwargs + + +def merge_args_kwargs(func, *args, **kwargs): + expected_args, expected_kwargs = inspect_function(func) + + if len(args) > len(expected_args): + raise ValueError( + f"Function expects {len(expected_args)} positional arguments, but {len(args)} were provided." 
+ ) + + merged_kwargs = expected_kwargs.copy() + + for i, arg in enumerate(args): + merged_kwargs[expected_args[i]] = arg + + merged_kwargs.update(kwargs) + + return merged_kwargs + + +def enforce_types(func: Callable[P, T]) -> Callable[P, T]: + """ + This lets us move some basic validation items into the type hint structure + """ + type_hints = get_type_hints(func, include_extras=True) + expected_args, expected_kwargs = inspect_function(func) + + def wrapper(*args: P.args, **kwargs: P.kwargs): + if len(args) > len(expected_args): + raise ValueError( + f"Function expects {len(expected_args)} positional arguments, but {len(args)} were provided." + ) + + merged_kwargs = expected_kwargs.copy() + + for i, arg in enumerate(args): + merged_kwargs[expected_args[i]] = arg + + merged_kwargs.update(kwargs) + + for arg, type_ in type_hints.items(): + if arg == "return": + continue + parameter_value = merged_kwargs[arg] + enforce_type(parameter_value, type_) + value = func(*args, **kwargs) + if "return" in type_hints: + enforce_type(value, type_hints["return"]) + return value + + return wrapper + + +def enforce_type(object: T, annotated_type: Type[T]) -> None: + meta_data = get_args(annotated_type) + + if not meta_data: + if not isinstance(object, annotated_type): + raise TypeError(f"Expected {annotated_type} but got {type(object)}") + + if meta_data: + type_ = meta_data[0] + + if not isinstance(object, type_): + raise TypeError(f"Expected {type_} but got {type(object)}") + tests = meta_data[1:] + + for test in tests: + if not test(object): + raise test.error(object) From a0260459ed46f3ab82b4aff0fd25df68a44e4d1b Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Wed, 9 Aug 2023 15:54:42 +0000 Subject: [PATCH 08/14] Add new decorator based approach for duck queries --- src/data_common/db/duck.py | 118 ++++++++++++++++++++++++++++++++++--- 1 file changed, 109 insertions(+), 9 deletions(-) diff --git a/src/data_common/db/duck.py b/src/data_common/db/duck.py index d4dfbfb..6121705 100644 --- a/src/data_common/db/duck.py +++ b/src/data_common/db/duck.py @@ -1,14 +1,23 @@ -import inspect from functools import lru_cache from pathlib import Path -from typing import Any, Literal, Callable - +from typing import Any, Literal, Callable, Protocol, runtime_checkable, Union import duckdb import jinja2 import pandas as pd import toml +@runtime_checkable +class DuckView(Protocol): + query: str + + +@runtime_checkable +class DuckMacro(Protocol): + args: list[str] + macro: str + + @lru_cache def get_settings(toml_file: str = "pyproject.toml") -> dict: """ @@ -122,17 +131,57 @@ def __truediv__(self, other: str) -> "DuckUrl": return DuckUrl(f"{url}/{other}") +SourceType = Path | DuckUrl | pd.DataFrame + + +@runtime_checkable +class SourceView(Protocol): + @property + def source(self) -> SourceType: + ... 
+ + class DuckQuery: def __init__(self): self.ddb = duckdb.connect(":memory:") self.https: bool = False + self.variables = {} + self._last_query: DuckResponse | None = None + + def set_jinja_variable(self, name: str, value: Any) -> "DuckQuery": + """ + Set jinja variables that can then be used in queries + """ + self.variables[name] = value + return self + + @property + def last_query(self): + """ + Get query for last view registered + """ + if not self._last_query: + raise ValueError("No previous query to execute") + return self._last_query def activate_https(self) -> None: if self.https is False: self.ddb.execute("install httpfs; load httpfs") - def register(self, name: str, item: pd.DataFrame | DuckUrl | Path) -> None: + def as_source(self, item: SourceView) -> "DuckResponse": + """ + Decorator to convert something implementing SourceView to a DuckResponse + """ + name = item.__name__ # type: ignore + source = getattr(item, "source", None) + + if source is None: + raise ValueError("Class must have a source attribute") + self.register(name, source) + return self.view(name) + + def register(self, name: str, item: SourceType) -> None: if isinstance(item, DuckUrl): self.activate_https() self.ddb.execute( @@ -156,14 +205,38 @@ def add_view(self, name: str, query: str) -> "DuckQuery": self.ddb.execute(f"CREATE OR REPLACE VIEW {name} AS {query}") return self + def as_view(self, cls: DuckView) -> "DuckResponse": + """ + Decorator to convert something implementing DuckView to a DuckResponse + """ + + query = getattr(cls, "query", None) + + if query is None: + raise ValueError("Class must have a query method") + + store_as_view = getattr(cls, "store_as_view", None) # type: ignore + + if store_as_view is None: + store_as_view: str = cls.__name__ # type: ignore + + return self.query(query, store_as=store_as_view) + + def view(self, view_name: str): + """ """ + return self.query(f"SELECT * FROM {view_name}") + def query( - self, query: str | Path, store_as: str | None = None, **kwargs: Any + self, query: str | Path | DuckView, store_as: str | None = None, **kwargs: Any ) -> DuckResponse: """ Execute a query """ + if isinstance(query, DuckView): + return self.as_view(query) + + query_vars = self.variables | kwargs - # if the query is a path, read it in if isinstance(query, Path) or query.endswith(".sql"): path = Path(query) if not path.exists(): @@ -189,11 +262,11 @@ def process_kwarg(key: str, value: Any) -> Any: return value - if kwargs: + if query_vars: env = jinja2.Environment() template = env.from_string(query) - args = {k: process_kwarg(k, v) for k, v in kwargs.items()} + args = {k: process_kwarg(k, v) for k, v in query_vars.items()} rendered_query = template.render(**args) else: @@ -202,9 +275,36 @@ def process_kwarg(key: str, value: Any) -> Any: if store_as: self.ddb.execute(f"CREATE OR REPLACE VIEW {store_as} AS {rendered_query}") rendered_query = f"SELECT * FROM {store_as}" - return DuckResponse(self, rendered_query) + + response = DuckResponse(self, rendered_query) + + self._last_query = response + return response + + def as_macro(self, item: DuckMacro): + name = item.__name__ # type: ignore + + args = getattr(item, "args", None) + + if args is None: + raise ValueError("Macro must have an args attribute") + + macro = getattr(item, "macro", None) + + if macro is None: + raise ValueError("Macro must have a macro method") + + macro_query = f""" + CREATE OR REPLACE MACRO {name}({", ".join(args)}) AS + {macro} + """ + self.query(macro_query).run() + + return item def macro(self, func: 
Callable[..., str]) -> None:
+        # deprecated: converts a function
+        # prefer 'as_macro' for clarity
         # get function name
         name = func.__name__
         # get arguments

From 69f4b5780c8a721be653e016cdf600c81c67cc Mon Sep 17 00:00:00 2001
From: Alex Parsons
Date: Wed, 9 Aug 2023 15:55:20 +0000
Subject: [PATCH 09/14] Add helpers for working with parquet files

---
 src/data_common/helpers/parquet.py | 104 +++++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 src/data_common/helpers/parquet.py

diff --git a/src/data_common/helpers/parquet.py b/src/data_common/helpers/parquet.py
new file mode 100644
index 0000000..92c12e9
--- /dev/null
+++ b/src/data_common/helpers/parquet.py
@@ -0,0 +1,104 @@
+from pathlib import Path
+import pyarrow as pa
+import pyarrow.parquet as pq
+import pandas as pd
+import geopandas as gpd
+import math
+from tqdm import tqdm
+from geopandas.io.arrow import _arrow_to_geopandas
+
+
+def write_split_parquet(
+    from_file: Path,
+    output_path: Path,
+    chunk_size: int = 1000,
+    compression: str = "GZIP",
+    silent: bool = False,
+):
+    """
+    Split a Parquet file into multiple Parquet files.
+    """
+    # initialize output directory
+    if not output_path.exists():
+        output_path.mkdir(parents=True)
+
+    if output_path.exists() and not output_path.is_dir():
+        raise ValueError("Output path is not a directory.")
+
+    else:
+        for file in output_path.iterdir():
+            file.unlink()
+    table = pa.parquet.read_table(from_file)
+
+    # Calculate the total number of records
+    total_records = table.num_rows
+
+    # Calculate the number of chunks needed
+    num_chunks = math.ceil(total_records / chunk_size)
+
+    # Split the table into chunks and write to separate Parquet files
+    for chunk_idx in tqdm(list(range(num_chunks)), disable=silent):
+        start_idx = chunk_idx * chunk_size
+        end_idx = min((chunk_idx + 1) * chunk_size, total_records)
+
+        # Slice the table to create a new chunk
+        chunk_table = table.slice(start_idx, end_idx - start_idx)
+
+        # Write the chunk to a Parquet file
+        output_file = output_path / f"{chunk_idx}.parquet"
+        pq.write_table(chunk_table, output_file, compression=compression)
+
+
+def read_parquet_directory_to_table(directory_path: Path) -> pa.Table:
+    """
+    Read all Parquet files in a directory and combine them into a single PyArrow Table.
+    """
+    parquet_files = [
+        file for file in directory_path.iterdir() if file.suffix == ".parquet"
+    ]
+    if not parquet_files:
+        raise ValueError("No Parquet files found in the directory.")
+
+    # Read Parquet files and combine them into a single DataFrame
+    tables = []
+    for file in parquet_files:
+        table = pq.read_table(file)
+        tables.append(table)
+
+    return pa.concat_tables(tables)
+
+
+def read_parquet_directory(directory_path: Path) -> pd.DataFrame:
+    """
+    Read all Parquet files in a directory and combine them into a single Pandas DataFrame.
+    """
+    table = read_parquet_directory_to_table(directory_path)
+    return table.to_pandas()
+
+
+def read_split_geoparquet(fromdir: Path) -> gpd.GeoDataFrame:
+    """
+    Read all Parquet files in a directory and combine them into a single GeoPandas DataFrame.
+    """
+    table = read_parquet_directory_to_table(fromdir)
+    # convert pyarrow table to geopandas dataframe
+    return _arrow_to_geopandas(table)
+
+
+def open_geo_file(file_path: Path) -> gpd.GeoDataFrame:
+    """
+    Open a GeoFile (GeoJSON, Shapefile, GeoPackage, etc.) and return a GeoDataFrame.
+ """ + + # if the file_path is a directory return a GeoDataFrame + if file_path.is_dir(): + return read_split_geoparquet(file_path) + # if the file_name is "*.parquet", get the parent directory and return a GeoDataFrame + elif file_path.name == "*.parquet": + return read_split_geoparquet(file_path.parent) + # if it's another parquet file, return that using the normal method + elif file_path.suffix == ".parquet": + return gpd.read_parquet(file_path) + # if it's a GeoJSON, Shapefile, GeoPackage, etc. return that using the normal method + else: + return gpd.read_file(file_path) From 1a81beb1f3142335df518cc92de31306b7aea532 Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Thu, 10 Aug 2023 10:39:19 +0000 Subject: [PATCH 10/14] Do not generate geoformats by default --- src/data_common/dataset/resource_management.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data_common/dataset/resource_management.py b/src/data_common/dataset/resource_management.py index 35251a6..ec7c5bc 100644 --- a/src/data_common/dataset/resource_management.py +++ b/src/data_common/dataset/resource_management.py @@ -912,8 +912,8 @@ def copy_resources(self): formats = desc.get("custom", {}).get("formats", {}) csv_value = formats.get("csv", True) parquet_value = formats.get("parquet", True) - geojson_value = formats.get("geojson", True) - geopackage_value = formats.get("gpkg", True) + geojson_value = formats.get("geojson", False) + geopackage_value = formats.get("gpkg", False) csv_copy_query = """ copy (select * from {{ source }}) to {{ dest }} (format PARQUET); From 0030f25cae42e112f9e782ec40e005f3b9996323 Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Thu, 10 Aug 2023 10:53:49 +0000 Subject: [PATCH 11/14] Fix CSV render from parquet bug - Actually need to run the transform queries. --- src/data_common/dataset/resource_management.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/data_common/dataset/resource_management.py b/src/data_common/dataset/resource_management.py index 35251a6..6d74a8b 100644 --- a/src/data_common/dataset/resource_management.py +++ b/src/data_common/dataset/resource_management.py @@ -933,7 +933,7 @@ def copy_resources(self): copyfile(r.path, self.build_path() / r.path.name) if parquet_value: parquet_file = self.build_path() / (r.path.stem + ".parquet") - duck_query(csv_copy_query, source=r.path, dest=parquet_file) + duck_query(csv_copy_query, source=r.path, dest=parquet_file).run() if geojson_value or geopackage_value: raise ValueError( "Writing to geojson/geopackage from csv source not supported. Use parquet internally." 
@@ -943,7 +943,12 @@ def copy_resources(self): copyfile(r.path, self.build_path() / r.path.name) if csv_value: csv_file = self.build_path() / (r.path.stem + ".csv") - duck_query(parquet_copy_query, source=r.path, dest=csv_file) + duck_query( + parquet_copy_query, + exclude=exclude, + source=r.path, + dest=csv_file, + ).run() if geojson_value: geojson_path = self.build_path() / (r.path.stem + ".geojson") gdf = gpd.read_parquet(r.path) From 2dcf2af2a0d0ff1fe4f415cf69d63a35ea2c66f4 Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Fri, 11 Aug 2023 08:15:21 +0000 Subject: [PATCH 12/14] Remove __index_level_0__ from csv exports --- src/data_common/dataset/resource_management.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/data_common/dataset/resource_management.py b/src/data_common/dataset/resource_management.py index 1ec53ce..f504df6 100644 --- a/src/data_common/dataset/resource_management.py +++ b/src/data_common/dataset/resource_management.py @@ -918,9 +918,12 @@ def copy_resources(self): csv_copy_query = """ copy (select * from {{ source }}) to {{ dest }} (format PARQUET); """ - exclude = "" + + # __index_level_0__ is an internal parquet column that duckdb has access to + # but we don't want to export + exclude = "EXCLUDE __index_level_0__" if desc["custom"].get("is_geodata", False): - exclude = "EXCLUDE geometry" + exclude = "EXCLUDE __index_level_0__, geometry" parquet_copy_query = """ copy (select * {{ exclude }} from {{ source }}) to {{ dest }} (HEADER, DELIMITER ','); From 00a006415e01d86fce14982eb018341f0adaf60d Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Fri, 11 Aug 2023 08:30:46 +0000 Subject: [PATCH 13/14] Make all composite types remove geodata --- .../dataset/resource_management.py | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/src/data_common/dataset/resource_management.py b/src/data_common/dataset/resource_management.py index f504df6..34ee6e1 100644 --- a/src/data_common/dataset/resource_management.py +++ b/src/data_common/dataset/resource_management.py @@ -11,7 +11,7 @@ from dataclasses import dataclass from pathlib import Path from shutil import copyfile -from typing import Any, Callable, Dict, Literal, TypedDict, TypeVar, cast +from typing import Any, Callable, Literal, TypedDict, TypeVar, cast from urllib.parse import urlencode import geopandas as gpd @@ -21,9 +21,7 @@ import xlsxwriter import re -from frictionless import Schema, describe, validate -from pyparsing import any_open_tag -from rich.markdown import Markdown +from frictionless import describe, validate from rich.table import Table from ruamel.yaml import YAML @@ -169,15 +167,18 @@ def get_df(self) -> pd.DataFrame: else: raise ValueError(f"Unhandled file type {self.path.suffix}") - def get_resource(self, inline_data: bool = False) -> dict[str, Any]: + def get_resource( + self, inline_data: bool = False, is_geodata: bool = False + ) -> dict[str, Any]: if self.has_resource_yaml: yaml = YAML(typ="safe") - with open(self.resource_path, "r") as f: + with self.resource_path.open("r") as f: resource = yaml.load(f) if inline_data: - resource["data"] = ( - self.get_df().fillna(value="").to_dict(orient="records") - ) + df = self.get_df() + if is_geodata and "geometry" in df.columns: + df = df.drop(columns=["geometry"]) + resource["data"] = df.fillna(value="").to_dict(orient="records") resource["format"] = "json" del resource["scheme"] del resource["path"] @@ -209,8 +210,6 @@ def rebuild_yaml(self, is_geodata: bool = False): """ Recreate yaml file 
from source file, preserving any custom values from previously existing yaml file
         """
-        from frictionless.resource.resource import Resource
-
         existing_desc = self.get_resource()
         desc = describe(self.path)
         desc.update(existing_desc)
@@ -271,7 +270,7 @@ def rebuild_yaml(self, is_geodata: bool = False):
         yaml_str = yaml_str.replace("- no\n", "- 'no'\n")
         yaml_str = yaml_str.replace("- yes\n", "- 'yes'\n")
 
-        with open(self.resource_path, "w") as f:
+        with self.resource_path.open("w") as f:
             f.write(yaml_str)
 
         print(f"Updated config for {self.slug} to {self.resource_path}")
@@ -1141,7 +1140,7 @@ def get_composite_options(
 
         return composite_options
 
-    def build_excel(self):
+    def build_excel(self, is_geodata: bool = False):
         """
         Build a single excel file for all resources
         """
@@ -1173,7 +1172,7 @@
         for sheet_name, df in sheets.items():
             short_sheet_name = sheet_name[-31:]  # only allow 31 characters
             # if geometry is a column - remove it
-            if "geometry" in df.columns:
+            if is_geodata and "geometry" in df.columns:
                 df = df.drop(columns=["geometry"])
             df.to_excel(writer, sheet_name=short_sheet_name, index=False)
@@ -1193,7 +1192,7 @@
 
         writer.save()
 
-    def build_sqlite(self):
+    def build_sqlite(self, is_geodata: bool = False):
         """
         Create a composite sqlite file for all resources with
         metadata as a separate table.
@@ -1216,7 +1215,10 @@
         for slug, resource in self.resources().items():
             if slug not in allowed_resource_slugs:
                 continue
-            sheets[slug] = resource.get_df()
+            df = resource.get_df()
+            if is_geodata and "geometry" in df.columns:
+                df = df.drop(columns=["geometry"])
+            sheets[slug] = df
             meta_df = resource.get_metadata_df()
             meta_df["resource"] = slug
             metadata.append(meta_df)
@@ -1232,7 +1234,7 @@
             df.to_sql(name, con, index=False)
         con.close()
 
-    def build_composite_json(self):
+    def build_composite_json(self, is_geodata: bool = False):
         """
         This builds a composite json file that inlines the data as json.
         It can have fewer resources than the total, and some modifiers on the data.
@@ -1251,7 +1253,7 @@ def build_composite_json(self): ] datapackage["resources"] = [ - x.get_resource(inline_data=True) + x.get_resource(inline_data=True, is_geodata=is_geodata) for x in self.resources().values() if x.slug in allowed_resource_slugs ] @@ -1310,9 +1312,10 @@ def build_composites(self): """ Create composite files for the datapackage """ - self.build_excel() - self.build_sqlite() - self.build_composite_json() + is_geodata = self.is_geodata() + self.build_excel(is_geodata) + self.build_sqlite(is_geodata) + self.build_composite_json(is_geodata) def build_markdown(self): """ From bdca4e075c10ee1392ea1b148ed4fe60bcac7e28 Mon Sep 17 00:00:00 2001 From: Alex Parsons Date: Fri, 11 Aug 2023 11:01:42 +0000 Subject: [PATCH 14/14] Fix geodata-less csv query --- src/data_common/dataset/resource_management.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/data_common/dataset/resource_management.py b/src/data_common/dataset/resource_management.py index 34ee6e1..4f8bd75 100644 --- a/src/data_common/dataset/resource_management.py +++ b/src/data_common/dataset/resource_management.py @@ -920,9 +920,9 @@ def copy_resources(self): # __index_level_0__ is an internal parquet column that duckdb has access to # but we don't want to export - exclude = "EXCLUDE __index_level_0__" + exclude = "EXCLUDE (__index_level_0__)" if desc["custom"].get("is_geodata", False): - exclude = "EXCLUDE __index_level_0__, geometry" + exclude = "EXCLUDE (__index_level_0__, geometry)" parquet_copy_query = """ copy (select * {{ exclude }} from {{ source }}) to {{ dest }} (HEADER, DELIMITER ',');
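
---

Usage sketches (not part of the patches above). The three helper APIs introduced in this series are easiest to follow with a small example each; names flagged as hypothetical are invented for illustration.

PATCH 07 hangs extra validation off typing.Annotated metadata: enforce_types checks each argument's base type, then runs any ValidationTest objects attached to the annotation. A minimal sketch, assuming the module imports as data_common.helpers.typing; the Positive alias and halve function are hypothetical.

    from typing import Annotated

    from data_common.helpers.typing import ValidationTest, enforce_types

    # hypothetical alias: an int that must also pass a positivity check
    Positive = Annotated[
        int,
        ValidationTest(
            root_type=int,
            test=lambda value: value > 0,
            error=lambda value: ValueError(f"{value} is not positive"),
        ),
    ]


    @enforce_types
    def halve(value: Positive) -> float:
        return value / 2


    halve(4)   # passes the isinstance check and the ValidationTest
    halve(-2)  # raises ValueError("-2 is not positive") before the body runs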
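
PATCH 08 lets small classes stand in for DuckDB views and macros via the as_view and as_macro decorators. A sketch under stated assumptions: the frame, macro and view names are invented, registering a DataFrame relies on the pre-existing register signature, and only .run() on the returned DuckResponse is demonstrated elsewhere in this series.

    import pandas as pd

    from data_common.db.duck import DuckQuery

    duck = DuckQuery()

    # register an in-memory frame under a name SQL can reference
    duck.register("raw", pd.DataFrame({"x": [1, 2, 3]}))


    # the class name becomes the macro name; the args and macro
    # attributes satisfy the DuckMacro protocol
    @duck.as_macro
    class double:
        args = ["v"]
        macro = "v * 2"


    # a class with a query attribute satisfies the DuckView protocol;
    # the view is stored under the class name and the decorator hands
    # back a DuckResponse selecting from it
    @duck.as_view
    class doubled:
        query = "select x, double(x) as x2 from raw"


    doubled.run()  # duck.last_query also points at this response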
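
PATCH 09 exists so large geoparquet files can be stored as directories of row chunks and read back transparently. A sketch of the round trip; the file paths below are placeholders.

    from pathlib import Path

    from data_common.helpers.parquet import open_geo_file, write_split_parquet

    # split one large geoparquet file into 1000-row chunk files
    write_split_parquet(
        from_file=Path("data/boundaries.parquet"),
        output_path=Path("data/boundaries_split"),
        chunk_size=1000,
    )

    # open_geo_file accepts the split directory, a single parquet file,
    # a "*.parquet" path, or anything geopandas can read directly,
    # and always returns a GeoDataFrame
    gdf = open_geo_file(Path("data/boundaries_split"))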