From 93e066ed270409542a5479ac555baf8a0aa06e53 Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Fri, 19 Jul 2024 09:39:20 -0400 Subject: [PATCH] added csv_to_parquet function --- pyproject.toml | 3 +- requirements.txt | 448 ++++++++++++++++++++++++++++++++++++++-- rialto_airflow/utils.py | 22 +- test/test_utils.py | 12 ++ 4 files changed, 463 insertions(+), 22 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9b3cd25..78f57c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,8 @@ dependencies = [ "dimcli", "polars", "pyalex", - "more-itertools" + "more-itertools", + "pyarrow" ] [tool.pytest.ini_options] diff --git a/requirements.txt b/requirements.txt index bd461c0..5370b5e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,96 +1,445 @@ # This file was autogenerated by uv via the following command: -# uv pip compile pyproject.toml -o requirements.txt +# uv pip compile - -o requirements.txt +aiohttp==3.9.5 + # via apache-airflow-providers-http +aiosignal==1.3.1 + # via aiohttp alabaster==0.7.16 # via sphinx +alembic==1.13.1 + # via apache-airflow +anyio==4.4.0 + # via httpx +apache-airflow==2.9.2 + # via + # apache-airflow-providers-common-io + # apache-airflow-providers-common-sql + # apache-airflow-providers-fab + # apache-airflow-providers-ftp + # apache-airflow-providers-http + # apache-airflow-providers-imap + # apache-airflow-providers-smtp + # apache-airflow-providers-sqlite +apache-airflow-providers-common-io==1.3.2 + # via apache-airflow +apache-airflow-providers-common-sql==1.14.0 + # via + # apache-airflow + # apache-airflow-providers-sqlite +apache-airflow-providers-fab==1.1.1 + # via apache-airflow +apache-airflow-providers-ftp==3.9.1 + # via apache-airflow +apache-airflow-providers-http==4.11.1 + # via apache-airflow +apache-airflow-providers-imap==3.6.1 + # via apache-airflow +apache-airflow-providers-smtp==1.7.1 + # via apache-airflow +apache-airflow-providers-sqlite==3.8.1 + # via apache-airflow +apispec==6.6.1 + # via flask-appbuilder +argcomplete==3.4.0 + # via apache-airflow +asgiref==3.8.1 + # via + # apache-airflow + # apache-airflow-providers-http asttokens==2.4.1 # via stack-data +attrs==23.2.0 + # via + # aiohttp + # apache-airflow + # jsonschema + # referencing babel==2.15.0 - # via sphinx + # via + # flask-babel + # sphinx +blinker==1.8.2 + # via apache-airflow +cachelib==0.9.0 + # via + # flask-caching + # flask-session certifi==2024.6.2 - # via requests + # via + # httpcore + # httpx + # requests +cffi==1.16.0 + # via cryptography charset-normalizer==3.3.2 # via requests click==8.1.7 - # via dimcli + # via + # clickclick + # dimcli + # flask + # flask-appbuilder +clickclick==20.10.2 + # via connexion +colorama==0.4.6 + # via flask-appbuilder +colorlog==4.8.0 + # via apache-airflow commonmark==0.9.1 # via recommonmark +configupdater==3.2 + # via apache-airflow +connexion==2.14.2 + # via apache-airflow +cron-descriptor==1.4.3 + # via apache-airflow +croniter==2.0.5 + # via apache-airflow +cryptography==42.0.8 + # via apache-airflow decorator==5.1.1 # via ipython +deprecated==1.2.14 + # via + # apache-airflow + # limits + # opentelemetry-api + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +dill==0.3.8 + # via apache-airflow dimcli==1.2 - # via rialto-airflow (pyproject.toml) +dnspython==2.6.1 + # via email-validator docutils==0.21.2 # via + # python-daemon # recommonmark # sphinx +email-validator==2.1.2 + # via flask-appbuilder executing==2.0.1 # via stack-data +flask==2.2.5 + # via + # apache-airflow + # apache-airflow-providers-fab + # connexion + # flask-appbuilder + # flask-babel + # flask-caching + # flask-jwt-extended + # flask-limiter + # flask-login + # flask-session + # flask-sqlalchemy + # flask-wtf +flask-appbuilder==4.4.1 + # via apache-airflow-providers-fab +flask-babel==2.0.0 + # via flask-appbuilder +flask-caching==2.3.0 + # via apache-airflow +flask-jwt-extended==4.6.0 + # via flask-appbuilder +flask-limiter==3.7.0 + # via flask-appbuilder +flask-login==0.6.3 + # via + # apache-airflow-providers-fab + # flask-appbuilder +flask-session==0.5.0 + # via apache-airflow +flask-sqlalchemy==2.5.1 + # via flask-appbuilder +flask-wtf==1.2.1 + # via + # apache-airflow + # flask-appbuilder +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +fsspec==2024.6.0 + # via + # apache-airflow + # universal-pathlib +google-re2==1.1.20240601 + # via + # apache-airflow + # apache-airflow-providers-fab +googleapis-common-protos==1.63.1 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +grpcio==1.64.1 + # via opentelemetry-exporter-otlp-proto-grpc +gunicorn==22.0.0 + # via apache-airflow +h11==0.14.0 + # via httpcore +httpcore==1.0.5 + # via httpx +httpx==0.27.0 + # via apache-airflow idna==3.7 - # via requests + # via + # anyio + # email-validator + # httpx + # requests + # yarl imagesize==1.4.1 # via sphinx +importlib-metadata==7.1.0 + # via opentelemetry-api +importlib-resources==6.4.0 + # via limits +inflection==0.5.1 + # via connexion +iniconfig==2.0.0 + # via pytest ipython==8.25.0 # via dimcli +itsdangerous==2.2.0 + # via + # apache-airflow + # connexion + # flask + # flask-wtf jedi==0.19.1 # via ipython jinja2==3.1.4 - # via sphinx + # via + # apache-airflow + # flask + # flask-babel + # python-nvd3 + # sphinx +jmespath==1.0.1 + # via apache-airflow-providers-fab +jsonschema==4.22.0 + # via + # apache-airflow + # connexion + # flask-appbuilder +jsonschema-specifications==2023.12.1 + # via jsonschema +lazy-object-proxy==1.10.0 + # via apache-airflow +limits==3.12.0 + # via flask-limiter +linkify-it-py==2.0.3 + # via apache-airflow +lockfile==0.12.2 + # via + # apache-airflow + # python-daemon +mako==1.3.5 + # via alembic +markdown-it-py==3.0.0 + # via + # apache-airflow + # mdit-py-plugins + # rich markupsafe==2.1.5 - # via jinja2 + # via + # apache-airflow + # jinja2 + # mako + # werkzeug + # wtforms +marshmallow==3.21.3 + # via + # flask-appbuilder + # marshmallow-oneofschema + # marshmallow-sqlalchemy +marshmallow-oneofschema==3.1.1 + # via apache-airflow +marshmallow-sqlalchemy==0.28.2 + # via flask-appbuilder matplotlib-inline==0.1.7 # via ipython +mdit-py-plugins==0.4.1 + # via apache-airflow +mdurl==0.1.2 + # via markdown-it-py +methodtools==0.4.7 + # via apache-airflow more-itertools==10.3.0 - # via rialto-airflow (pyproject.toml) + # via apache-airflow-providers-common-sql +multidict==6.0.5 + # via + # aiohttp + # yarl numpy==1.26.4 # via # dimcli # pandas + # pyarrow +opentelemetry-api==1.25.0 + # via + # apache-airflow + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-exporter-otlp==1.25.0 + # via apache-airflow +opentelemetry-exporter-otlp-proto-common==1.25.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-exporter-otlp-proto-grpc==1.25.0 + # via opentelemetry-exporter-otlp +opentelemetry-exporter-otlp-proto-http==1.25.0 + # via opentelemetry-exporter-otlp +opentelemetry-proto==1.25.0 + # via + # opentelemetry-exporter-otlp-proto-common + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-sdk==1.25.0 + # via + # opentelemetry-exporter-otlp-proto-grpc + # opentelemetry-exporter-otlp-proto-http +opentelemetry-semantic-conventions==0.46b0 + # via opentelemetry-sdk +ordered-set==4.1.0 + # via flask-limiter packaging==24.1 # via + # apache-airflow + # apispec + # connexion # dimcli + # gunicorn + # limits + # marshmallow + # marshmallow-sqlalchemy + # pytest # sphinx pandas==2.2.2 - # via - # rialto-airflow (pyproject.toml) - # dimcli + # via dimcli parso==0.8.4 # via jedi +pathspec==0.12.1 + # via apache-airflow +pendulum==3.0.0 + # via apache-airflow pexpect==4.9.0 # via ipython -polars==0.20.31 - # via rialto-airflow (pyproject.toml) +pluggy==1.5.0 + # via + # apache-airflow + # pytest +polars==1.2.1 +prison==0.2.1 + # via flask-appbuilder prompt-toolkit==3.0.47 # via # dimcli # ipython +protobuf==4.25.3 + # via + # googleapis-common-protos + # opentelemetry-proto +psutil==5.9.8 + # via apache-airflow ptyprocess==0.7.0 # via pexpect pure-eval==0.2.2 # via stack-data pyalex==0.14 - # via rialto-airflow (pyproject.toml) +pyarrow==17.0.0 +pycparser==2.22 + # via cffi pygments==2.18.0 # via + # apache-airflow # dimcli # ipython + # rich # sphinx +pyjwt==2.8.0 + # via + # apache-airflow + # flask-appbuilder + # flask-jwt-extended +pytest==8.2.2 +python-daemon==3.0.1 + # via apache-airflow python-dateutil==2.9.0.post0 - # via pandas + # via + # apache-airflow + # croniter + # flask-appbuilder + # pandas + # pendulum + # time-machine python-dotenv==1.0.1 - # via rialto-airflow (pyproject.toml) +python-nvd3==0.16.0 + # via apache-airflow +python-slugify==8.0.4 + # via + # apache-airflow + # python-nvd3 pytz==2024.1 - # via pandas + # via + # croniter + # flask-babel + # pandas +pyyaml==6.0.1 + # via + # apispec + # clickclick + # connexion recommonmark==0.7.1 # via dimcli +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications requests==2.32.3 # via - # rialto-airflow (pyproject.toml) + # apache-airflow + # apache-airflow-providers-http + # connexion # dimcli + # opentelemetry-exporter-otlp-proto-http # pyalex + # requests-toolbelt # sphinx +requests-toolbelt==1.0.0 + # via apache-airflow-providers-http +rfc3339-validator==0.1.4 + # via apache-airflow +rich==13.7.1 + # via + # apache-airflow + # flask-limiter + # rich-argparse +rich-argparse==1.5.2 + # via apache-airflow +rpds-py==0.18.1 + # via + # jsonschema + # referencing +setproctitle==1.3.3 + # via apache-airflow +setuptools==70.0.0 + # via python-daemon six==1.16.0 # via # asttokens + # prison # python-dateutil + # rfc3339-validator + # wirerope +sniffio==1.3.1 + # via + # anyio + # httpx snowballstemmer==2.2.0 # via sphinx sphinx==7.3.7 @@ -107,19 +456,78 @@ sphinxcontrib-qthelp==1.0.7 # via sphinx sphinxcontrib-serializinghtml==1.1.10 # via sphinx +sqlalchemy==1.4.52 + # via + # alembic + # apache-airflow + # flask-appbuilder + # flask-sqlalchemy + # marshmallow-sqlalchemy + # sqlalchemy-jsonfield + # sqlalchemy-utils +sqlalchemy-jsonfield==1.0.2 + # via apache-airflow +sqlalchemy-utils==0.41.2 + # via flask-appbuilder +sqlparse==0.5.0 + # via apache-airflow-providers-common-sql stack-data==0.6.3 # via ipython +tabulate==0.9.0 + # via apache-airflow +tenacity==8.4.1 + # via apache-airflow +termcolor==2.4.0 + # via apache-airflow +text-unidecode==1.3 + # via python-slugify +time-machine==2.14.1 + # via pendulum tqdm==4.66.4 # via dimcli traitlets==5.14.3 # via # ipython # matplotlib-inline +typing-extensions==4.12.2 + # via + # alembic + # flask-limiter + # limits + # opentelemetry-sdk tzdata==2024.1 - # via pandas + # via + # pandas + # pendulum +uc-micro-py==1.0.3 + # via linkify-it-py +unicodecsv==0.14.1 + # via apache-airflow +universal-pathlib==0.2.2 + # via apache-airflow urllib3==2.2.1 # via # pyalex # requests wcwidth==0.2.13 # via prompt-toolkit +werkzeug==2.2.3 + # via + # apache-airflow + # connexion + # flask + # flask-appbuilder + # flask-jwt-extended + # flask-login +wirerope==0.4.7 + # via methodtools +wrapt==1.16.0 + # via deprecated +wtforms==3.1.2 + # via + # flask-appbuilder + # flask-wtf +yarl==1.9.4 + # via aiohttp +zipp==3.19.2 + # via importlib-metadata diff --git a/rialto_airflow/utils.py b/rialto_airflow/utils.py index a562032..3f43c3e 100644 --- a/rialto_airflow/utils.py +++ b/rialto_airflow/utils.py @@ -1,7 +1,12 @@ import csv import datetime -from pathlib import Path import re +import sys +from itertools import batched +from pathlib import Path + +import pyarrow +from pyarrow.parquet import ParquetWriter def create_snapshot_dir(data_dir): @@ -63,3 +68,18 @@ def normalize_doi(doi): doi = re.sub("^doi: ", "", doi) return doi + + +def csv_to_parquet(csv_file, parquet_file, batch_size=10_000): + csv.field_size_limit(sys.maxsize) + + csv_input = open(csv_file) + reader = csv.DictReader(csv_input) + + # naively assume all columns are strings + schema = pyarrow.schema([(name, pyarrow.string()) for name in reader.fieldnames]) + + with ParquetWriter(open(parquet_file, "wb"), schema, compression="zstd") as writer: + for rows in batched(reader, batch_size): + table = pyarrow.Table.from_pylist(rows, schema) + writer.write_table(table) diff --git a/test/test_utils.py b/test/test_utils.py index fea4891..6c5055a 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2,6 +2,7 @@ from pathlib import Path import pytest +import polars from rialto_airflow import utils @@ -67,3 +68,14 @@ def test_normalize_doi(): == "10.1103/physrevlett.96.07390" ) assert utils.normalize_doi(" doi: 10.1234/5678 ") == "10.1234/5678" + + +def test_csv_to_parquet(tmp_path): + csv_file = Path("test/data/authors.csv") + parquet_file = tmp_path / "authors.parquet" + utils.csv_to_parquet(csv_file, parquet_file) + + assert parquet_file.is_file() + df = polars.read_parquet(parquet_file) + + assert df.shape == (10, 2)