diff --git a/pyproject.toml b/pyproject.toml index 9b3c1d96..403198b1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,3 +26,8 @@ pypi-data = "pypi_data.cli:app" [build-system] requires = ["hatchling"] build-backend = "hatchling.build" + +[dependency-groups] +dev = [ + "ruff>=0.8.2", +] diff --git a/src/pypi_data/combine_parquet.py b/src/pypi_data/combine_parquet.py index 5163db4f..bf440525 100644 --- a/src/pypi_data/combine_parquet.py +++ b/src/pypi_data/combine_parquet.py @@ -75,7 +75,7 @@ async def fill_buffer( client: httpx.AsyncClient, repositories: Deque[CodeRepository], directory: Path, - schema_merge: pa.Schema + schema_merge: pa.Schema, ) -> bool: while repositories: time_hashing_ns = 0 @@ -102,10 +102,13 @@ async def fill_buffer( start_load_time = time.perf_counter_ns() table = await asyncio.to_thread( - lambda: pq.read_table(pa.py_buffer(memoryview(dataset_bytes))) - .combine_chunks() + lambda: pq.read_table( + pa.py_buffer(memoryview(dataset_bytes)) + ).combine_chunks() ) - table_batches = table.cast(pa.unify_schemas([table.schema, schema_merge], promote_options="permissive")).to_batches(max_chunksize=2_500_000) + table_batches = table.cast( + pa.unify_schemas([table.schema, schema_merge], promote_options="permissive") + ).to_batches(max_chunksize=2_500_000) del dataset_bytes, table time_loading_ns += time.perf_counter_ns() - start_load_time @@ -181,7 +184,12 @@ async def combine_parquet( while repositories: if ( await fill_buffer( - buffer, max_buffer_size, client, repositories, directory, schema_merge + buffer, + max_buffer_size, + client, + repositories, + directory, + schema_merge, ) is False ): @@ -193,25 +201,34 @@ async def combine_parquet( first_key, first_buffer = buffer.popleft() keys.append(first_key) - with pyarrow.output_stream( - roll_up_path, compression=None, buffer_size=IO_BUFFER_SIZE - ) as fd, pq.ParquetWriter( - fd, - compression="zstd", - compression_level=9, - write_statistics=True, - write_batch_size=1024 * 10, - data_page_size=1024 * 1024 * 5, - schema=pa.unify_schemas( - [first_buffer.schema, schema_merge], promote_options="permissive" - ), - ) as writer: + with ( + pyarrow.output_stream( + roll_up_path, compression=None, buffer_size=IO_BUFFER_SIZE + ) as fd, + pq.ParquetWriter( + fd, + compression="zstd", + compression_level=9, + write_statistics=True, + write_batch_size=1024 * 10, + data_page_size=1024 * 1024 * 5, + schema=pa.unify_schemas( + [first_buffer.schema, schema_merge], + promote_options="permissive", + ), + ) as writer, + ): append_buffer(fd, writer, first_buffer, roll_up_path, target_size) while buffer or repositories: if not buffer: res = await fill_buffer( - buffer, max_buffer_size, client, repositories, directory, schema_merge + buffer, + max_buffer_size, + client, + repositories, + directory, + schema_merge, ) if res is None: continue diff --git a/uv.lock b/uv.lock index 60901986..90ea7a36 100644 --- a/uv.lock +++ b/uv.lock @@ -368,6 +368,11 @@ dependencies = [ { name = "typer-slim" }, ] +[package.dev-dependencies] +dev = [ + { name = "ruff" }, +] + [package.metadata] requires-dist = [ { name = "hishel", extras = ["sqlite"], specifier = ">=0.0.33" }, @@ -384,6 +389,9 @@ requires-dist = [ { name = "typer-slim", specifier = ">=0.12.5" }, ] +[package.metadata.requires-dev] +dev = [{ name = "ruff", specifier = ">=0.8.2" }] + [[package]] name = "requests" version = "2.32.3" @@ -399,6 +407,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6", size = 64928 }, ] +[[package]] +name = "ruff" +version = "0.8.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5e/2b/01245f4f3a727d60bebeacd7ee6d22586c7f62380a2597ddb22c2f45d018/ruff-0.8.2.tar.gz", hash = "sha256:b84f4f414dda8ac7f75075c1fa0b905ac0ff25361f42e6d5da681a465e0f78e5", size = 3349020 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/29/366be70216dba1731a00a41f2f030822b0c96c7c4f3b2c0cdce15cbace74/ruff-0.8.2-py3-none-linux_armv6l.whl", hash = "sha256:c49ab4da37e7c457105aadfd2725e24305ff9bc908487a9bf8d548c6dad8bb3d", size = 10530649 }, + { url = "https://files.pythonhosted.org/packages/63/82/a733956540bb388f00df5a3e6a02467b16c0e529132625fe44ce4c5fb9c7/ruff-0.8.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:ec016beb69ac16be416c435828be702ee694c0d722505f9c1f35e1b9c0cc1bf5", size = 10274069 }, + { url = "https://files.pythonhosted.org/packages/3d/12/0b3aa14d1d71546c988a28e1b412981c1b80c8a1072e977a2f30c595cc4a/ruff-0.8.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:f05cdf8d050b30e2ba55c9b09330b51f9f97d36d4673213679b965d25a785f3c", size = 9909400 }, + { url = "https://files.pythonhosted.org/packages/23/08/f9f08cefb7921784c891c4151cce6ed357ff49e84b84978440cffbc87408/ruff-0.8.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:60f578c11feb1d3d257b2fb043ddb47501ab4816e7e221fbb0077f0d5d4e7b6f", size = 10766782 }, + { url = "https://files.pythonhosted.org/packages/e4/71/bf50c321ec179aa420c8ec40adac5ae9cc408d4d37283a485b19a2331ceb/ruff-0.8.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:cbd5cf9b0ae8f30eebc7b360171bd50f59ab29d39f06a670b3e4501a36ba5897", size = 10286316 }, + { url = "https://files.pythonhosted.org/packages/f2/83/c82688a2a6117539aea0ce63fdf6c08e60fe0202779361223bcd7f40bd74/ruff-0.8.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b402ddee3d777683de60ff76da801fa7e5e8a71038f57ee53e903afbcefdaa58", size = 11338270 }, + { url = "https://files.pythonhosted.org/packages/7f/d7/bc6a45e5a22e627640388e703160afb1d77c572b1d0fda8b4349f334fc66/ruff-0.8.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:705832cd7d85605cb7858d8a13d75993c8f3ef1397b0831289109e953d833d29", size = 12058579 }, + { url = "https://files.pythonhosted.org/packages/da/3b/64150c93946ec851e6f1707ff586bb460ca671581380c919698d6a9267dc/ruff-0.8.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:32096b41aaf7a5cc095fa45b4167b890e4c8d3fd217603f3634c92a541de7248", size = 11615172 }, + { url = "https://files.pythonhosted.org/packages/e4/9e/cf12b697ea83cfe92ec4509ae414dc4c9b38179cc681a497031f0d0d9a8e/ruff-0.8.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e769083da9439508833cfc7c23e351e1809e67f47c50248250ce1ac52c21fb93", size = 12882398 }, + { url = "https://files.pythonhosted.org/packages/a9/27/96d10863accf76a9c97baceac30b0a52d917eb985a8ac058bd4636aeede0/ruff-0.8.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5fe716592ae8a376c2673fdfc1f5c0c193a6d0411f90a496863c99cd9e2ae25d", size = 11176094 }, + { url = "https://files.pythonhosted.org/packages/eb/10/cd2fd77d4a4e7f03c29351be0f53278a393186b540b99df68beb5304fddd/ruff-0.8.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:81c148825277e737493242b44c5388a300584d73d5774defa9245aaef55448b0", size = 10771884 }, + { url = "https://files.pythonhosted.org/packages/71/5d/beabb2ff18870fc4add05fa3a69a4cb1b1d2d6f83f3cf3ae5ab0d52f455d/ruff-0.8.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d261d7850c8367704874847d95febc698a950bf061c9475d4a8b7689adc4f7fa", size = 10382535 }, + { url = "https://files.pythonhosted.org/packages/ae/29/6b3fdf3ad3e35b28d87c25a9ff4c8222ad72485ab783936b2b267250d7a7/ruff-0.8.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:1ca4e3a87496dc07d2427b7dd7ffa88a1e597c28dad65ae6433ecb9f2e4f022f", size = 10886995 }, + { url = "https://files.pythonhosted.org/packages/e9/dc/859d889b4d9356a1a2cdbc1e4a0dda94052bc5b5300098647e51a58c430b/ruff-0.8.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:729850feed82ef2440aa27946ab39c18cb4a8889c1128a6d589ffa028ddcfc22", size = 11220750 }, + { url = "https://files.pythonhosted.org/packages/0b/08/e8f519f61f1d624264bfd6b8829e4c5f31c3c61193bc3cff1f19dbe7626a/ruff-0.8.2-py3-none-win32.whl", hash = "sha256:ac42caaa0411d6a7d9594363294416e0e48fc1279e1b0e948391695db2b3d5b1", size = 8729396 }, + { url = "https://files.pythonhosted.org/packages/f8/d4/ba1c7ab72aba37a2b71fe48ab95b80546dbad7a7f35ea28cf66fc5cea5f6/ruff-0.8.2-py3-none-win_amd64.whl", hash = "sha256:2aae99ec70abf43372612a838d97bfe77d45146254568d94926e8ed5bbb409ea", size = 9594729 }, + { url = "https://files.pythonhosted.org/packages/23/34/db20e12d3db11b8a2a8874258f0f6d96a9a4d631659d54575840557164c8/ruff-0.8.2-py3-none-win_arm64.whl", hash = "sha256:fb88e2a506b70cfbc2de6fae6681c4f944f7dd5f2fe87233a7233d888bad73e8", size = 9035131 }, +] + [[package]] name = "sniffio" version = "1.3.1"