From 11693785f41cc42c8ecf14abc73273dc042a3744 Mon Sep 17 00:00:00 2001 From: Tom Forbes Date: Sun, 20 Oct 2024 12:31:48 +0100 Subject: [PATCH] 2.5 million batch size --- src/pypi_data/combine_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pypi_data/combine_parquet.py b/src/pypi_data/combine_parquet.py index 3e9cf834..ae8b5be7 100644 --- a/src/pypi_data/combine_parquet.py +++ b/src/pypi_data/combine_parquet.py @@ -64,7 +64,7 @@ async def fill_buffer( log.info(f"Downloaded, reading {path}") table = pq.read_table(path, memory_map=True).combine_chunks() - for idx, batch in enumerate(table.to_batches(max_chunksize=2_000_000)): + for idx, batch in enumerate(table.to_batches(max_chunksize=2_500_000)): batch: RecordBatch digest = hashlib.sha256() for item in batch.column("path").cast(pyarrow.large_binary()).to_pylist():