Skip to content

Commit

Permalink
Hack schemas
Browse files: browse the repository at this point in the history
  • Loading branch information
orf committed Dec 5, 2024
1 parent afd8b93 commit ccf36e6
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 113 deletions.
6 changes: 5 additions & 1 deletion src/pypi_data/combine_parquet.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -174,6 +174,8 @@ async def combine_parquet(
f"Configured buffer size: {ByteSize(max_buffer_size).human_readable(decimal=True)}"
)

schema_hack = pa.schema([("repository", pa.int64())])

async with httpx.AsyncClient(follow_redirects=True) as client:
while repositories:
if (
Expand All @@ -199,7 +201,9 @@ async def combine_parquet(
write_statistics=True,
write_batch_size=1024 * 10,
data_page_size=1024 * 1024 * 5,
schema=first_buffer.schema,
schema=pa.unify_schemas(
[first_buffer.schema, schema_hack], promote_options="permissive"
),
) as writer:
append_buffer(fd, writer, first_buffer, roll_up_path, target_size)

Expand Down
Loading

0 comments on commit ccf36e6

Please sign in to comment.