Skip to content

Commit

Permalink
Zero copy hashing
Browse files — browse the repository at this point in the history
  • Loading branch information
orf committed Oct 20, 2024
1 parent 740a8ec commit 17067c2
Showing 1 changed file with 5 additions and 4 deletions.
9 changes: 5 additions & 4 deletions src/pypi_data/combine_parquet.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -102,10 +102,11 @@ async def fill_buffer(
batch: RecordBatch

start_hash_time = time.perf_counter_ns()
digest = hashlib.sha256()
for item in batch.column("path").cast(pyarrow.large_binary()).to_pylist():
digest.update(item)
digest = digest.hexdigest()

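# Hash the path column straight from its underlying Arrow buffer, without
# materialising a Python bytes object per row.
# NOTE(review): for (large_)binary arrays, buffers() returns
# [validity, offsets, data], so index 1 appears to be the offsets buffer rather
# than the value bytes (index 2) — confirm the intended index. The cast itself
# may also allocate when the source type uses 32-bit offsets.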
data_buffer = batch.column("path").cast(pyarrow.large_binary()).buffers()[1]
digest = hashlib.sha256(memoryview(data_buffer)).hexdigest()

time_hashing_ns += time.perf_counter_ns() - start_hash_time

buffer.append(((repo.number, digest), batch))
Expand Down

0 comments on commit 17067c2

Please sign in to comment.