Skip to content

Commit

Permalink
fea: add parquet format
Browse files Browse the repository at this point in the history
  • Loading branch information
asawczyn committed Sep 18, 2024
1 parent ddea441 commit f92a10c
Showing 1 changed file with 17 additions and 12 deletions.
29 changes: 17 additions & 12 deletions scripts/nsa/save_pages_from_db_to_file.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,3 @@
import json

import pandas as pd
import pymongo
import typer
Expand All @@ -20,10 +18,17 @@ def fetch_documents(collection, batch_size=1000):
yield doc


def write_to_jsonl_in_chunks(file_path, collection, batch_size=1000):
    """Dump every document of ``collection`` to ``file_path`` as JSON Lines.

    Documents are streamed from ``fetch_documents(collection, batch_size)``,
    wrapped in ``tqdm``, and written one JSON object per line. The target
    file is opened in write mode, so any existing content is replaced.
    """
    with open(file_path, "w") as sink:
        progress = tqdm(fetch_documents(collection, batch_size))
        # One serialized document per line, newline-terminated.
        sink.writelines(f"{json.dumps(record)}\n" for record in progress)
def _iter_batches(docs, batch_size):
    """Group an iterable of documents into lists of at most ``batch_size`` items."""
    batch = []
    for doc in docs:
        batch.append(doc)
        if len(batch) >= batch_size:
            yield batch
            batch = []
    if batch:
        yield batch


def write_to_parquet_in_chunks(file_path, collection, batch_size=1000):
    """Stream the documents of ``collection`` into one Parquet file.

    Documents are read through ``fetch_documents(collection, batch_size)``
    and written batch by batch as row groups of a single snappy-compressed
    Parquet file, so only one batch is held in memory at a time.

    The previous implementation called ``DataFrame.to_parquet(...,
    engine="pyarrow", append=True)``. The pyarrow engine has no ``append``
    option (that keyword is forwarded to pyarrow's writer and rejected), so
    batches are now appended through one ``pyarrow.parquet.ParquetWriter``.

    Args:
        file_path: Destination of the Parquet file (str or path-like).
        collection: Collection handle passed through to ``fetch_documents``.
        batch_size: Number of documents per fetched batch and per row group.

    Raises:
        ValueError: If a later batch contains fields that were absent from
            the first batch; they cannot be stored under the file's schema
            and are reported rather than silently dropped.

    Note:
        The schema is inferred from the first batch. Fields missing from a
        later batch are written as nulls. If the collection yields no
        documents, no file is written.
    """
    # Imported locally: pyarrow is already required at runtime (the code
    # selected engine="pyarrow") but is not imported at module level.
    import pyarrow as pa
    import pyarrow.parquet as pq

    writer = None
    try:
        documents = tqdm(fetch_documents(collection, batch_size))
        for batch in _iter_batches(documents, batch_size):
            frame = pd.DataFrame(batch)
            if writer is None:
                # The first batch defines the schema for the whole file.
                table = pa.Table.from_pandas(frame, preserve_index=False)
                writer = pq.ParquetWriter(
                    str(file_path), table.schema, compression="snappy"
                )
            else:
                schema = writer.schema
                known = set(schema.names)
                unexpected = [name for name in frame.columns if name not in known]
                if unexpected:
                    raise ValueError(
                        f"Fields {unexpected!r} are not part of the Parquet schema "
                        f"inferred from the first batch of {file_path}"
                    )
                # Align column order and add missing fields as nulls so
                # every row group matches the file schema.
                frame = frame.reindex(columns=schema.names)
                table = pa.Table.from_pandas(
                    frame, schema=schema, preserve_index=False
                )
            writer.write_table(table)
    finally:
        # Closing writes the Parquet footer; without it the file is unreadable.
        if writer is not None:
            writer.close()


def main(
Expand All @@ -36,13 +41,13 @@ def main(

NSA_DATA_PATH.mkdir(parents=True, exist_ok=True)

# Save document pages in JSONL format
docs_output_path = NSA_DATA_PATH / "pages.jsonl"
write_to_jsonl_in_chunks(docs_output_path, docs_col)
# Save document pages in Parquet format
docs_output_path = NSA_DATA_PATH / "pages.parquet"
write_to_parquet_in_chunks(docs_output_path, docs_col)

# Save document errors in JSONL format
errors_output_path = NSA_DATA_PATH / "errors.jsonl"
write_to_jsonl_in_chunks(errors_output_path, errors_col)
# Save document errors in Parquet format
errors_output_path = NSA_DATA_PATH / "errors.parquet"
write_to_parquet_in_chunks(errors_output_path, errors_col)


# Entry point: hand `main` to typer so it is invoked as the command-line program.
# NOTE(review): no `if __name__ == "__main__":` guard is visible here, so this call
# also runs whenever the module is imported — confirm that is intended.
typer.run(main)

0 comments on commit f92a10c

Please sign in to comment.