Skip to content

Commit

Permalink
fix: fix oom problem
Browse files Browse the repository at this point in the history
  • Loading branch information
asawczyn committed Sep 17, 2024
1 parent 717379e commit ddea441
Showing 1 changed file with 23 additions and 9 deletions.
32 changes: 23 additions & 9 deletions scripts/nsa/save_pages_from_db_to_file.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import json

import pandas as pd
import pymongo
import typer
Expand All @@ -11,6 +13,19 @@
DB_URI = "mongodb://localhost:27017/"


def fetch_documents(collection, batch_size=1000):
    """Lazily yield every document in *collection* with ``_id`` as a string.

    Documents are streamed from the cursor one at a time instead of being
    materialized in a list, so memory use does not grow with the size of
    the collection.

    Args:
        collection: Object whose ``find()`` returns a cursor that supports
            ``batch_size()``, iteration and ``close()`` (e.g. a PyMongo
            collection).
        batch_size: Value forwarded to the cursor's ``batch_size()``.

    Yields:
        Each document from the cursor, modified in place so that its
        ``"_id"`` value is ``str(original_id)``.
    """
    cursor = collection.find().batch_size(batch_size)
    try:
        for doc in cursor:
            doc["_id"] = str(doc["_id"])  # Convert ObjectId to string
            yield doc
    finally:
        # Runs on exhaustion, on an exception, and when the generator is
        # closed early, so the cursor is released deterministically rather
        # than whenever it happens to be garbage-collected.
        cursor.close()


def write_to_jsonl_in_chunks(file_path, collection, batch_size=1000):
    """Stream *collection* to *file_path* as JSON Lines (one document per line).

    The file is opened in ``"w"`` mode, so any existing content is replaced.
    Documents are pulled lazily from ``fetch_documents`` and written one at
    a time, wrapped in ``tqdm`` for progress reporting.

    Args:
        file_path: Destination path, passed to ``open()``.
        collection: Collection forwarded to ``fetch_documents``.
        batch_size: Cursor batch size forwarded to ``fetch_documents``.
    """
    # json.dumps escapes non-ASCII by default, so the output is plain ASCII;
    # the explicit encoding just removes any dependence on the platform
    # default.
    with open(file_path, "w", encoding="utf-8") as file:
        for doc in tqdm(fetch_documents(collection, batch_size)):
            # default=str is a deliberate fallback: values json cannot encode
            # natively (e.g. datetime, nested ObjectId) are written as their
            # str() form instead of raising TypeError and aborting the export
            # partway through. Documents that already serialized are
            # unaffected.
            file.write(json.dumps(doc, default=str) + "\n")


def main(
db_uri: str = typer.Option(DB_URI),
) -> None:
Expand All @@ -20,15 +35,14 @@ def main(
errors_col = db["document_pages_errors"]

NSA_DATA_PATH.mkdir(parents=True, exist_ok=True)
output_path = NSA_DATA_PATH / "pages.json"
data = pd.DataFrame(tqdm(docs_col.find()))
data["_id"] = data["_id"].astype(str)
data.to_json(output_path, orient="records", indent=4)

output_path = NSA_DATA_PATH / "errors.json"
data = pd.DataFrame(tqdm(errors_col.find()))
data["_id"] = data["_id"].astype(str)
data.to_json(output_path, orient="records", indent=4)

# Save document pages in JSONL format
docs_output_path = NSA_DATA_PATH / "pages.jsonl"
write_to_jsonl_in_chunks(docs_output_path, docs_col)

# Save document errors in JSONL format
errors_output_path = NSA_DATA_PATH / "errors.jsonl"
write_to_jsonl_in_chunks(errors_output_path, errors_col)


# Script entry point: hand `main` to Typer to run as the command-line interface.
# NOTE(review): this call sits at module top level with no
# `if __name__ == "__main__":` guard visible here, so importing this module
# would also run it — confirm that is intended.
typer.run(main)

0 comments on commit ddea441

Please sign in to comment.