Skip to content

Commit

Permalink
feat: save to multiple files
Browse files Browse the repository at this point in the history
  • Loading branch information
asawczyn committed Sep 18, 2024
1 parent f92a10c commit 3ef6c1b
Showing 1 changed file with 11 additions and 5 deletions.
16 changes: 11 additions & 5 deletions scripts/nsa/save_pages_from_db_to_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,24 +11,30 @@
# MongoDB connection URI (local default instance) — presumably the DB holding
# the scraped pages this script exports; confirm against deployment config.
DB_URI = "mongodb://localhost:27017/"


def fetch_documents(collection, batch_size=5000):
    """Yield documents from *collection* one at a time.

    Fetching happens server-side in batches of *batch_size* via the cursor,
    so arbitrarily large collections can be streamed without loading them
    into memory. Each document's Mongo ``_id`` (an ObjectId) is converted to
    a plain string so the documents are serializable downstream.
    """
    for document in collection.find().batch_size(batch_size):
        # ObjectId is not serializable to parquet/JSON — stringify it.
        document["_id"] = str(document["_id"])
        yield document


def write_to_parquet_in_chunks(file_path, collection, batch_size=5000, chunk_size=50000):
    """Stream documents from *collection* into numbered parquet chunk files.

    Documents are buffered in memory and flushed every *chunk_size* records
    to ``<stem>_chunk_<i>.parquet`` in the same directory as *file_path*
    (one file per chunk, since pandas cannot append to an existing parquet
    file). Any remainder smaller than *chunk_size* is flushed at the end.

    Args:
        file_path: Path (or str) used to derive the chunk files' directory
            and filename stem; the file itself is never written.
        collection: pymongo collection to export.
        batch_size: Mongo cursor batch size passed to ``fetch_documents``.
        chunk_size: Number of documents per output parquet file.
    """
    from pathlib import Path

    file_path = Path(file_path)  # accept str transparently; Path behaves as before

    def _flush(docs, index):
        # Write one self-contained parquet file per chunk.
        chunk_file = file_path.parent / f"{file_path.stem}_chunk_{index}.parquet"
        pd.DataFrame(docs).to_parquet(chunk_file, engine="pyarrow", compression="snappy")

    buffer = []
    chunk_index = 0

    for doc in tqdm(fetch_documents(collection, batch_size)):
        buffer.append(doc)
        if len(buffer) >= chunk_size:
            _flush(buffer, chunk_index)
            buffer = []
            chunk_index += 1

    # Flush the trailing partial chunk, if any.
    if buffer:
        _flush(buffer, chunk_index)


def main(
Expand Down

0 comments on commit 3ef6c1b

Please sign in to comment.