Skip to content

Commit

Permalink
feat: reduce memory usage
Browse files Browse the repository at this point in the history
  • Loading branch information
asawczyn committed Sep 18, 2024
1 parent 47073e0 commit 447376d
Showing 1 changed file with 2 additions and 1 deletion.
3 changes: 2 additions & 1 deletion scripts/nsa/save_pages_from_db_to_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def fetch_documents(collection, batch_size=5000):
yield doc


def write_to_parquet_in_chunks(file_path, collection, batch_size=5000, chunk_size=100000):
def write_to_parquet_in_chunks(file_path, collection, batch_size=5000, chunk_size=50000):
file_path.mkdir(parents=True, exist_ok=True)
buffer = []
chunk_index = 0
Expand All @@ -29,6 +29,7 @@ def write_to_parquet_in_chunks(file_path, collection, batch_size=5000, chunk_siz
df = pd.DataFrame(buffer)
chunk_file = file_path.parent / f"{file_path.stem}_chunk_{chunk_index}.parquet"
df.to_parquet(chunk_file, engine="pyarrow", compression="snappy")
del df
buffer = []
chunk_index += 1

Expand Down

0 comments on commit 447376d

Please sign in to comment.