From 447376d888dd4f83edeea98f8f761e2b0f231d34 Mon Sep 17 00:00:00 2001 From: Albert Sawczyn Date: Wed, 18 Sep 2024 11:32:13 +0200 Subject: [PATCH] feat: reduce memory usage --- scripts/nsa/save_pages_from_db_to_file.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/nsa/save_pages_from_db_to_file.py b/scripts/nsa/save_pages_from_db_to_file.py index 7d049ff..1c5de09 100644 --- a/scripts/nsa/save_pages_from_db_to_file.py +++ b/scripts/nsa/save_pages_from_db_to_file.py @@ -18,7 +18,7 @@ def fetch_documents(collection, batch_size=5000): yield doc -def write_to_parquet_in_chunks(file_path, collection, batch_size=5000, chunk_size=100000): +def write_to_parquet_in_chunks(file_path, collection, batch_size=5000, chunk_size=50000): file_path.mkdir(parents=True, exist_ok=True) buffer = [] chunk_index = 0 @@ -29,6 +29,7 @@ def write_to_parquet_in_chunks(file_path, collection, batch_size=5000, chunk_siz df = pd.DataFrame(buffer) chunk_file = file_path.parent / f"{file_path.stem}_chunk_{chunk_index}.parquet" df.to_parquet(chunk_file, engine="pyarrow", compression="snappy") + del df buffer = [] chunk_index += 1