From ea33387d95e13eab1e8ebbf14a0c76b9aa398acd Mon Sep 17 00:00:00 2001 From: "jakub.binkowski" Date: Sat, 1 Jun 2024 17:15:11 +0000 Subject: [PATCH] Make dataset dump skip the embedding column --- scripts/dataset/dump_pl_dataset.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/dataset/dump_pl_dataset.py b/scripts/dataset/dump_pl_dataset.py index 0b809b4..24b746a 100644 --- a/scripts/dataset/dump_pl_dataset.py +++ b/scripts/dataset/dump_pl_dataset.py @@ -50,7 +50,9 @@ def main( for offset in trange(start_offset, num_docs, chunk_size, desc="Chunks"): docs = list( tqdm( - collection.find(query, batch_size=batch_size).skip(offset).limit(chunk_size), + collection.find(query, {"embedding": 0}, batch_size=batch_size) + .skip(offset) + .limit(chunk_size), total=chunk_size, leave=False, desc="Documents in chunk",