From 58e357f2ef8e5982c7408ab2ace822b923de03e3 Mon Sep 17 00:00:00 2001 From: panxuchen Date: Mon, 18 Nov 2024 17:50:42 +0800 Subject: [PATCH] fix output bug --- .../ops/deduplicator/ray_redis_minhash_deduplicator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py b/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py index 72c250af1..203fcf059 100644 --- a/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py @@ -302,8 +302,8 @@ def filter_with_union_find(table: pa.Table) -> pa.Table: batch_format='pyarrow').groupby( HashKeys.minhash).aggregate( UnionFn(union_find)).materialize() - result = dataset_with_id.map_batches(filter_with_union_find, - batch_format='pyarrow') + result = dataset_with_id.map_batches( + filter_with_union_find, batch_format='pyarrow').materialize() logger.info(f'Keep {result.count()} samples after MinHash dedup.') union_find.clean() return result