diff --git a/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py b/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py index 72c250af1..203fcf059 100644 --- a/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_redis_minhash_deduplicator.py @@ -302,8 +302,8 @@ def filter_with_union_find(table: pa.Table) -> pa.Table: batch_format='pyarrow').groupby( HashKeys.minhash).aggregate( UnionFn(union_find)).materialize() - result = dataset_with_id.map_batches(filter_with_union_find, - batch_format='pyarrow') + result = dataset_with_id.map_batches( + filter_with_union_find, batch_format='pyarrow').materialize() logger.info(f'Keep {result.count()} samples after MinHash dedup.') union_find.clean() return result