Skip to content

Commit

Permalink
fix comments
Browse files Browse the repository at this point in the history
  • Loading branch information
pan-x-c committed Nov 20, 2024
1 parent 58e357f commit e1b76f5
Show file tree
Hide file tree
Showing 8 changed files with 21 additions and 38 deletions.
11 changes: 4 additions & 7 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -632,19 +632,16 @@ process:
- video_deduplicator: # deduplicator to deduplicate samples at document-level using exact matching of videos between documents.
consider_text: false # whether to consider text hash together with video hash when applying deduplication.
- ray_video_deduplicator: # a simple video deduplicator that can run on multiple nodes, using exact matching of md5 hashes
redis_host: 'redis_host' # the host of the redis instance
redis_port: 6380 # the port of the redis instance. Note that the default redis port (6379) is the same as the default port for ray, so the redis config must be changed to use a different port
redis_address: 'redis://localhost:6379' # the address of the redis instance
- ray_image_deduplicator: # the simple image deduplicator that can deduplicate samples at document-level using exact matching of images between documents.
redis_host: 'redis_host' # the host of the redis instance
redis_port: 6380 # the port of the redis instance. Note that the default redis port (6379) is the same as the default port for ray, so the redis config must be changed to use a different port
redis_address: 'redis://localhost:6379' # the address of the redis instance
method: phash # hash method for image. One of [phash, dhash, whash, ahash]
- ray_document_deduplicator: # a simple document deduplicator that can run on multiple nodes, using exact matching of md5 hashes
redis_host: 'redis_host' # the host of the redis instance
redis_port: 6380 # the port of the redis instance. Note that the default redis port (6379) is the same as the default port for ray, so the redis config must be changed to use a different port
redis_address: 'redis://localhost:6379' # the address of the redis instance
lowercase: false # whether to convert text to lower case
ignore_non_character: false # whether to ignore non-alphabet characters, including whitespace, digits, and punctuation
- ray_redis_minhash_deduplicator: # a document deduplicator that can run on multiple nodes using the MinHashLSH algorithm
redis_address: 'redis://localhost:6379' # the address of the redis instance
redis_address: 'redis://localhost:6379' # the address of the redis instance
tokenization: space # tokenization method for text. One of [space, punctuation, character, sentencepiece]
window_size: 5 # window size of shingling
num_permutations: 256 # number of permutations in minhash computing
Expand Down
13 changes: 5 additions & 8 deletions data_juicer/ops/deduplicator/ray_basic_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,20 @@ class RayBasicDeduplicator(Filter):
EMPTY_HASH_VALUE = 'EMPTY'

def __init__(self,
redis_host: str = 'localhost',
redis_port: PositiveInt = 6380,
redis_address: str = 'redis://localhost:6379',
*args,
**kwargs):
"""
Initialization.
:param redis_host: the hostname of redis server
:param redis_port: the port of redis server
:param redis_address: the address of redis server
:param args: extra args
:param kwargs: extra args
"""
super().__init__(*args, **kwargs)
self.redis_host = redis_host
self.redis_port = redis_port
self.redis_address = redis_address
# TODO: add a barrier to ensure that flushdb is performed before
# the operator is called
r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=0)
r = redis.from_url(url=redis_address)
r.flushdb(0)

def calculate_hash(self, sample, context=False):
Expand All @@ -44,7 +41,7 @@ def calculate_hash(self, sample, context=False):

def compute_stats_single(self, sample, context=False):
# init redis client
r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=0)
r = redis.from_url(url=self.redis_address)
# compute hash
md5_value = self.calculate_hash(sample, context)
# check existing
Expand Down
9 changes: 3 additions & 6 deletions data_juicer/ops/deduplicator/ray_document_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,24 +17,21 @@ class RayDocumentDeduplicator(RayBasicDeduplicator):
"""

def __init__(self,
redis_host: str = 'localhost',
redis_port: PositiveInt = 6380,
redis_address: str = 'redis://localhost:6379',
lowercase: bool = False,
ignore_non_character: bool = False,
*args,
**kwargs):
"""
Initialization method.
:param redis_host: the hostname of redis server
:param redis_port: the port of redis server
:param redis_address: the address of redis server
:param lowercase: Whether to convert sample text to lower case
:param ignore_non_character: Whether to ignore non-alphabet
characters, including whitespaces, digits, and punctuations
:param args: extra args
:param kwargs: extra args.
"""
super().__init__(redis_host=redis_host,
redis_port=redis_port,
super().__init__(redis_address=redis_address,
*args,
**kwargs)
self.lowercase = lowercase
Expand Down
9 changes: 3 additions & 6 deletions data_juicer/ops/deduplicator/ray_image_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,17 @@ class RayImageDeduplicator(RayBasicDeduplicator):
"""

def __init__(self,
redis_host: str = 'localhost',
redis_port: PositiveInt = 6380,
redis_address: str = 'redis://localhost:6379',
method: str = 'phash',
*args,
**kwargs):
"""
Initialization.
:param redis_host: the hostname of redis server
:param redis_port: the port of redis server
:param redis_address: the address of redis server
:param args: extra args
:param kwargs: extra args
"""
super().__init__(redis_host=redis_host,
redis_port=redis_port,
super().__init__(redis_address=redis_address,
*args,
**kwargs)
if method not in HASH_METHOD:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,7 @@ def clean(self):
@OPERATORS.register_module(OP_NAME)
class RayRedisMinhashDeduplicator(Deduplicator):
"""
A basic exact matching deduplicator for RAY.
Although its functionality is deduplication,
it is implemented as Filter sub-class.
A MinhashLSH deduplicator based on RAY and Redis.
"""

def __init__(
Expand Down
9 changes: 3 additions & 6 deletions data_juicer/ops/deduplicator/ray_video_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,19 +21,16 @@ class RayVideoDeduplicator(RayBasicDeduplicator):
"""

def __init__(self,
redis_host: str = 'localhost',
redis_port: PositiveInt = 6380,
redis_address: str = 'redis://localhost:6379',
*args,
**kwargs):
"""
Initialization.
:param redis_host: the hostname of redis server
:param redis_port: the port of redis server
:param redis_address: the address of redis server
:param args: extra args
:param kwargs: extra args
"""
super().__init__(redis_host=redis_host,
redis_port=redis_port,
super().__init__(redis_address=redis_address,
*args,
**kwargs)

Expand Down
2 changes: 1 addition & 1 deletion docs/Operators.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ The operators in Data-Juicer are categorized into 5 types.
| [ Formatter ]( #formatter ) | 9 | Discovers, loads, and canonicalizes source data |
| [ Mapper ]( #mapper ) | 58 | Edits and transforms samples |
| [ Filter ]( #filter ) | 44 | Filters out low-quality samples |
| [ Deduplicator ]( #deduplicator ) | 8 | Detects and removes duplicate samples |
| [ Deduplicator ]( #deduplicator ) | 9 | Detects and removes duplicate samples |
| [ Selector ]( #selector ) | 4 | Selects top samples based on ranking |


Expand Down
2 changes: 1 addition & 1 deletion docs/Operators_ZH.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
| [ Formatter ]( #formatter ) | 9 | 发现、加载、规范化原始数据 |
| [ Mapper ]( #mapper ) | 58 | 对数据样本进行编辑和转换 |
| [ Filter ]( #filter ) | 44 | 过滤低质量样本 |
| [ Deduplicator ]( #deduplicator ) | 8 | 识别、删除重复样本 |
| [ Deduplicator ]( #deduplicator ) | 9 | 识别、删除重复样本 |
| [ Selector ]( #selector ) | 4 | 基于排序选取高质量样本 |

下面列出所有具体算子,每种算子都通过多个标签来注明其主要功能。
Expand Down

0 comments on commit e1b76f5

Please sign in to comment.