Skip to content

Commit

Permalink
update dataset_path parsing in config
Browse files Browse the repository at this point in the history
  • Loading branch information
cyruszhang committed Dec 10, 2024
1 parent cb5b80a commit daf7a85
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 6 deletions.
13 changes: 8 additions & 5 deletions data_juicer/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,19 +415,22 @@ def init_setup_from_cfg(cfg: Namespace):

# check and get dataset dir
if cfg.get('dataset_path', None) and os.path.exists(cfg.dataset_path):
logger.warning('dataset_path config is set and a valid local path')
cfg.dataset_path = os.path.abspath(cfg.dataset_path)
if os.path.isdir(cfg.dataset_path):
cfg.dataset_dir = cfg.dataset_path
else:
cfg.dataset_dir = os.path.dirname(cfg.dataset_path)
elif cfg.dataset_path == '':
logger.warning('dataset_path is empty by default.')
elif cfg.dataset_path == '' and cfg.get('dataset', None):
logger.warning('dataset_path config is empty; dataset is present')
cfg.dataset_dir = ''
else:
logger.warning(f'dataset_path [{cfg.dataset_path}] is not a valid '
f'local path. Please check and retry, otherwise we '
f'will treat it as a remote dataset or a mixture of '
f'several datasets.')
f'local path, AND dataset is not present. '
f'Please check and retry, otherwise we '
f'will treat dataset_path as a remote dataset or a '
f'mixture of several datasets.')

cfg.dataset_dir = ''

# check number of processes np
Expand Down
8 changes: 8 additions & 0 deletions tests/core/data/test_config_list.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
project_name: 'dataset-ondisk-list'
dataset:
  - type: 'ondisk'
    path:
      - 'sample.json'
  - type: 'ondisk'
    path:
      - 'sample.txt'
15 changes: 14 additions & 1 deletion tests/core/test_dataset_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,20 @@ def test_dataset_builder_ondisk_config(self):
cfg = init_configs(args=f'--config {test_config_file}'.split())
self.assertIsInstance(cfg, Namespace)
self.assertEqual(cfg.project_name, 'dataset-ondisk-json')
self.assertEqual(cfg.dataset, {'path': ['sample.json'], 'type': 'ondisk'})
self.assertEqual(cfg.dataset,
{'path': ['sample.json'], 'type': 'ondisk'})
self.assertEqual(not cfg.dataset_path, True)

def test_dataset_builder_ondisk_config_list(self):
    """Check that a config whose ``dataset`` key is a list is parsed.

    Loads ``./data/test_config_list.yaml`` through ``init_configs`` and
    verifies the project name, the two-entry ``dataset`` list, and that
    ``dataset_path`` is left falsy.
    """
    # NOTE: the config path is relative, so it is resolved against the
    # working directory the test run is started from.
    test_config_file = './data/test_config_list.yaml'
    out = StringIO()
    # Capture whatever is written to stdout while the config is loaded.
    with redirect_stdout(out):
        cfg = init_configs(args=f'--config {test_config_file}'.split())
        self.assertIsInstance(cfg, Namespace)
        self.assertEqual(cfg.project_name, 'dataset-ondisk-list')
        self.assertEqual(
            cfg.dataset,
            [
                {'path': ['sample.json'], 'type': 'ondisk'},
                {'path': ['sample.txt'], 'type': 'ondisk'},
            ],
        )
        # Same pass/fail condition as ``assertEqual(not x, True)``, but
        # the failure message reports the offending value directly.
        self.assertFalse(cfg.dataset_path)


Expand Down

0 comments on commit daf7a85

Please sign in to comment.