From daf7a85c7783fa69e0ba80a75290941c2243e31b Mon Sep 17 00:00:00 2001 From: cyruszhang Date: Tue, 10 Dec 2024 11:13:24 -0800 Subject: [PATCH] update dataset_path parsing in config --- data_juicer/config/config.py | 13 ++++++++----- tests/core/data/test_config_list.yaml | 8 ++++++++ tests/core/test_dataset_builder.py | 15 ++++++++++++++- 3 files changed, 30 insertions(+), 6 deletions(-) create mode 100644 tests/core/data/test_config_list.yaml diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py index 990fd0e45..18571e34a 100644 --- a/data_juicer/config/config.py +++ b/data_juicer/config/config.py @@ -415,19 +415,22 @@ def init_setup_from_cfg(cfg: Namespace): # check and get dataset dir if cfg.get('dataset_path', None) and os.path.exists(cfg.dataset_path): + logger.warning('dataset_path config is set and a valid local path') cfg.dataset_path = os.path.abspath(cfg.dataset_path) if os.path.isdir(cfg.dataset_path): cfg.dataset_dir = cfg.dataset_path else: cfg.dataset_dir = os.path.dirname(cfg.dataset_path) - elif cfg.dataset_path == '': - logger.warning('dataset_path is empty by default.') + elif cfg.dataset_path == '' and cfg.get('dataset', None): + logger.warning('dataset_path config is empty; dataset is present') cfg.dataset_dir = '' else: logger.warning(f'dataset_path [{cfg.dataset_path}] is not a valid ' - f'local path. Please check and retry, otherwise we ' - f'will treat it as a remote dataset or a mixture of ' - f'several datasets.') + f'local path, AND dataset is not present. ' + f'Please check and retry, otherwise we ' + f'will treat dataset_path as a remote dataset or a ' + f'mixture of several datasets.') + cfg.dataset_dir = '' # check number of processes np diff --git a/tests/core/data/test_config_list.yaml b/tests/core/data/test_config_list.yaml new file mode 100644 index 000000000..61ad61162 --- /dev/null +++ b/tests/core/data/test_config_list.yaml @@ -0,0 +1,8 @@ +project_name: 'dataset-ondisk-list' +dataset: + - type: 'ondisk' + path: + - 'sample.json' + - type: 'ondisk' + path: + - 'sample.txt' \ No newline at end of file diff --git a/tests/core/test_dataset_builder.py b/tests/core/test_dataset_builder.py index 63b0b8343..32bd04e2f 100644 --- a/tests/core/test_dataset_builder.py +++ b/tests/core/test_dataset_builder.py @@ -59,7 +59,20 @@ def test_dataset_builder_ondisk_config(self): cfg = init_configs(args=f'--config {test_config_file}'.split()) self.assertIsInstance(cfg, Namespace) self.assertEqual(cfg.project_name, 'dataset-ondisk-json') - self.assertEqual(cfg.dataset, {'path': ['sample.json'], 'type': 'ondisk'}) + self.assertEqual(cfg.dataset, + {'path': ['sample.json'], 'type': 'ondisk'}) + self.assertEqual(not cfg.dataset_path, True) + + def test_dataset_builder_ondisk_config_list(self): + test_config_file = './data/test_config_list.yaml' + out = StringIO() + with redirect_stdout(out): + cfg = init_configs(args=f'--config {test_config_file}'.split()) + self.assertIsInstance(cfg, Namespace) + self.assertEqual(cfg.project_name, 'dataset-ondisk-list') + self.assertEqual(cfg.dataset,[ + {'path': ['sample.json'], 'type': 'ondisk'}, + {'path': ['sample.txt'], 'type': 'ondisk'}]) self.assertEqual(not cfg.dataset_path, True)