data folders should be together
Signed-off-by: Dushyant Behl <[email protected]>
dushyantbehl committed Nov 28, 2024
1 parent 70252af commit 82b548f
Showing 44 changed files with 16 additions and 17 deletions.
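The diffs below move every import off tests.testdata and tests.predefined_data_configs and onto tests.artifacts.testdata and tests.artifacts.predefined_data_configs, so all test fixtures live under a single tests/artifacts/ package. As a rough illustration (not part of this commit view), the relocated tests/artifacts/testdata/__init__.py could export path constants along these lines; only the constant names are confirmed by the imports in the hunks below, while the concrete file and model names here are placeholders:

# Hypothetical sketch of tests/artifacts/testdata/__init__.py after the move.
# Only the constant names are confirmed by the imports in the diffs below;
# the file and model names used here are placeholders.
import os

DATA_DIR = os.path.dirname(__file__)

MODEL_NAME = "Maykeye/TinyLLama-v0"  # placeholder tiny model id
EMPTY_DATA = os.path.join(DATA_DIR, "empty_data.json")  # placeholder
MALFORMATTED_DATA = os.path.join(DATA_DIR, "malformatted_data.json")  # placeholder
TWITTER_COMPLAINTS_DATA_JSONL = os.path.join(
    DATA_DIR, "twitter_complaints_small.jsonl"  # placeholder
)
TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON = os.path.join(
    DATA_DIR, "twitter_complaints_input_output.json"  # placeholder
)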
6 changes: 3 additions & 3 deletions tests/acceleration/test_acceleration_framework.py
@@ -54,13 +54,13 @@
 from tuning.utils.import_utils import is_fms_accelerate_available
 
 # for some reason the CI will raise an import error if we try to import
-# these from tests.testdata
+# these from tests.artifacts.testdata
 TWITTER_COMPLAINTS_JSON_FORMAT = os.path.join(
-    os.path.dirname(__file__), "../data/twitter_complaints_json.json"
+    os.path.dirname(__file__), "../artifacts/testdata/twitter_complaints_json.json"
 )
 TWITTER_COMPLAINTS_TOKENIZED = os.path.join(
     os.path.dirname(__file__),
-    "../data/twitter_complaints_tokenized_with_maykeye_tinyllama_v0.json",
+    "../artifacts/testdata/twitter_complaints_tokenized_with_maykeye_tinyllama_v0.json",
 )
 
 # pylint: disable=import-error
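The acceleration tests keep building these fixture paths relative to the test file (per the comment about CI import errors) rather than importing them from the relocated package. A minimal sketch, assuming the checking module lives in tests/acceleration/ and the fixture really sits at the new ../artifacts/testdata/ location, of how that relative path can be verified:

import os

# Illustrative only: resolve the relocated fixture the same way the test
# module above does -- relative to the test file -- and confirm the move
# left it in place. Assumes this file lives in tests/acceleration/.
TWITTER_COMPLAINTS_JSON_FORMAT = os.path.join(
    os.path.dirname(__file__), "../artifacts/testdata/twitter_complaints_json.json"
)

def test_relocated_fixture_exists():
    # Normalize the "../" component before checking the file is present.
    assert os.path.isfile(os.path.normpath(TWITTER_COMPLAINTS_JSON_FORMAT))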
@@ -25,6 +25,6 @@
 PRETOKENIZE_JSON_DATA_YAML = os.path.join(
     PREDEFINED_DATA_CONFIGS, "pretokenized_json_data.yaml"
 )
-TOKENIZE_AND_INSTRUCTION_MASKING_YAML = os.path.join(
-    PREDEFINED_DATA_CONFIGS, "tokenize_and_instruction_masking.yaml"
+TOKENIZE_AND_INPUT_MASKING_YAML = os.path.join(
+    PREDEFINED_DATA_CONFIGS, "tokenize_and_input_masking.yaml"
 )
File renamed without changes.
File renamed without changes.
File renamed without changes.
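The renamed constant points at the renamed YAML data config. A small sketch, assuming PREDEFINED_DATA_CONFIGS resolves to the directory holding the YAML files (as in the hunk above) and that the renamed file exists there, of loading it with PyYAML, which the data preprocessing tests below already import:

import os
import yaml  # PyYAML, already used in tests/data/test_data_preprocessing_utils.py

# Placeholder: in the real package this is defined once and imported.
PREDEFINED_DATA_CONFIGS = os.path.dirname(__file__)
TOKENIZE_AND_INPUT_MASKING_YAML = os.path.join(
    PREDEFINED_DATA_CONFIGS, "tokenize_and_input_masking.yaml"
)

with open(TOKENIZE_AND_INPUT_MASKING_YAML, encoding="utf-8") as f:
    data_config = yaml.safe_load(f)  # parsed dict describing the preprocessing steps
print(data_config)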
2 changes: 1 addition & 1 deletion tests/build/test_launch_script.py
@@ -26,7 +26,7 @@
 # First Party
 from build.accelerate_launch import main
 from build.utils import serialize_args, get_highest_checkpoint
-from tests.testdata import TWITTER_COMPLAINTS_DATA_JSONL
+from tests.artifacts.testdata import TWITTER_COMPLAINTS_DATA_JSONL
 from tuning.utils.error_logging import (
     USER_ERROR_EXIT_CODE,
     INTERNAL_ERROR_EXIT_CODE,
2 changes: 1 addition & 1 deletion tests/data/test_data_handlers.py
@@ -21,7 +21,7 @@
 import pytest
 
 # First Party
-from tests.testdata import MODEL_NAME, TWITTER_COMPLAINTS_DATA_JSONL
+from tests.artifacts.testdata import MODEL_NAME, TWITTER_COMPLAINTS_DATA_JSONL
 
 # Local
 from tuning.data.data_handlers import apply_custom_data_formatting_template
10 changes: 5 additions & 5 deletions tests/data/test_data_preprocessing_utils.py
@@ -25,12 +25,12 @@
 import yaml
 
 # First Party
-from tests.predefined_data_configs import (
+from tests.artifacts.predefined_data_configs import (
     APPLY_CUSTOM_TEMPLATE_YAML,
     PRETOKENIZE_JSON_DATA_YAML,
-    TOKENIZE_AND_INSTRUCTION_MASKING_YAML,
+    TOKENIZE_AND_INPUT_MASKING_YAML,
 )
-from tests.testdata import (
+from tests.artifacts.testdata import (
     MODEL_NAME,
     TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON,
     TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
@@ -408,11 +408,11 @@ def test_validate_args_pretokenized(data_args, packing):
         (PRETOKENIZE_JSON_DATA_YAML, TWITTER_COMPLAINTS_TOKENIZED_JSON),
         (PRETOKENIZE_JSON_DATA_YAML, TWITTER_COMPLAINTS_TOKENIZED_JSONL),
         (
-            TOKENIZE_AND_INSTRUCTION_MASKING_YAML,
+            TOKENIZE_AND_INPUT_MASKING_YAML,
             TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON,
         ),
         (
-            TOKENIZE_AND_INSTRUCTION_MASKING_YAML,
+            TOKENIZE_AND_INPUT_MASKING_YAML,
             TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL,
         ),
     ],
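The parametrized cases above pair each data config YAML with a dataset file in the matching format. A sketch of that pairing pattern, with placeholder paths and hypothetical parameter names standing in for the constants imported from tests.artifacts.*:

import pytest

# Placeholders for the constants imported from tests.artifacts.* above.
TOKENIZE_AND_INPUT_MASKING_YAML = "tests/artifacts/predefined_data_configs/tokenize_and_input_masking.yaml"
TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON = "tests/artifacts/testdata/twitter_complaints_input_output.json"
TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL = "tests/artifacts/testdata/twitter_complaints_input_output.jsonl"

@pytest.mark.parametrize(
    "data_config_path,data_path",  # hypothetical parameter names
    [
        (TOKENIZE_AND_INPUT_MASKING_YAML, TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSON),
        (TOKENIZE_AND_INPUT_MASKING_YAML, TWITTER_COMPLAINTS_DATA_INPUT_OUTPUT_JSONL),
    ],
)
def test_config_and_dataset_pairing(data_config_path, data_path):
    # Placeholder body; the real test feeds the pair through the preprocessing code.
    assert data_config_path.endswith(".yaml")
    assert data_path.endswith((".json", ".jsonl"))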
2 changes: 1 addition & 1 deletion tests/test_sft_trainer.py
@@ -31,7 +31,7 @@
 # First Party
 from build.utils import serialize_args
 from scripts.run_inference import TunedCausalLM
-from tests.testdata import (
+from tests.artifacts.testdata import (
     EMPTY_DATA,
     MALFORMATTED_DATA,
     MODEL_NAME,
2 changes: 1 addition & 1 deletion tests/trainercontroller/test_tuning_trainercontroller.py
@@ -30,7 +30,7 @@
 from tests.trainercontroller.custom_operation_invalid_action import (
     CustomOperationInvalidAction,
 )
-import tests.testdata.trainercontroller as td
+import tests.artifacts.testdata.trainercontroller as td
 
 # Local
 import tuning.config.configs as config
2 changes: 1 addition & 1 deletion tests/utils/test_tokenizer_data_utils.py
@@ -3,7 +3,7 @@
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 # First Party
-from tests.testdata import MODEL_NAME
+from tests.artifacts.testdata import MODEL_NAME
 
 # Local
 # First party
3 changes: 1 addition & 2 deletions tuning/config/configs.py
@@ -99,8 +99,7 @@ class DataArguments:
         default=None,
         metadata={
             "help": "data config file which specifies the data preprocessing logic to apply.\
-            Supports both JSON and YAML based config files.\
-            for examples see examples/predefined_data_configs"
+            Supports both JSON and YAML based config files."
         },
     )
 
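The shortened help text belongs to a field on the DataArguments dataclass (the class name is visible in the hunk header). A hedged sketch of how such a field is typically declared with HF-style dataclass metadata; the field name data_config_path is a guess, since it is not shown in the hunk:

from dataclasses import dataclass, field
from typing import Optional

@dataclass
class DataArguments:
    # Hypothetical field name; only the default and metadata lines are visible
    # in the hunk above, and the rest of the class is omitted here.
    data_config_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "data config file which specifies the data preprocessing logic to apply. "
            "Supports both JSON and YAML based config files."
        },
    )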
