Skip to content

Commit

Permalink
Unbreak image datasets in model prediction component (#3469)
Browse files Browse the repository at this point in the history
* Intermediate state.

* Unit tests for image datasets.

* Build image datasets without using df append.

* Undo changes to download_dependencies.py.

* Cosmetics.

* More cosmetics.

* Even more cosmetics.
  • Loading branch information
rdondera-microsoft authored Oct 9, 2024
1 parent eed062d commit 597d616
Show file tree
Hide file tree
Showing 3 changed files with 217 additions and 11 deletions.
34 changes: 23 additions & 11 deletions assets/training/model_evaluation/src/image_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,15 +193,20 @@ def get_classification_dataset(
# labels: {test_dataset_wrapper.num_classes}"
)

df = pd.DataFrame(columns=input_column_names + [label_column_name])
# Initialize the rows of the output dataframe to the empty list.
frame_rows = []

for index in range(len(test_dataset_wrapper)):
image_path = test_dataset_wrapper.get_image_full_path(index)
if is_valid_image(image_path):
                # Send image paths instead of base64-encoded strings, since the OSS flavor doesn't accept bytes as input.
df = df.append({
frame_rows.append({
input_column_names[0]: image_path,
label_column_name: test_dataset_wrapper.label_at_index(index)
}, ignore_index=True)
})

# Make the output dataframe.
df = pd.DataFrame(data=frame_rows, columns=input_column_names + [label_column_name])

return df

Expand Down Expand Up @@ -253,7 +258,9 @@ def get_object_detection_dataset(
f"# test images: {len(test_dataset)}, # labels: {test_dataset.num_classes}"
)
test_dataset_wrapper = RuntimeDetectionDatasetAdapter(test_dataset)
df = pd.DataFrame(columns=input_column_names + [label_column_name])

# Initialize the rows of the output dataframe to the empty list.
frame_rows = []

counter = 0
for index in range(len(test_dataset_wrapper)):
Expand All @@ -262,12 +269,15 @@ def get_object_detection_dataset(

if is_valid_image(image_path):
counter += 1
df = df.append({
frame_rows.append({
input_column_names[0]: base64.encodebytes(read_image(image_path)).decode("utf-8"),
input_column_names[1]: image_meta_info,
input_column_names[2]: ". ".join(test_dataset.classes),
label_column_name: label,
}, ignore_index=True)
})

# Make the output dataframe.
df = pd.DataFrame(data=frame_rows, columns=input_column_names + [label_column_name])

logger.info(f"Total number of valid images: {counter}")
return df
Expand Down Expand Up @@ -300,8 +310,8 @@ def get_generation_dataset(
mltable = load(mltable_path)
mltable_dataframe = mltable.to_pandas_dataframe()

# Initialize the output dataframe with the input and label columns.
df = pd.DataFrame(columns=input_column_names + [label_column_name])
# Initialize the rows of the output dataframe to the empty list.
frame_rows = []

# Go through all (image_url, captions) pairs and make a (prompt, image_url) from each pair. The model will generate
# a synthetic image from the prompt and the set of synthetic images will be compared with the set of original ones.
Expand All @@ -310,16 +320,18 @@ def get_generation_dataset(
):
# Go through all captions (split according to special separator).
for caption in captions.split(GenerationLiterals.CAPTION_SEPARATOR):
df = df.append(
frame_rows.append(
{
# The model input is a text prompt.
input_column_names[0]: caption,
# The original image is passed through via the label column.
label_column_name: image_url,
},
ignore_index=True
}
)

# Make the output dataframe.
df = pd.DataFrame(data=frame_rows, columns=input_column_names + [label_column_name])

return df


Expand Down
4 changes: 4 additions & 0 deletions assets/training/model_evaluation/tests/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Init file."""
190 changes: 190 additions & 0 deletions assets/training/model_evaluation/tests/test_image_dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,190 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

"""Test image dataset implementations."""

import json
import os
import pytest
import sys
import tempfile

from unittest.mock import patch

from azureml.acft.common_components.image.runtime_common.common.dataset_helper import AmlDatasetHelper

MODEL_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.dirname(__file__)), "./src"))
sys.path.append(MODEL_DIR)
from constants import TASK # noqa: E402
from image_dataset import get_image_dataset # noqa: E402


DATASET_PER_TASK = {
TASK.IMAGE_CLASSIFICATION: [
{"image_url": "AmlDatastore://images/a/image1.jpg", "label": 0},
{"image_url": "AmlDatastore://images/a/image2.jpg", "label": 1},
],
TASK.IMAGE_OBJECT_DETECTION: [
{
"image_url": "AmlDatastore://images/b/image1.png",
"label": [{"label": 0, "topX": 0.0, "topY": 0.0, "bottomX": 0.5, "bottomY": 0.5}],
},
{
"image_url": "AmlDatastore://images/b/image2.png",
"label": [{"label": 1, "topX": 0.5, "topY": 0.5, "bottomX": 1.0, "bottomY": 1.0}],
},
],
TASK.IMAGE_GENERATION: [
{"image_url": "example.com/image1.png", "label": "an example"},
{"image_url": "example.com/image2.png", "label": "another example"},
],
}
MLTABLE_CONTENTS_PER_TASK = {
TASK.IMAGE_CLASSIFICATION: (
"paths:\n"
" - file: {file_name}\n"
"transformations:\n"
" - read_json_lines:\n"
" encoding: utf8\n"
" invalid_lines: error\n"
" include_path_column: false\n"
" - convert_column_types:\n"
" - columns: image_url\n"
" column_type: stream_info\n"
"type: mltable\n"
),
TASK.IMAGE_OBJECT_DETECTION: (
"paths:\n"
" - file: {file_name}\n"
"transformations:\n"
" - read_json_lines:\n"
" encoding: utf8\n"
" invalid_lines: error\n"
" include_path_column: false\n"
" - convert_column_types:\n"
" - columns: image_url\n"
" column_type: stream_info\n"
"type: mltable\n"
),
TASK.IMAGE_GENERATION: (
"paths:\n"
"- file: {file_name}\n"
"transformations:\n"
"- read_json_lines:\n"
" encoding: utf8\n"
" include_path_column: false\n"
" invalid_lines: error\n"
" partition_size: 20971520\n"
" path_column: Path\n"
"- convert_column_types:\n"
" - column_type: stream_info\n"
" columns: image_url\n"
"type: mltable\n"
),
}


class MockWorkspace:
    """Minimal stand-in for an AzureML workspace, exposing only identity fields."""

    def __init__(self, subscription_id, resource_group, workspace_name, location, workspace_id):
        """Store the identifying fields the tested code reads off a workspace."""
        self.subscription_id = subscription_id
        self.resource_group = resource_group
        self.location = location
        # The workspace name is exposed under both the private and the public attribute.
        self._workspace_name = workspace_name
        self.name = workspace_name
        # Internal id attribute, mirroring the attribute name used by the real class.
        self._workspace_id_internal = workspace_id


class MockExperiment:
    """Minimal stand-in for an AzureML experiment: just a workspace and an id."""

    def __init__(self, workspace, id):
        """Record the owning workspace and the experiment id."""
        self.workspace, self.id = workspace, id


class MockRun:
    """Minimal stand-in for an AzureML run: carries only the run id."""

    def __init__(self, id):
        """Record the run id."""
        self.id = id


class MockRunContext:
    """Minimal stand-in for the object returned by Run.get_context()."""

    def __init__(self, experiment, run_id, parent_run_id):
        """Record the experiment and run ids; wrap the parent id in a run object."""
        self.experiment = experiment
        # The run id is exposed under both the private and the public attribute.
        self._run_id = run_id
        self.id = run_id
        # The parent is itself a run-like object holding only its id.
        self.parent = MockRun(parent_run_id)


def get_mock_run_context():
    """Make a mock run context with fixed fake identifiers.

    Returns:
        MockRunContext wired to a MockExperiment inside a MockWorkspace, suitable
        for patching azureml.core.Run.get_context in tests.
    """
    TEST_EXPERIMENT_ID = "22222222-2222-2222-2222-222222222222"
    TEST_REGION = "eastus"
    TEST_PARENT_RUN_ID = "bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb"
    TEST_RESOURCE_GROUP = "testrg"
    TEST_RUN_ID = "aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa"
    TEST_SUBSCRIPTION_ID = "00000000-0000-0000-0000-000000000000"
    # Fixed: the previous value was missing one 4-digit group and was not a valid
    # 8-4-4-4-12 UUID, inconsistent with the other fake ids above.
    TEST_WORKSPACE_ID = "11111111-1111-1111-1111-111111111111"
    TEST_WORKSPACE_NAME = "testws"

    ws = MockWorkspace(
        subscription_id=TEST_SUBSCRIPTION_ID,
        resource_group=TEST_RESOURCE_GROUP,
        workspace_name=TEST_WORKSPACE_NAME,
        location=TEST_REGION,
        workspace_id=TEST_WORKSPACE_ID,
    )
    experiment = MockExperiment(workspace=ws, id=TEST_EXPERIMENT_ID)
    return MockRunContext(experiment, run_id=TEST_RUN_ID, parent_run_id=TEST_PARENT_RUN_ID)


@pytest.mark.parametrize("task_type,input_column_names,label_column_name", [
    (TASK.IMAGE_CLASSIFICATION, ["image_url"], "label"),
    (TASK.IMAGE_OBJECT_DETECTION, ["image_url"], "label"),
    (TASK.IMAGE_GENERATION, ["prompt"], "label"),
])
def test_image_dataset(task_type, input_column_names, label_column_name):
    """Run get_image_dataset end to end on a tiny two-row dataset for each task type.

    Builds a jsonl dataset and an MLTable file in a temporary directory, patches out
    the run context and the image download so no workspace or network is needed, and
    loads the dataset through get_image_dataset.
    """
    with tempfile.TemporaryDirectory() as directory_name:
        # Save the jsonl file.
        dataset = DATASET_PER_TASK[task_type]
        with open(os.path.join(directory_name, "dataset.jsonl"), "wt") as f:
            for r in dataset:
                f.write(json.dumps(r) + "\n")

        # Save the MLTable file (points at the jsonl file written above).
        mltable_str = MLTABLE_CONTENTS_PER_TASK[task_type].format(file_name="dataset.jsonl")
        with open(os.path.join(directory_name, "MLTable"), "wt") as f:
            f.write(mltable_str)

        # Make blank image files for image classification and object detection tasks, to simulate downloading.
        if task_type in [TASK.IMAGE_CLASSIFICATION, TASK.IMAGE_OBJECT_DETECTION]:
            for r in dataset:
                # Strip the datastore scheme to obtain the path relative to the temp directory.
                image_file_name_tokens = r["image_url"].replace("AmlDatastore://", "").split("/")
                os.makedirs(os.path.join(directory_name, *image_file_name_tokens[:-1]), exist_ok=True)
                open(os.path.join(directory_name, *image_file_name_tokens), "wb").close()

        # Load the MLTable with the run context, the image download, and the data
        # directory all patched so the loader stays inside the temp directory.
        with patch("azureml.core.Run.get_context", get_mock_run_context), \
            patch(
                "azureml.acft.common_components.image.runtime_common.common.utils.download_or_mount_image_files"
            ), \
            patch.object(AmlDatasetHelper, "get_data_dir", return_value=directory_name):
            df = get_image_dataset(task_type, directory_name, input_column_names, label_column_name)

        # Compare the loaded dataset with the original.
        # NOTE(review): only the generation task asserts dataframe contents; the
        # classification and detection tasks currently only check that loading does
        # not raise — consider adding content checks for them too.
        if task_type == TASK.IMAGE_GENERATION:
            loaded_dataset = [{k: row[k] for k in ["prompt", "label"]} for _, row in df.iterrows()]
            # Each caption becomes the model prompt; the original image URL is passed
            # through in the label column (see get_generation_dataset).
            for r1, r2 in zip(
                sorted(dataset, key=lambda x: x["label"]), sorted(loaded_dataset, key=lambda x: x["prompt"])
            ):
                assert r2 == {"prompt": r1["label"], "label": r1["image_url"]}

0 comments on commit 597d616

Please sign in to comment.