[Data] Clarify schema validation error (#48882)

When a sort or groupby key names a column that is not in the dataset, the error now lists the dataset's column names instead of embedding the full schema representation.

```python
ray.data.range(1).groupby("does_not_exist").count().materialize()
```

**Before**

```
ValueError: The column 'does_not_exist' does not exist in the schema 'Column Type ------ ---- id int64'.
```

**After**

```
ValueError: You specified the column 'does_not_exist', but there's no such column in the dataset. The dataset has columns: {'id'}
```

---------

Signed-off-by: Balaji Veeramani <[email protected]>
ray-project · Nov 27, 2024 · 3bd3a02 · 3bd3a02
1 parent 3d7fc8f
commit 3bd3a02
Show file tree

Hide file tree

Showing 2 changed files with 7 additions and 13 deletions.
diff --git a/python/ray/data/_internal/planner/exchange/sort_task_spec.py b/python/ray/data/_internal/planner/exchange/sort_task_spec.py
@@ -81,8 +81,9 @@ def validate_schema(self, schema: Optional[Union[type, "pyarrow.lib.Schema"]]):
             for column in self._columns:
                 if column not in schema_names_set:
                     raise ValueError(
-                        "The column '{}' does not exist in the "
-                        "schema '{}'.".format(column, schema)
+                        f"You specified the column '{column}', but there's no such "
+                        "column in the dataset. The dataset has columns: "
+                        f"{schema_names_set}"
                     )
 
     @property

diff --git a/python/ray/data/tests/test_execution_optimizer.py b/python/ray/data/tests/test_execution_optimizer.py
@@ -1145,9 +1145,7 @@ def test_sort_validate_keys(ray_start_regular_shared):
     assert extract_values("id", ds.sort("id").take_all()) == list(range(10))
 
     invalid_col_name = "invalid_column"
-    with pytest.raises(
-        ValueError, match=f"The column '{invalid_col_name}' does not exist"
-    ):
+    with pytest.raises(ValueError, match="there's no such column in the dataset"):
         ds.sort(invalid_col_name).take_all()
 
     ds_named = ray.data.from_items(
@@ -1165,10 +1163,7 @@ def test_sort_validate_keys(ray_start_regular_shared):
     assert [d["col1"] for d in r1] == [7, 5, 3, 1]
     assert [d["col2"] for d in r2] == [8, 6, 4, 2]
 
-    with pytest.raises(
-        ValueError,
-        match=f"The column '{invalid_col_name}' does not exist in the schema",
-    ):
+    with pytest.raises(ValueError, match="there's no such column in the dataset"):
         ds_named.sort(invalid_col_name).take_all()
 
 
@@ -1279,9 +1274,7 @@ def test_aggregate_e2e(ray_start_regular_shared, use_push_based_shuffle):
 def test_aggregate_validate_keys(ray_start_regular_shared):
     ds = ray.data.range(10)
     invalid_col_name = "invalid_column"
-    with pytest.raises(
-        ValueError, match=f"The column '{invalid_col_name}' does not exist"
-    ):
+    with pytest.raises(ValueError):
         ds.groupby(invalid_col_name).count()
 
     ds_named = ray.data.from_items(
@@ -1308,7 +1301,7 @@ def test_aggregate_validate_keys(ray_start_regular_shared):
 
     with pytest.raises(
         ValueError,
-        match=f"The column '{invalid_col_name}' does not exist in the schema",
+        match="there's no such column in the dataset",
     ):
         ds_named.groupby(invalid_col_name).count()