Skip to content

Commit

Permalink
[Data] Clarify schema validation error (#48882)
Browse files: browse the repository at this point in the history
```python
ray.data.range(1).groupby("does_not_exist").count().materialize()
```

**Before**
```
ValueError: The column 'does_not_exist' does not exist in the schema 'Column  Type
------  ----
id      int64'.
```

**After**
```
ValueError: You specified the column 'does_not_exist', but there's no such column in the dataset. The dataset has columns: {'id'}
```

---------

Signed-off-by: Balaji Veeramani <[email protected]>
  • Loading branch information
bveeramani authored Nov 27, 2024
1 parent 3d7fc8f commit 3bd3a02
Show file tree
Hide file tree
Showing 2 changed files with 7 additions and 13 deletions.
5 changes: 3 additions & 2 deletions python/ray/data/_internal/planner/exchange/sort_task_spec.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,9 @@ def validate_schema(self, schema: Optional[Union[type, "pyarrow.lib.Schema"]]):
for column in self._columns:
if column not in schema_names_set:
raise ValueError(
"The column '{}' does not exist in the "
"schema '{}'.".format(column, schema)
f"You specified the column '{column}', but there's no such "
"column in the dataset. The dataset has columns: "
f"{schema_names_set}"
)

@property
Expand Down
15 changes: 4 additions & 11 deletions python/ray/data/tests/test_execution_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -1145,9 +1145,7 @@ def test_sort_validate_keys(ray_start_regular_shared):
assert extract_values("id", ds.sort("id").take_all()) == list(range(10))

invalid_col_name = "invalid_column"
with pytest.raises(
ValueError, match=f"The column '{invalid_col_name}' does not exist"
):
with pytest.raises(ValueError, match="there's no such column in the dataset"):
ds.sort(invalid_col_name).take_all()

ds_named = ray.data.from_items(
Expand All @@ -1165,10 +1163,7 @@ def test_sort_validate_keys(ray_start_regular_shared):
assert [d["col1"] for d in r1] == [7, 5, 3, 1]
assert [d["col2"] for d in r2] == [8, 6, 4, 2]

with pytest.raises(
ValueError,
match=f"The column '{invalid_col_name}' does not exist in the schema",
):
with pytest.raises(ValueError, match="there's no such column in the dataset"):
ds_named.sort(invalid_col_name).take_all()


Expand Down
Expand Up @@ -1279,9 +1274,7 @@ def test_aggregate_e2e(ray_start_regular_shared, use_push_based_shuffle):
def test_aggregate_validate_keys(ray_start_regular_shared):
ds = ray.data.range(10)
invalid_col_name = "invalid_column"
with pytest.raises(
ValueError, match=f"The column '{invalid_col_name}' does not exist"
):
with pytest.raises(ValueError):
ds.groupby(invalid_col_name).count()

ds_named = ray.data.from_items(
Expand All @@ -1308,7 +1301,7 @@ def test_aggregate_validate_keys(ray_start_regular_shared):

with pytest.raises(
ValueError,
match=f"The column '{invalid_col_name}' does not exist in the schema",
match="there's no such column in the dataset",
):
ds_named.groupby(invalid_col_name).count()

Expand Down

0 comments on commit 3bd3a02

Please sign in to comment.