Skip to content

Commit

Permalink
Error on shared column names when writing (#1335)
Browse files Browse the repository at this point in the history
* Error on shared column names when writing

* add test

* Release note
  • Loading branch information
ivirshup authored Jan 29, 2024
1 parent 299ca97 commit d07306f
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 0 deletions.
13 changes: 13 additions & 0 deletions anndata/_io/specs/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -663,10 +663,23 @@ def write_dataframe(f, key, df, _writer, dataset_kwargs=MappingProxyType({})):
if reserved in df.columns:
raise ValueError(f"{reserved!r} is a reserved name for dataframe columns.")
group = f.require_group(key)
if not df.columns.is_unique:
duplicates = list(df.columns[df.columns.duplicated()])
raise ValueError(
f"Found repeated column names: {duplicates}. Column names must be unique."
)
col_names = [check_key(c) for c in df.columns]
group.attrs["column-order"] = col_names

if df.index.name is not None:
if df.index.name in col_names and not pd.Series(
df.index, index=df.index
).equals(df[df.index.name]):
raise ValueError(
f"DataFrame.index.name ({df.index.name!r}) is also used by a column "
"whose values are different. This is not supported. Please make sure "
"the values are the same, or use a different name."
)
index_name = df.index.name
else:
index_name = "_index"
Expand Down
29 changes: 29 additions & 0 deletions anndata/tests/test_io_elementwise.py
Original file line number Diff line number Diff line change
Expand Up @@ -300,3 +300,32 @@ def test_read_zarr_from_group(tmp_path, consolidated):
with read_func(pth) as z:
expected = ad.read_zarr(z["table/table"])
assert_equal(adata, expected)


def test_dataframe_column_uniqueness(store):
    """Exercise ``write_elem`` on dataframes whose names clash.

    Three cases are written to ``store``:

    * a frame with two columns both named ``"a"`` -- expects ``ValueError``;
    * a frame whose index name matches a column whose values differ from the
      index -- expects ``ValueError``;
    * a frame whose index name matches a column whose values equal the
      index -- expects the write to succeed and the element read back to
      compare equal to the original under ``assert_equal``.
    """
    # Expected error messages, escaped for use as regex ``match`` patterns.
    duplicate_columns_pattern = (
        r"Found repeated column names: \['a'\]\. Column names must be unique\."
    )
    index_clash_pattern = r"DataFrame\.index\.name \('col_name'\) is also used by a column whose values are different\."

    def frame_indexed_by(index_values):
        # Same "col_name" column every time; only the index values vary.
        return pd.DataFrame(
            {"col_name": [1, 2, 3]},
            index=pd.Index(index_values, name="col_name"),
        )

    # Case 1: duplicated column labels.
    frame_with_duplicates = pd.DataFrame(np.ones((3, 2)), columns=["a", "a"])
    with pytest_8_raises(ValueError, match=duplicate_columns_pattern):
        write_elem(store, "repeated_cols", frame_with_duplicates)

    # Case 2: index shares its name with a column, but the values disagree.
    mismatched_frame = frame_indexed_by([1, 3, 2])
    with pytest_8_raises(ValueError, match=index_clash_pattern):
        write_elem(store, "index_shares_col_name", mismatched_frame)

    # Case 3: index shares its name with a column and the values agree.
    matching_frame = frame_indexed_by([1, 2, 3])
    write_elem(store, "index_shared_okay", matching_frame)
    round_tripped = read_elem(store["index_shared_okay"])

    assert_equal(round_tripped, matching_frame)
1 change: 1 addition & 0 deletions docs/release-notes/0.10.6.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
```

* Defer import of zarr in test helpers, as scanpy CI job relies on them {pr}`1343` {user}`ilan-gold`
* Writing a dataframe with non-unique column names now raises an error instead of silently overwriting the repeated columns {pr}`1335` {user}`ivirshup`

```{rubric} Documentation
```
Expand Down

0 comments on commit d07306f

Please sign in to comment.