From c5eb978d359ebc3b4a835c40a81fb7eef7d363af Mon Sep 17 00:00:00 2001
From: ejolly
Date: Mon, 2 May 2022 16:59:02 -0400
Subject: [PATCH 1/4] Fix issue when splitting and combining datasets with mixed or non-string column and index names

---
 neighbors/tests/test_utils.py |  6 ++++++
 neighbors/utils.py            | 14 ++++++++++++--
 requirements-dev.txt          |  6 +++++-
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/neighbors/tests/test_utils.py b/neighbors/tests/test_utils.py
index 9be51f8..947b269 100644
--- a/neighbors/tests/test_utils.py
+++ b/neighbors/tests/test_utils.py
@@ -129,17 +129,23 @@ def test_create_sparse_mask(simulate_wide_data):
     expected_items = int(simulate_wide_data.shape[1] * (1 - 0.10))
     assert mask.shape == simulate_wide_data.shape
     assert all(mask.sum(1) == expected_items)
+    assert mask.index.name == simulate_wide_data.index.name
+    assert mask.columns.name == simulate_wide_data.columns.name
 
     mask = create_sparse_mask(simulate_wide_data, n_mask_items=19)
     assert mask.shape == simulate_wide_data.shape
     expected_items = int(simulate_wide_data.shape[1] - 19)
     assert all(mask.sum(1) == expected_items)
+    assert mask.index.name == simulate_wide_data.index.name
+    assert mask.columns.name == simulate_wide_data.columns.name
 
     masked_data = simulate_wide_data[mask]
     assert isinstance(masked_data, pd.DataFrame)
     assert masked_data.shape == simulate_wide_data.shape
     assert ~simulate_wide_data.isnull().any().any()
     assert masked_data.isnull().any().any()
+    assert mask.index.name == simulate_wide_data.index.name
+    assert mask.columns.name == simulate_wide_data.columns.name
 
 
 def test_flatten_dataframe(simulate_wide_data):
diff --git a/neighbors/utils.py b/neighbors/utils.py
index 8aed9ee..5349ba0 100644
--- a/neighbors/utils.py
+++ b/neighbors/utils.py
@@ -206,7 +206,10 @@ def create_sparse_mask(data, n_mask_items=0.2, random_state=None):
             for _ in range(data.shape[0])
         ]
     )
-    return pd.DataFrame(mask, index=data.index, columns=data.columns)
+    out = pd.DataFrame(mask, index=data.index, columns=data.columns)
+    out.index.name = data.index.name
+    out.columns.name = data.columns.name
+    return out
 
 
 def flatten_dataframe(data: pd.DataFrame) -> list:
@@ -223,7 +226,14 @@ def flatten_dataframe(data: pd.DataFrame) -> list:
     if not isinstance(data, pd.DataFrame):
         raise TypeError("input must be a pandas dataframe")
 
-    out = zip(product(data.index, data.columns), data.to_numpy().ravel())
+    # Force index and columns names to be strings so that unflatten can more reliably
+    # auto-infer index and column names when they aren't passed, i.e. doesn't add
+    # additional columns or rows with numeric versions of the same name
+    out = data.copy()
+    out.index = list(map(str, out.index))
+    out.columns = list(map(str, out.columns))
+
+    out = zip(product(out.index, out.columns), out.to_numpy().ravel())
     return np.array([(elem[0][0], elem[0][1], elem[1]) for elem in out])
 
 
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 66bd6a9..919d991 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -3,4 +3,8 @@ black
 mkdocs
 mkdocs-material
 mkdocstrings
-mkdocs-jupyter
\ No newline at end of file
+mkdocs-jupyter
+pycodestyle
+pytest
+pytest-sugar
+pytest-xdist
\ No newline at end of file
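A quick way to see what this first patch changes is a minimal sketch. This is illustrative only, not repo code: the toy ratings frame and its `user`/`item` axis names are invented here, and `create_sparse_mask` / `flatten_dataframe` are called with the signatures visible in the hunks above.

```python
# Illustrative sketch (toy data, not the repo's test fixtures)
import numpy as np
import pandas as pd
from neighbors.utils import create_sparse_mask, flatten_dataframe

ratings = pd.DataFrame(
    np.random.default_rng(0).random((4, 5)),
    index=pd.Index(["s1", "s2", "s3", "s4"], name="user"),
    columns=pd.Index(["a", "b", "c", "d", "e"], name="item"),
)

# After this patch the mask carries over the axis names of its input
mask = create_sparse_mask(ratings, n_mask_items=0.2, random_state=0)
assert mask.index.name == "user" and mask.columns.name == "item"

# flatten_dataframe yields one (row_label, col_label, value) triple per cell;
# labels are coerced to str by this patch (revisited in PATCH 4/4)
flat = flatten_dataframe(ratings)
assert flat.shape == (ratings.size, 3)
```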
From d8c393aa3dd9e335f6622e4b9cf04de250bc3bb8 Mon Sep 17 00:00:00 2001
From: ejolly
Date: Mon, 2 May 2022 17:59:23 -0400
Subject: [PATCH 2/4] Fixes #34, #36

---
 README.md          | 2 +-
 docs/index.md      | 2 +-
 neighbors/utils.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index be34262..ac10bdb 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ The best way to learn how to use the package is by checking out the [documentat
 ```python
 from neighbors.models import NNMF_sgd
-from neighbors.utils create_user_item_matrix, estimate_performance
+from neighbors.utils import create_user_item_matrix, estimate_performance
 
 # Assuming data is 3 column pandas df with 'User', 'Item', 'Rating'
 # convert it to a (possibly sparse) user x item matrix
diff --git a/docs/index.md b/docs/index.md
index 089b79e..d9b007f 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -19,7 +19,7 @@ The best way to learn how to use the package is by checking out the 3 usage tuto
 ```python
 from neighbors.models import NNMF_sgd
-from neighbors.utils create_user_item_matrix, estimate_performance
+from neighbors.utils import create_user_item_matrix, estimate_performance
 
 # Assuming data is 3 column pandas df with 'User', 'Item', 'Rating'
 # convert it to a (possibly sparse) user x item matrix
diff --git a/neighbors/utils.py b/neighbors/utils.py
index 5349ba0..5968b17 100644
--- a/neighbors/utils.py
+++ b/neighbors/utils.py
@@ -413,7 +413,7 @@ def estimate_performance(
         data (pd.DataFrame): a users x item dataframe
         n_iter (int, optional): number of repetitions for dense data. Defaults to 10.
         n_folds (int, optional): number of folds for CV on sparse data. Defaults to 10.
-        n_mask_items (int/float, optional): how much randomly sparsify dense data each iteration; Defaults to masking out 20% of observed values
+        n_mask_items (int/float, optional): how much to randomly sparsify dense data each iteration. Defaults to masking out 20% of observed values. **Ignored if input data is already sparse.**
         return_agg (bool, optional): Return mean and std over repetitions rather than the repetitions themselves. Defaults to True.
         return_full_performance (bool, optional): return the performance against both "observed" and "missing" or just "missing" values if using dense data and `n_iter`. Likewise return performance of both "train" and "test" or just "test" splits if using sparse data and `n_folds`; Default False
         agg_stats (list): string names of statistics to compute over repetitions. Must be accepted by `pd.DataFrame.agg`; Default ('mean', 'std')
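The clarified `n_mask_items` docstring tracks the semantics already exercised by `test_create_sparse_mask` in the first patch: a float is read as the fraction of items to mask per row, an int as an absolute count. A minimal sketch, using a random frame as a stand-in for the repo's `simulate_wide_data` fixture:

```python
# Stand-in data; the real tests use the simulate_wide_data fixture
import numpy as np
import pandas as pd
from neighbors.utils import create_sparse_mask

wide = pd.DataFrame(np.random.default_rng(0).random((10, 50)))

# float -> fraction: mask 20% of each row, leaving 40 observed items
mask = create_sparse_mask(wide, n_mask_items=0.2, random_state=0)
assert all(mask.sum(axis=1) == int(50 * (1 - 0.2)))

# int -> count: mask exactly 19 items per row, leaving 31 observed
mask = create_sparse_mask(wide, n_mask_items=19, random_state=0)
assert all(mask.sum(axis=1) == 50 - 19)
```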
From 4695364b0a78606bd87d9b0254189776114b2693 Mon Sep 17 00:00:00 2001
From: ejolly
Date: Mon, 2 May 2022 18:01:50 -0400
Subject: [PATCH 3/4] version bump

---
 neighbors/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neighbors/version.py b/neighbors/version.py
index 3dc1f76..485f44a 100644
--- a/neighbors/version.py
+++ b/neighbors/version.py
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.1.1"

From d8d52bb0741fc82993969511f3c677264d967bac Mon Sep 17 00:00:00 2001
From: ejolly
Date: Mon, 2 May 2022 20:10:29 -0400
Subject: [PATCH 4/4] Better fix for str or numeric column and index names

---
 neighbors/tests/test_utils.py | 38 +++++++++++-----
 neighbors/utils.py            | 82 ++++++++---------------------------
 2 files changed, 47 insertions(+), 73 deletions(-)

diff --git a/neighbors/tests/test_utils.py b/neighbors/tests/test_utils.py
index 947b269..940ebe0 100644
--- a/neighbors/tests/test_utils.py
+++ b/neighbors/tests/test_utils.py
@@ -13,6 +13,7 @@
     flatten_dataframe,
     unflatten_dataframe,
     split_train_test,
+    get_sparsity,
     Mean,
 )
 
@@ -157,15 +158,8 @@ def test_flatten_dataframe(simulate_wide_data):
 
 def test_unflatten_dataframe(simulate_wide_data):
     out = flatten_dataframe(simulate_wide_data)
-    new = unflatten_dataframe(
-        out, index=simulate_wide_data.index, columns=simulate_wide_data.columns
-    )
-    assert new.equals(simulate_wide_data)
-    new = unflatten_dataframe(out)
+    new = unflatten_dataframe(out, like_dataframe=simulate_wide_data)
     assert new.equals(simulate_wide_data)
-    new = unflatten_dataframe(
-        out, num_rows=simulate_wide_data.shape[0], num_cols=simulate_wide_data.shape[1]
-    )
 
 
 def test_split_train_test(simulate_wide_data):
@@ -177,7 +171,10 @@ def test_split_train_test(simulate_wide_data):
     assert train.notnull().sum().sum() == int(4 / 5 * simulate_wide_data.size)
     # 4/5 of dense data should be sparse for testing, i.e. 1/5 data folds
     assert test.notnull().sum().sum() == int(1 / 5 * simulate_wide_data.size)
-    assert train.add(test, fill_value=0).equals(simulate_wide_data)
+    # Put them together and there should be no null values
+    full = train.add(test, fill_value=0)
+    assert not full.isnull().any().any()
+    assert full.equals(simulate_wide_data)
 
     # Sparse data
     mask = create_sparse_mask(simulate_wide_data, n_mask_items=0.1)
@@ -190,9 +187,30 @@ def test_split_train_test(simulate_wide_data):
     # Train and test should be more sparse than original
     assert train.isnull().sum().sum() > masked_data.isnull().sum().sum()
     assert test.isnull().sum().sum() > masked_data.isnull().sum().sum()
-    assert train.add(test, fill_value=0).equals(masked_data)
+
+    # Put them together and sparsity should be n_mask_items above
+    full = train.add(test, fill_value=0)
+    # Also testing get_sparsity
+    effective_sparsity = get_sparsity(full)
+    assert np.allclose(effective_sparsity, 0.1)
+    assert full.equals(masked_data)
+
     train_not_null = train.notnull().sum().sum()
     test_not_null = test.notnull().sum().sum()
     # And adhere closely to the expected train/test split
     assert np.allclose(train_not_null / masked_not_null, 0.8, atol=0.12)
     assert np.allclose(test_not_null / masked_not_null, 0.2, atol=0.12)
+
+    # Numeric column names
+    s = simulate_wide_data.copy()
+    s.columns = range(s.shape[1])
+    splits = split_train_test(s, n_folds=5)
+    shapes = list(map(lambda s: (s[0].shape, s[1].shape), splits))
+    assert all(
+        list(
+            map(
+                lambda sp: sp[0] == sp[1] and sp[0] == s.shape and sp[1] == s.shape,
+                shapes,
+            )
+        )
+    )
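The recombination property these tests assert can be sketched outside the suite. This is a hedged illustration: the dense frame stands in for `simulate_wide_data`, and it assumes `get_sparsity` reports the fraction of missing cells, as the new test's `np.allclose(effective_sparsity, 0.1)` check implies.

```python
# Stand-in for the simulate_wide_data fixture; 20 x 10 dense frame
import numpy as np
import pandas as pd
from neighbors.utils import get_sparsity, split_train_test

dense = pd.DataFrame(np.random.default_rng(0).random((20, 10)))

for train, test in split_train_test(dense, n_folds=5):
    # Each fold's train/test frames tile the original with no leftover NaNs
    full = train.add(test, fill_value=0)
    assert not full.isnull().any().any()
    assert full.equals(dense)
    # A fully dense frame recombines to 0% sparsity
    assert np.allclose(get_sparsity(full), 0.0)
```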
diff --git a/neighbors/utils.py b/neighbors/utils.py
index 5968b17..86e887c 100644
--- a/neighbors/utils.py
+++ b/neighbors/utils.py
@@ -212,7 +212,7 @@ def create_sparse_mask(data, n_mask_items=0.2, random_state=None):
     return out
 
 
-def flatten_dataframe(data: pd.DataFrame) -> list:
+def flatten_dataframe(data: pd.DataFrame) -> np.ndarray:
     """
     Given a 2d dataframe return a numpy array of arrays organized as (row_idx, col_idx, val). This function is analogous to numpy.ravel or numpy.flatten for arrays, with the addition of the row and column indices for each value
 
@@ -226,37 +226,23 @@
     if not isinstance(data, pd.DataFrame):
         raise TypeError("input must be a pandas dataframe")
 
-    # Force index and columns names to be strings so that unflatten can more reliably
-    # auto-infer index and column names when they aren't passed, i.e. doesn't add
-    # additional columns or rows with numeric versions of the same name
-    out = data.copy()
-    out.index = list(map(str, out.index))
-    out.columns = list(map(str, out.columns))
-
-    out = zip(product(out.index, out.columns), out.to_numpy().ravel())
+    out = zip(product(data.index, data.columns), data.to_numpy().ravel())
     return np.array([(elem[0][0], elem[0][1], elem[1]) for elem in out])
 
 
 def unflatten_dataframe(
     data: np.ndarray,
-    columns=None,
-    index=None,
-    num_rows=None,
-    num_cols=None,
-    index_name=None,
-    columns_name=None,
+    like_dataframe: pd.DataFrame,
 ) -> pd.DataFrame:
     """
     Reverse a flatten_dataframe operation to reconstruct the original unflattened dataframe
 
     Args:
         data (np.ndarray): n_items x 3 numpy array where columns represent row_idx, col_idx, and val at the location
-        columns (list, optional): column names of new dataframe. Defaults to None.
-        index (list, optional): row names of new dataframe. Defaults to None.
-        num_rows (int, optional): total number of rows. Useful if the flattened dataframe had a non-numerical non-ordered index. Default None which uses the max(row_idx)
-        num_cols (int, optional): total number of cols. Useful if the flattened dataframe had a non-numerical non-ordered index. Default None which uses the max(col_idx)
-        index_name (str; optional): Name of rows; Default None
-        columns_name (str; optional): Name of columns; Default None
+        like_dataframe (pd.DataFrame): an existing dataframe whose shape, index, and
+            columns are used to rebuild the unflattened dataframe
 
     Returns:
         pd.DataFrame: original unflattened dataframe
     """
 
     if not isinstance(data, np.ndarray):
         raise TypeError("input should be a numpy array")
-    if index is None and num_rows is None:
-        index = list(dict.fromkeys(data[:, 0]))
-        num_rows = len(index)
-    elif index is not None and num_rows is None:
-        num_rows = len(index)
-    elif index is None and num_rows is not None:
-        index = list(dict.fromkeys(data[:, 0]))
-        if len(index) != num_rows:
-            raise ValueError(
-                "num_rows does not match the number of unique row_idx values in data"
-            )
-    if columns is None and num_cols is None:
-        columns = list(dict.fromkeys(data[:, 1]))
-        num_cols = len(columns)
-    elif columns is not None and num_cols is None:
-        num_cols = len(columns)
-    elif columns is None and num_cols is not None:
-        columns = list(dict.fromkeys(data[:, 1]))
-        if len(columns) != num_cols:
-            raise ValueError(
-                "num_cols does not match the number of unique col_idx values in data"
-            )
-    out = np.empty((num_rows, num_cols))
+    if not isinstance(like_dataframe, pd.DataFrame):
+        raise TypeError("like_dataframe should be a pandas dataframe")
+
+    out = np.empty((like_dataframe.shape[0], like_dataframe.shape[1]))
     out[:] = np.nan
-    out = pd.DataFrame(out, index=index, columns=columns)
+    out = pd.DataFrame(out, index=like_dataframe.index, columns=like_dataframe.columns)
+
     for elem in data:
-        out.loc[elem[0], elem[1]] = np.float(elem[2])
-    out.index.name = index_name
-    out.columns.name = columns_name
+        row = elem[0].astype(type(like_dataframe.index[0]))
+        col = elem[1].astype(type(like_dataframe.columns[0]))
+        out.loc[row, col] = np.float(elem[2])
+    out.index.name = like_dataframe.index.name
+    out.columns.name = like_dataframe.columns.name
+
     return out
@@ -355,7 +326,6 @@ def split_train_test(
     """
 
     random_state = check_random_state(random_state)
-    num_rows, num_cols = data.shape
     flat = flatten_dataframe(data)
     if shuffle:
         random_state.shuffle(flat)
@@ -369,22 +339,8 @@
         train = np.array([elem for elem in chain(flat[:start], flat[stop:])])
         test = np.array([elem for elem in flat[start:stop]])
-        yield unflatten_dataframe(
-            train,
-            num_rows=num_rows,
-            num_cols=num_cols,
-            index=data.index,
-            columns=data.columns,
-            index_name=data.index.name,
-            columns_name=data.columns.name,
-        ), unflatten_dataframe(
-            test,
-            num_rows=num_rows,
-            num_cols=num_cols,
-            index=data.index,
-            columns=data.columns,
-            index_name=data.index.name,
-            columns_name=data.columns.name,
+        yield unflatten_dataframe(train, like_dataframe=data), unflatten_dataframe(
+            test, like_dataframe=data
         )
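Taken together, this last patch replaces the string-coercion workaround from PATCH 1/4 with a single `like_dataframe` argument: shape, labels, and axis names are taken from a template frame, so numeric labels survive a round trip instead of coming back as strings. A minimal sketch of the new API; the numeric-labeled frame is illustrative, mirroring the updated `test_unflatten_dataframe` and the new numeric-columns regression test rather than copying them.

```python
# Illustrative round trip with numeric row/column labels (toy data)
import numpy as np
import pandas as pd
from neighbors.utils import flatten_dataframe, unflatten_dataframe

df = pd.DataFrame(np.arange(12.0).reshape(3, 4))  # numeric index and columns

flat = flatten_dataframe(df)  # (row_label, col_label, value) triples
restored = unflatten_dataframe(flat, like_dataframe=df)

# Shape, labels, and axis names all come from `like_dataframe`, so the
# numeric labels are recovered intact
assert restored.equals(df)
```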