From c5eb978d359ebc3b4a835c40a81fb7eef7d363af Mon Sep 17 00:00:00 2001
From: ejolly
Date: Mon, 2 May 2022 16:59:02 -0400
Subject: [PATCH 1/4] Fix issue when splitting and combining datasets with mixed or non-string column and index names

---
 neighbors/tests/test_utils.py |  6 ++++++
 neighbors/utils.py            | 14 ++++++++++++--
 requirements-dev.txt          |  6 +++++-
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/neighbors/tests/test_utils.py b/neighbors/tests/test_utils.py
index 9be51f8..947b269 100644
--- a/neighbors/tests/test_utils.py
+++ b/neighbors/tests/test_utils.py
@@ -129,17 +129,23 @@ def test_create_sparse_mask(simulate_wide_data):
     expected_items = int(simulate_wide_data.shape[1] * (1 - 0.10))
     assert mask.shape == simulate_wide_data.shape
     assert all(mask.sum(1) == expected_items)
+    assert mask.index.name == simulate_wide_data.index.name
+    assert mask.columns.name == simulate_wide_data.columns.name
 
     mask = create_sparse_mask(simulate_wide_data, n_mask_items=19)
     assert mask.shape == simulate_wide_data.shape
     expected_items = int(simulate_wide_data.shape[1] - 19)
     assert all(mask.sum(1) == expected_items)
+    assert mask.index.name == simulate_wide_data.index.name
+    assert mask.columns.name == simulate_wide_data.columns.name
 
     masked_data = simulate_wide_data[mask]
     assert isinstance(masked_data, pd.DataFrame)
     assert masked_data.shape == simulate_wide_data.shape
     assert ~simulate_wide_data.isnull().any().any()
     assert masked_data.isnull().any().any()
+    assert mask.index.name == simulate_wide_data.index.name
+    assert mask.columns.name == simulate_wide_data.columns.name
 
 
 def test_flatten_dataframe(simulate_wide_data):
diff --git a/neighbors/utils.py b/neighbors/utils.py
index 8aed9ee..5349ba0 100644
--- a/neighbors/utils.py
+++ b/neighbors/utils.py
@@ -206,7 +206,10 @@ def create_sparse_mask(data, n_mask_items=0.2, random_state=None):
             for _ in range(data.shape[0])
         ]
     )
-    return pd.DataFrame(mask, index=data.index, columns=data.columns)
+    out = pd.DataFrame(mask, index=data.index, columns=data.columns)
+    out.index.name = data.index.name
+    out.columns.name = data.columns.name
+    return out
 
 
 def flatten_dataframe(data: pd.DataFrame) -> list:
@@ -223,7 +226,14 @@ def flatten_dataframe(data: pd.DataFrame) -> list:
     if not isinstance(data, pd.DataFrame):
         raise TypeError("input must be a pandas dataframe")
 
-    out = zip(product(data.index, data.columns), data.to_numpy().ravel())
+    # Force index and columns names to be strings so that unflatten can more reliably
+    # auto-infer index and column names when they aren't passed, i.e. doesn't add
+    # additional columns or rows with numeric versions of the same name
+    out = data.copy()
+    out.index = list(map(str, out.index))
+    out.columns = list(map(str, out.columns))
+
+    out = zip(product(out.index, out.columns), out.to_numpy().ravel())
     return np.array([(elem[0][0], elem[0][1], elem[1]) for elem in out])
 
 
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 66bd6a9..919d991 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -3,4 +3,8 @@ black
 mkdocs
 mkdocs-material
 mkdocstrings
-mkdocs-jupyter
\ No newline at end of file
+mkdocs-jupyter
+pycodestyle
+pytest
+pytest-sugar
+pytest-xdist
\ No newline at end of file
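A quick way to see what this first patch changes is a minimal sketch. This is illustrative only, not repo code: the toy ratings frame and its `user`/`item` axis names are invented here, and `create_sparse_mask` / `flatten_dataframe` are called with the signatures visible in the hunks above.

```python
# Illustrative sketch (toy data, not the repo's test fixtures)
import numpy as np
import pandas as pd
from neighbors.utils import create_sparse_mask, flatten_dataframe

ratings = pd.DataFrame(
    np.random.default_rng(0).random((4, 5)),
    index=pd.Index(["s1", "s2", "s3", "s4"], name="user"),
    columns=pd.Index(["a", "b", "c", "d", "e"], name="item"),
)

# After this patch the mask carries over the axis names of its input
mask = create_sparse_mask(ratings, n_mask_items=0.2, random_state=0)
assert mask.index.name == "user" and mask.columns.name == "item"

# flatten_dataframe yields one (row_label, col_label, value) triple per cell;
# labels are coerced to str by this patch (revisited in PATCH 4/4)
flat = flatten_dataframe(ratings)
assert flat.shape == (ratings.size, 3)
```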
From d8c393aa3dd9e335f6622e4b9cf04de250bc3bb8 Mon Sep 17 00:00:00 2001
From: ejolly
Date: Mon, 2 May 2022 17:59:23 -0400
Subject: [PATCH 2/4] Fixes #34, #36

---
 README.md          | 2 +-
 docs/index.md      | 2 +-
 neighbors/utils.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index be34262..ac10bdb 100644
--- a/README.md
+++ b/README.md
@@ -19,7 +19,7 @@ The best way to learn how to use the package is by checking out the [documentat
 ```python
 from neighbors.models import NNMF_sgd
-from neighbors.utils create_user_item_matrix, estimate_performance
+from neighbors.utils import create_user_item_matrix, estimate_performance
 
 # Assuming data is 3 column pandas df with 'User', 'Item', 'Rating'
 # convert it to a (possibly sparse) user x item matrix
diff --git a/docs/index.md b/docs/index.md
index 089b79e..d9b007f 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -19,7 +19,7 @@ The best way to learn how to use the package is by checking out the 3 usage tuto
 ```python
 from neighbors.models import NNMF_sgd
-from neighbors.utils create_user_item_matrix, estimate_performance
+from neighbors.utils import create_user_item_matrix, estimate_performance
 
 # Assuming data is 3 column pandas df with 'User', 'Item', 'Rating'
 # convert it to a (possibly sparse) user x item matrix
diff --git a/neighbors/utils.py b/neighbors/utils.py
index 5349ba0..5968b17 100644
--- a/neighbors/utils.py
+++ b/neighbors/utils.py
@@ -413,7 +413,7 @@ def estimate_performance(
         data (pd.DataFrame): a users x item dataframe
         n_iter (int, optional): number of repetitions for dense data. Defaults to 10.
         n_folds (int, optional): number of folds for CV on sparse data. Defaults to 10.
-        n_mask_items (int/float, optional): how much randomly sparsify dense data each iteration; Defaults to masking out 20% of observed values
+        n_mask_items (int/float, optional): how much to randomly sparsify dense data each iteration. Defaults to masking out 20% of observed values. **Ignored if input data is already sparse.**
         return_agg (bool, optional): Return mean and std over repetitions rather than the repetitions themselves. Defaults to True.
         return_full_performance (bool, optional): return the performance against both "observed" and "missing" or just "missing" values if using dense data and `n_iter`. Likewise return performance of both "train" and "test" or just "test" splits if using sparse data and `n_folds`; Default False
         agg_stats (list): string names of statistics to compute over repetitions. Must be accepted by `pd.DataFrame.agg`; Default ('mean', 'std')
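The clarified `n_mask_items` docstring tracks the semantics already exercised by `test_create_sparse_mask` in the first patch: a float is read as the fraction of items to mask per row, an int as an absolute count. A minimal sketch, using a random frame as a stand-in for the repo's `simulate_wide_data` fixture:

```python
# Stand-in data; the real tests use the simulate_wide_data fixture
import numpy as np
import pandas as pd
from neighbors.utils import create_sparse_mask

wide = pd.DataFrame(np.random.default_rng(0).random((10, 50)))

# float -> fraction: mask 20% of each row, leaving 40 observed items
mask = create_sparse_mask(wide, n_mask_items=0.2, random_state=0)
assert all(mask.sum(axis=1) == int(50 * (1 - 0.2)))

# int -> count: mask exactly 19 items per row, leaving 31 observed
mask = create_sparse_mask(wide, n_mask_items=19, random_state=0)
assert all(mask.sum(axis=1) == 50 - 19)
```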
From 4695364b0a78606bd87d9b0254189776114b2693 Mon Sep 17 00:00:00 2001
From: ejolly
Date: Mon, 2 May 2022 18:01:50 -0400
Subject: [PATCH 3/4] version bump

---
 neighbors/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/neighbors/version.py b/neighbors/version.py
index 3dc1f76..485f44a 100644
--- a/neighbors/version.py
+++ b/neighbors/version.py
@@ -1 +1 @@
-__version__ = "0.1.0"
+__version__ = "0.1.1"

From d8d52bb0741fc82993969511f3c677264d967bac Mon Sep 17 00:00:00 2001
From: ejolly
Date: Mon, 2 May 2022 20:10:29 -0400
Subject: [PATCH 4/4] Better fix for str or numeric column and index names

---
 neighbors/tests/test_utils.py | 38 +++++++++++-----
 neighbors/utils.py            | 82 ++++++++---------------------------
 2 files changed, 47 insertions(+), 73 deletions(-)

diff --git a/neighbors/tests/test_utils.py b/neighbors/tests/test_utils.py
index 947b269..940ebe0 100644
--- a/neighbors/tests/test_utils.py
+++ b/neighbors/tests/test_utils.py
@@ -13,6 +13,7 @@
     flatten_dataframe,
     unflatten_dataframe,
     split_train_test,
+    get_sparsity,
     Mean,
 )
 
@@ -157,15 +158,8 @@ def test_flatten_dataframe(simulate_wide_data):
 
 def test_unflatten_dataframe(simulate_wide_data):
     out = flatten_dataframe(simulate_wide_data)
-    new = unflatten_dataframe(
-        out, index=simulate_wide_data.index, columns=simulate_wide_data.columns
-    )
-    assert new.equals(simulate_wide_data)
-    new = unflatten_dataframe(out)
+    new = unflatten_dataframe(out, like_dataframe=simulate_wide_data)
     assert new.equals(simulate_wide_data)
-    new = unflatten_dataframe(
-        out, num_rows=simulate_wide_data.shape[0], num_cols=simulate_wide_data.shape[1]
-    )
 
 
 def test_split_train_test(simulate_wide_data):
@@ -177,7 +171,10 @@ def test_split_train_test(simulate_wide_data):
     assert train.notnull().sum().sum() == int(4 / 5 * simulate_wide_data.size)
     # 4/5 of dense data should be sparse for testing, i.e. 1/5 data folds
     assert test.notnull().sum().sum() == int(1 / 5 * simulate_wide_data.size)
-    assert train.add(test, fill_value=0).equals(simulate_wide_data)
+    # Put them together and there should be no null values
+    full = train.add(test, fill_value=0)
+    assert not full.isnull().any().any()
+    assert full.equals(simulate_wide_data)
 
     # Sparse data
     mask = create_sparse_mask(simulate_wide_data, n_mask_items=0.1)
@@ -190,9 +187,30 @@ def test_split_train_test(simulate_wide_data):
     # Train and test should be more sparse than original
     assert train.isnull().sum().sum() > masked_data.isnull().sum().sum()
     assert test.isnull().sum().sum() > masked_data.isnull().sum().sum()
-    assert train.add(test, fill_value=0).equals(masked_data)
+
+    # Put them together and sparsity should be n_mask_items above
+    full = train.add(test, fill_value=0)
+    # Also testing get_sparsity
+    effective_sparsity = get_sparsity(full)
+    assert np.allclose(effective_sparsity, 0.1)
+    assert full.equals(masked_data)
+
     train_not_null = train.notnull().sum().sum()
     test_not_null = test.notnull().sum().sum()
     # And adhere closely to the expected train/test split
     assert np.allclose(train_not_null / masked_not_null, 0.8, atol=0.12)
     assert np.allclose(test_not_null / masked_not_null, 0.2, atol=0.12)
+
+    # Numeric column names
+    s = simulate_wide_data.copy()
+    s.columns = range(s.shape[1])
+    splits = split_train_test(s, n_folds=5)
+    shapes = list(map(lambda s: (s[0].shape, s[1].shape), splits))
+    assert all(
+        list(
+            map(
+                lambda sp: sp[0] == sp[1] and sp[0] == s.shape and sp[1] == s.shape,
+                shapes,
+            )
+        )
+    )
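The recombination property these tests assert can be sketched outside the suite. This is a hedged illustration: the dense frame stands in for `simulate_wide_data`, and it assumes `get_sparsity` reports the fraction of missing cells, as the new test's `np.allclose(effective_sparsity, 0.1)` check implies.

```python
# Stand-in for the simulate_wide_data fixture; 20 x 10 dense frame
import numpy as np
import pandas as pd
from neighbors.utils import get_sparsity, split_train_test

dense = pd.DataFrame(np.random.default_rng(0).random((20, 10)))

for train, test in split_train_test(dense, n_folds=5):
    # Each fold's train/test frames tile the original with no leftover NaNs
    full = train.add(test, fill_value=0)
    assert not full.isnull().any().any()
    assert full.equals(dense)
    # A fully dense frame recombines to 0% sparsity
    assert np.allclose(get_sparsity(full), 0.0)
```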
diff --git a/neighbors/utils.py b/neighbors/utils.py
index 5968b17..86e887c 100644
--- a/neighbors/utils.py
+++ b/neighbors/utils.py
@@ -212,7 +212,7 @@ def create_sparse_mask(data, n_mask_items=0.2, random_state=None):
     return out
 
 
-def flatten_dataframe(data: pd.DataFrame) -> list:
+def flatten_dataframe(data: pd.DataFrame) -> np.ndarray:
     """
     Given a 2d dataframe return a numpy array of arrays organized as (row_idx, col_idx, val). This function is analogous to numpy.ravel or numpy.flatten for arrays, with the addition of the row and column indices for each value
 
@@ -226,37 +226,23 @@
     if not isinstance(data, pd.DataFrame):
         raise TypeError("input must be a pandas dataframe")
 
-    # Force index and columns names to be strings so that unflatten can more reliably
-    # auto-infer index and column names when they aren't passed, i.e. doesn't add
-    # additional columns or rows with numeric versions of the same name
-    out = data.copy()
-    out.index = list(map(str, out.index))
-    out.columns = list(map(str, out.columns))
-
-    out = zip(product(out.index, out.columns), out.to_numpy().ravel())
+    out = zip(product(data.index, data.columns), data.to_numpy().ravel())
     return np.array([(elem[0][0], elem[0][1], elem[1]) for elem in out])
 
 
 def unflatten_dataframe(
     data: np.ndarray,
-    columns=None,
-    index=None,
-    num_rows=None,
-    num_cols=None,
-    index_name=None,
-    columns_name=None,
+    like_dataframe: pd.DataFrame,
 ) -> pd.DataFrame:
     """
     Reverse a flatten_dataframe operation to reconstruct the original unflattened dataframe
 
     Args:
         data (np.ndarray): n_items x 3 numpy array where columns represent row_idx, col_idx, and val at the location
-        columns (list, optional): column names of new dataframe. Defaults to None.
-        index (list, optional): row names of new dataframe. Defaults to None.
-        num_rows (int, optional): total number of rows. Useful if the flattened dataframe had a non-numerical non-ordered index. Default None which uses the max(row_idx)
-        num_cols (int, optional): total number of cols. Useful if the flattened dataframe had a non-numerical non-ordered index. Default None which uses the max(col_idx)
-        index_name (str; optional): Name of rows; Default None
-        columns_name (str; optional): Name of columns; Default None
+        like_dataframe (pd.DataFrame): an existing dataframe whose shape, index, and
+            columns are used to rebuild the unflattened dataframe
 
     Returns:
         pd.DataFrame: original unflattened dataframe
     """
 
     if not isinstance(data, np.ndarray):
         raise TypeError("input should be a numpy array")
-    if index is None and num_rows is None:
-        index = list(dict.fromkeys(data[:, 0]))
-        num_rows = len(index)
-    elif index is not None and num_rows is None:
-        num_rows = len(index)
-    elif index is None and num_rows is not None:
-        index = list(dict.fromkeys(data[:, 0]))
-        if len(index) != num_rows:
-            raise ValueError(
-                "num_rows does not match the number of unique row_idx values in data"
-            )
-    if columns is None and num_cols is None:
-        columns = list(dict.fromkeys(data[:, 1]))
-        num_cols = len(columns)
-    elif columns is not None and num_cols is None:
-        num_cols = len(columns)
-    elif columns is None and num_cols is not None:
-        columns = list(dict.fromkeys(data[:, 1]))
-        if len(columns) != num_cols:
-            raise ValueError(
-                "num_cols does not match the number of unique col_idx values in data"
-            )
-    out = np.empty((num_rows, num_cols))
+    if not isinstance(like_dataframe, pd.DataFrame):
+        raise TypeError("like_dataframe should be a pandas dataframe")
+
+    out = np.empty((like_dataframe.shape[0], like_dataframe.shape[1]))
     out[:] = np.nan
-    out = pd.DataFrame(out, index=index, columns=columns)
+    out = pd.DataFrame(out, index=like_dataframe.index, columns=like_dataframe.columns)
+
     for elem in data:
-        out.loc[elem[0], elem[1]] = np.float(elem[2])
-    out.index.name = index_name
-    out.columns.name = columns_name
+        row = elem[0].astype(type(like_dataframe.index[0]))
+        col = elem[1].astype(type(like_dataframe.columns[0]))
+        out.loc[row, col] = np.float(elem[2])
+    out.index.name = like_dataframe.index.name
+    out.columns.name = like_dataframe.columns.name
+
     return out
@@ -355,7 +326,6 @@ def split_train_test(
     """
 
     random_state = check_random_state(random_state)
-    num_rows, num_cols = data.shape
     flat = flatten_dataframe(data)
     if shuffle:
         random_state.shuffle(flat)
@@ -369,22 +339,8 @@
         train = np.array([elem for elem in chain(flat[:start], flat[stop:])])
         test = np.array([elem for elem in flat[start:stop]])
-        yield unflatten_dataframe(
-            train,
-            num_rows=num_rows,
-            num_cols=num_cols,
-            index=data.index,
-            columns=data.columns,
-            index_name=data.index.name,
-            columns_name=data.columns.name,
-        ), unflatten_dataframe(
-            test,
-            num_rows=num_rows,
-            num_cols=num_cols,
-            index=data.index,
-            columns=data.columns,
-            index_name=data.index.name,
-            columns_name=data.columns.name,
+        yield unflatten_dataframe(train, like_dataframe=data), unflatten_dataframe(
+            test, like_dataframe=data
         )
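Taken together, this last patch replaces the string-coercion workaround from PATCH 1/4 with a single `like_dataframe` argument: shape, labels, and axis names are taken from a template frame, so numeric labels survive a round trip instead of coming back as strings. A minimal sketch of the new API; the numeric-labeled frame is illustrative, mirroring the updated `test_unflatten_dataframe` and the new numeric-columns regression test rather than copying them.

```python
# Illustrative round trip with numeric row/column labels (toy data)
import numpy as np
import pandas as pd
from neighbors.utils import flatten_dataframe, unflatten_dataframe

df = pd.DataFrame(np.arange(12.0).reshape(3, 4))  # numeric index and columns

flat = flatten_dataframe(df)  # (row_label, col_label, value) triples
restored = unflatten_dataframe(flat, like_dataframe=df)

# Shape, labels, and axis names all come from `like_dataframe`, so the
# numeric labels are recovered intact
assert restored.equals(df)
```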