0.1.1 release #37

Open
wants to merge 4 commits into master
2 changes: 1 addition & 1 deletion README.md
@@ -19,7 +19,7 @@ The best way to learn how to use the package is by checking out the [ documentat

```python
from neighbors.models import NNMF_sgd
from neighbors.utils create_user_item_matrix, estimate_performance
from neighbors.utils import create_user_item_matrix, estimate_performance

# Assuming data is 3 column pandas df with 'User', 'Item', 'Rating'
# convert it to a (possibly sparse) user x item matrix
2 changes: 1 addition & 1 deletion docs/index.md
@@ -19,7 +19,7 @@ The best way to learn how to use the package is by checking out the 3 usage tuto

```python
from neighbors.models import NNMF_sgd
from neighbors.utils create_user_item_matrix, estimate_performance
from neighbors.utils import create_user_item_matrix, estimate_performance

# Assuming data is 3 column pandas df with 'User', 'Item', 'Rating'
# convert it to a (possibly sparse) user x item matrix
44 changes: 34 additions & 10 deletions neighbors/tests/test_utils.py
@@ -13,6 +13,7 @@
flatten_dataframe,
unflatten_dataframe,
split_train_test,
get_sparsity,
Mean,
)

@@ -129,17 +130,23 @@ def test_create_sparse_mask(simulate_wide_data):
expected_items = int(simulate_wide_data.shape[1] * (1 - 0.10))
assert mask.shape == simulate_wide_data.shape
assert all(mask.sum(1) == expected_items)
assert mask.index.name == simulate_wide_data.index.name
assert mask.columns.name == simulate_wide_data.columns.name

mask = create_sparse_mask(simulate_wide_data, n_mask_items=19)
assert mask.shape == simulate_wide_data.shape
expected_items = int(simulate_wide_data.shape[1] - 19)
assert all(mask.sum(1) == expected_items)
assert mask.index.name == simulate_wide_data.index.name
assert mask.columns.name == simulate_wide_data.columns.name

masked_data = simulate_wide_data[mask]
assert isinstance(masked_data, pd.DataFrame)
assert masked_data.shape == simulate_wide_data.shape
assert ~simulate_wide_data.isnull().any().any()
assert masked_data.isnull().any().any()
assert mask.index.name == simulate_wide_data.index.name
assert mask.columns.name == simulate_wide_data.columns.name


def test_flatten_dataframe(simulate_wide_data):
@@ -151,15 +158,8 @@ def test_flatten_dataframe(simulate_wide_data):

def test_unflatten_dataframe(simulate_wide_data):
out = flatten_dataframe(simulate_wide_data)
new = unflatten_dataframe(
out, index=simulate_wide_data.index, columns=simulate_wide_data.columns
)
assert new.equals(simulate_wide_data)
new = unflatten_dataframe(out)
new = unflatten_dataframe(out, like_dataframe=simulate_wide_data)
assert new.equals(simulate_wide_data)
new = unflatten_dataframe(
out, num_rows=simulate_wide_data.shape[0], num_cols=simulate_wide_data.shape[1]
)


def test_split_train_test(simulate_wide_data):
@@ -171,7 +171,10 @@
assert train.notnull().sum().sum() == int(4 / 5 * simulate_wide_data.size)
# The test split should be missing 4/5 of the dense data, i.e. it holds 1/5 of the data (one fold)
assert test.notnull().sum().sum() == int(1 / 5 * simulate_wide_data.size)
assert train.add(test, fill_value=0).equals(simulate_wide_data)
# Put them together and there should be no null values
full = train.add(test, fill_value=0)
assert not full.isnull().any().any()
assert full.equals(simulate_wide_data)

# Sparse data
mask = create_sparse_mask(simulate_wide_data, n_mask_items=0.1)
@@ -184,9 +187,30 @@
# Train and test should be more sparse than original
assert train.isnull().sum().sum() > masked_data.isnull().sum().sum()
assert test.isnull().sum().sum() > masked_data.isnull().sum().sum()
assert train.add(test, fill_value=0).equals(masked_data)

# Put them together and sparsity should match the n_mask_items used above
full = train.add(test, fill_value=0)
# Also testing get_sparsity
effective_sparsity = get_sparsity(full)
assert np.allclose(effective_sparsity, 0.1)
assert full.equals(masked_data)

train_not_null = train.notnull().sum().sum()
test_not_null = test.notnull().sum().sum()
# And adhere closely to the expected train/test split
assert np.allclose(train_not_null / masked_not_null, 0.8, atol=0.12)
assert np.allclose(test_not_null / masked_not_null, 0.2, atol=0.12)

# Numeric column names
s = simulate_wide_data.copy()
s.columns = range(s.shape[1])
splits = split_train_test(s, n_folds=5)
assert all(train.shape == s.shape and test.shape == s.shape for train, test in splits)
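The `get_sparsity` helper imported and exercised above is new in this release; from the assertions it appears to return the fraction of missing (NaN) cells in a user x item dataframe. A rough sketch under that assumption (assuming it lives in neighbors.utils alongside the other helpers; the dataframe below is made up for illustration):

```python
import numpy as np
import pandas as pd
from neighbors.utils import get_sparsity  # new helper exercised by the tests above

# Hypothetical 4 x 5 ratings matrix with 2 of its 20 cells missing
ratings = pd.DataFrame(np.arange(20.0).reshape(4, 5))
ratings.iloc[0, 0] = np.nan
ratings.iloc[1, 2] = np.nan

# Assuming get_sparsity returns the fraction of NaN cells, this prints 0.1
print(get_sparsity(ratings))
```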
80 changes: 23 additions & 57 deletions neighbors/utils.py
@@ -206,10 +206,13 @@ def create_sparse_mask(data, n_mask_items=0.2, random_state=None):
for _ in range(data.shape[0])
]
)
return pd.DataFrame(mask, index=data.index, columns=data.columns)
out = pd.DataFrame(mask, index=data.index, columns=data.columns)
out.index.name = data.index.name
out.columns.name = data.columns.name
return out
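A brief sketch of what the change above buys (the frame and its axis names are illustrative): the boolean mask returned by `create_sparse_mask` now carries over the axis names of its input, which is what the new test assertions check.

```python
import numpy as np
import pandas as pd
from neighbors.utils import create_sparse_mask

ratings = pd.DataFrame(np.random.rand(10, 20))
ratings.index.name = "User"
ratings.columns.name = "Item"

mask = create_sparse_mask(ratings, n_mask_items=0.2)
assert mask.index.name == ratings.index.name
assert mask.columns.name == ratings.columns.name

# Indexing through the mask hides ~20% of each user's items as NaN
sparse_ratings = ratings[mask]
```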


def flatten_dataframe(data: pd.DataFrame) -> list:
def flatten_dataframe(data: pd.DataFrame) -> np.ndarray:
"""
Given a 2d dataframe, return a numpy array of (row_idx, col_idx, val) triplets. This function is analogous to numpy.ravel or ndarray.flatten for arrays, with the addition of the row and column indices for each value

@@ -229,60 +232,38 @@ def flatten_dataframe(data: pd.DataFrame) -> list:

def unflatten_dataframe(
data: np.ndarray,
columns=None,
index=None,
num_rows=None,
num_cols=None,
index_name=None,
columns_name=None,
like_dataframe: pd.DataFrame,
) -> pd.DataFrame:
"""
Reverse a flatten_dataframe operation to reconstruct the original unflattened dataframe

Args:
data (np.ndarray): n_items x 3 numpy array where columns represent row_idx, col_idx, and val at the location.
columns (list, optional): column names of new dataframe. Defaults to None.
index (list, optional): row names of new dataframe. Defaults to None.
num_rows (int, optional): total number of rows. Useful if the flattened dataframe had a non-numerical non-ordered index. Default None which uses the max(row_idx)
num_cols (int, optional): total number of cols. Useful if the flattened dataframe had a non-numerical non-ordered index. Default None which uses the max(col_idx)
index_name (str; optional): Name of rows; Default None
columns_name (str; optional): Name of columns; Default None
column_order (iterable; optional): Order of columns; Default lexicographic order based on input array
row_order (iterable; optional): Order of rows; Default lexicographic order based on input array
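like_dataframe (pd.DataFrame): dataframe whose shape, index, columns, and axis names the reconstructed output should match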

Returns:
pd.DataFrame: original unflattened dataframe
"""

if not isinstance(data, np.ndarray):
raise TypeError("input should be a numpy array")
if index is None and num_rows is None:
index = list(dict.fromkeys(data[:, 0]))
num_rows = len(index)
elif index is not None and num_rows is None:
num_rows = len(index)
elif index is None and num_rows is not None:
index = list(dict.fromkeys(data[:, 0]))
if len(index) != num_rows:
raise ValueError(
"num_rows does not match the number of unique row_idx values in data"
)
if columns is None and num_cols is None:
columns = list(dict.fromkeys(data[:, 1]))
num_cols = len(columns)
elif columns is not None and num_cols is None:
num_cols = len(columns)
elif columns is None and num_cols is not None:
columns = list(dict.fromkeys(data[:, 1]))
if len(columns) != num_cols:
raise ValueError(
"num_cols does not match the number of unique col_idx values in data"
)
out = np.empty((num_rows, num_cols))
if not isinstance(like_dataframe, pd.DataFrame):
raise TypeError("like_dataframe should be a pandas dataframe")

out = np.empty((like_dataframe.shape[0], like_dataframe.shape[1]))
out[:] = np.nan
out = pd.DataFrame(out, index=index, columns=columns)
out = pd.DataFrame(out, index=like_dataframe.index, columns=like_dataframe.columns)

for elem in data:
out.loc[elem[0], elem[1]] = np.float(elem[2])
out.index.name = index_name
out.columns.name = columns_name
row = elem[0].astype(type(like_dataframe.index[0]))
col = elem[1].astype(type(like_dataframe.columns[0]))
out.loc[row, col] = float(elem[2])
out.index.name = like_dataframe.index.name
out.columns.name = like_dataframe.columns.name

return out
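For context on the simplified signature, a minimal round-trip sketch (labels are illustrative; the round-trip guarantee itself is what the updated test_unflatten_dataframe checks):

```python
import numpy as np
import pandas as pd
from neighbors.utils import flatten_dataframe, unflatten_dataframe

wide = pd.DataFrame(
    np.random.rand(3, 4),
    index=["user_1", "user_2", "user_3"],
    columns=["item_1", "item_2", "item_3", "item_4"],
)

# flatten_dataframe returns an array of (row_label, col_label, value) rows
flat = flatten_dataframe(wide)

# Shape, labels, and axis names are now all taken from like_dataframe, replacing
# the old index/columns/num_rows/num_cols/index_name/columns_name arguments
rebuilt = unflatten_dataframe(flat, like_dataframe=wide)
assert rebuilt.shape == wide.shape
assert list(rebuilt.columns) == list(wide.columns)
```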


@@ -345,7 +326,6 @@ def split_train_test(
"""

random_state = check_random_state(random_state)
num_rows, num_cols = data.shape
flat = flatten_dataframe(data)
if shuffle:
random_state.shuffle(flat)
@@ -359,22 +339,8 @@
train = np.array([elem for elem in chain(flat[:start], flat[stop:])])
test = np.array([elem for elem in flat[start:stop]])

yield unflatten_dataframe(
train,
num_rows=num_rows,
num_cols=num_cols,
index=data.index,
columns=data.columns,
index_name=data.index.name,
columns_name=data.columns.name,
), unflatten_dataframe(
test,
num_rows=num_rows,
num_cols=num_cols,
index=data.index,
columns=data.columns,
index_name=data.index.name,
columns_name=data.columns.name,
yield unflatten_dataframe(train, like_dataframe=data), unflatten_dataframe(
test, like_dataframe=data
)
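With reconstruction delegated to like_dataframe, each fold comes back shaped and labeled like the input, which is what the new shape assertions in the tests rely on. A short usage sketch with a hypothetical ratings frame:

```python
import numpy as np
import pandas as pd
from neighbors.utils import split_train_test

ratings = pd.DataFrame(np.random.rand(10, 20))  # hypothetical users x items matrix

# split_train_test is a generator: each of the n_folds iterations yields a
# (train, test) pair shaped like the input, with held-out cells set to NaN
for train, test in split_train_test(ratings, n_folds=5):
    assert train.shape == ratings.shape
    assert test.shape == ratings.shape
```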


@@ -403,7 +369,7 @@ def estimate_performance(
data (pd.DataFrame): a users x item dataframe
n_iter (int, optional): number of repetitions for dense data. Defaults to 10.
n_folds (int, optional): number of folds for CV on sparse data. Defaults to 10.
n_mask_items (int/float, optional): how much randomly sparsify dense data each iteration; Defaults to masking out 20% of observed values
n_mask_items (int/float, optional): how much to randomly sparsify dense data on each iteration. Defaults to masking out 20% of observed values. **Ignored if input data is already sparse.**
return_agg (bool, optional): Return mean and std over repetitions rather than the repetitions themselves. Defaults to True.
return_full_performance (bool, optional): return the performance against both "observed" and "missing" or just "missing" values if using dense data and `n_iter`. Likewise return performance of both "train" and "test" or just "test" splits if using sparse data and `n_folds`; Default False
agg_stats (list): string names of statistics to compute over repetitions. Must be accepted by `pd.DataFrame.agg`; Default ('mean', 'std')
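Putting the pieces together, here is a sketch of the high-level workflow from the README snippet touched in this PR. The exact call pattern for `estimate_performance` is an assumption (the hunk above starts at its `data` parameter, so the full signature is not visible here), and the long-format dataframe is made up; check neighbors/utils.py before copying this:

```python
import numpy as np
import pandas as pd
from neighbors.models import NNMF_sgd
from neighbors.utils import create_user_item_matrix, estimate_performance

# Hypothetical long-format ratings with the 'User', 'Item', 'Rating' columns the
# README example assumes; keep ~half the pairs so the resulting matrix is sparse
users = [f"user_{i}" for i in range(20)]
items = [f"item_{j}" for j in range(30)]
df = pd.DataFrame(
    [(u, i, np.random.rand()) for u in users for i in items],
    columns=["User", "Item", "Rating"],
).sample(frac=0.5, random_state=0)

# Convert to a (possibly sparse) user x item matrix, as in the README snippet
ratings = create_user_item_matrix(df)

# Assumed call pattern: the model class first, then the user x item dataframe.
# Since the input is already sparse, n_mask_items is ignored and n_folds CV is used.
results = estimate_performance(NNMF_sgd, ratings, n_folds=5)
```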
2 changes: 1 addition & 1 deletion neighbors/version.py
@@ -1 +1 @@
__version__ = "0.1.0"
__version__ = "0.1.1"
6 changes: 5 additions & 1 deletion requirements-dev.txt
@@ -3,4 +3,8 @@ black
mkdocs
mkdocs-material
mkdocstrings
mkdocs-jupyter
mkdocs-jupyter
pycodestyle
pytest
pytest-sugar
pytest-xdist