0.1.1 release #37

Open
wants to merge 4 commits into master
2 changes: 1 addition & 1 deletion README.md
@@ -19,7 +19,7 @@ The best way to learn how to use the package is by checking out the [ documentat

```python
from neighbors.models import NNMF_sgd
from neighbors.utils create_user_item_matrix, estimate_performance
from neighbors.utils import create_user_item_matrix, estimate_performance

# Assuming data is 3 column pandas df with 'User', 'Item', 'Rating'
# convert it to a (possibly sparse) user x item matrix
2 changes: 1 addition & 1 deletion docs/index.md
@@ -19,7 +19,7 @@ The best way to learn how to use the package is by checking out the 3 usage tuto

```python
from neighbors.models import NNMF_sgd
from neighbors.utils create_user_item_matrix, estimate_performance
from neighbors.utils import create_user_item_matrix, estimate_performance

# Assuming data is 3 column pandas df with 'User', 'Item', 'Rating'
# convert it to a (possibly sparse) user x item matrix
44 changes: 34 additions & 10 deletions neighbors/tests/test_utils.py
@@ -13,6 +13,7 @@
flatten_dataframe,
unflatten_dataframe,
split_train_test,
get_sparsity,
Mean,
)

@@ -129,17 +130,23 @@ def test_create_sparse_mask(simulate_wide_data):
expected_items = int(simulate_wide_data.shape[1] * (1 - 0.10))
assert mask.shape == simulate_wide_data.shape
assert all(mask.sum(1) == expected_items)
assert mask.index.name == simulate_wide_data.index.name
assert mask.columns.name == simulate_wide_data.columns.name

mask = create_sparse_mask(simulate_wide_data, n_mask_items=19)
assert mask.shape == simulate_wide_data.shape
expected_items = int(simulate_wide_data.shape[1] - 19)
assert all(mask.sum(1) == expected_items)
assert mask.index.name == simulate_wide_data.index.name
assert mask.columns.name == simulate_wide_data.columns.name

masked_data = simulate_wide_data[mask]
assert isinstance(masked_data, pd.DataFrame)
assert masked_data.shape == simulate_wide_data.shape
assert ~simulate_wide_data.isnull().any().any()
assert masked_data.isnull().any().any()
assert mask.index.name == simulate_wide_data.index.name
assert mask.columns.name == simulate_wide_data.columns.name


def test_flatten_dataframe(simulate_wide_data):
@@ -151,15 +158,8 @@ def test_flatten_dataframe(simulate_wide_data):

def test_unflatten_dataframe(simulate_wide_data):
out = flatten_dataframe(simulate_wide_data)
new = unflatten_dataframe(
out, index=simulate_wide_data.index, columns=simulate_wide_data.columns
)
assert new.equals(simulate_wide_data)
new = unflatten_dataframe(out)
new = unflatten_dataframe(out, like_dataframe=simulate_wide_data)
assert new.equals(simulate_wide_data)
new = unflatten_dataframe(
out, num_rows=simulate_wide_data.shape[0], num_cols=simulate_wide_data.shape[1]
)


def test_split_train_test(simulate_wide_data):
@@ -171,7 +171,10 @@
assert train.notnull().sum().sum() == int(4 / 5 * simulate_wide_data.size)
# The test split should be missing 4/5 of the dense data, i.e. it holds 1/5 of the data (one fold)
assert test.notnull().sum().sum() == int(1 / 5 * simulate_wide_data.size)
assert train.add(test, fill_value=0).equals(simulate_wide_data)
# Put them together and there should be no null values
full = train.add(test, fill_value=0)
assert not full.isnull().any().any()
assert full.equals(simulate_wide_data)

# Sparse data
mask = create_sparse_mask(simulate_wide_data, n_mask_items=0.1)
@@ -184,9 +187,30 @@
# Train and test should be more sparse than original
assert train.isnull().sum().sum() > masked_data.isnull().sum().sum()
assert test.isnull().sum().sum() > masked_data.isnull().sum().sum()
assert train.add(test, fill_value=0).equals(masked_data)

# Put them together and sparsity should match the n_mask_items used above
full = train.add(test, fill_value=0)
# Also testing get_sparsity
effective_sparsity = get_sparsity(full)
assert np.allclose(effective_sparsity, 0.1)
assert full.equals(masked_data)

train_not_null = train.notnull().sum().sum()
test_not_null = test.notnull().sum().sum()
# And adhere closely to the expected train/test split
assert np.allclose(train_not_null / masked_not_null, 0.8, atol=0.12)
assert np.allclose(test_not_null / masked_not_null, 0.2, atol=0.12)

# Numeric column names
s = simulate_wide_data.copy()
s.columns = range(s.shape[1])
splits = split_train_test(s, n_folds=5)
assert all(train.shape == s.shape and test.shape == s.shape for train, test in splits)
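The `get_sparsity` helper imported and exercised above is new in this release; from the assertions it appears to return the fraction of missing (NaN) cells in a user x item dataframe. A rough sketch under that assumption (assuming it lives in neighbors.utils alongside the other helpers; the dataframe below is made up for illustration):

```python
import numpy as np
import pandas as pd
from neighbors.utils import get_sparsity  # new helper exercised by the tests above

# Hypothetical 4 x 5 ratings matrix with 2 of its 20 cells missing
ratings = pd.DataFrame(np.arange(20.0).reshape(4, 5))
ratings.iloc[0, 0] = np.nan
ratings.iloc[1, 2] = np.nan

# Assuming get_sparsity returns the fraction of NaN cells, this prints 0.1
print(get_sparsity(ratings))
```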
80 changes: 23 additions & 57 deletions neighbors/utils.py
@@ -206,10 +206,13 @@ def create_sparse_mask(data, n_mask_items=0.2, random_state=None):
for _ in range(data.shape[0])
]
)
return pd.DataFrame(mask, index=data.index, columns=data.columns)
out = pd.DataFrame(mask, index=data.index, columns=data.columns)
out.index.name = data.index.name
out.columns.name = data.columns.name
return out
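A brief sketch of what the change above buys (the frame and its axis names are illustrative): the boolean mask returned by `create_sparse_mask` now carries over the axis names of its input, which is what the new test assertions check.

```python
import numpy as np
import pandas as pd
from neighbors.utils import create_sparse_mask

ratings = pd.DataFrame(np.random.rand(10, 20))
ratings.index.name = "User"
ratings.columns.name = "Item"

mask = create_sparse_mask(ratings, n_mask_items=0.2)
assert mask.index.name == ratings.index.name
assert mask.columns.name == ratings.columns.name

# Indexing through the mask hides ~20% of each user's items as NaN
sparse_ratings = ratings[mask]
```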


def flatten_dataframe(data: pd.DataFrame) -> list:
def flatten_dataframe(data: pd.DataFrame) -> np.ndarray:
"""
Given a 2d dataframe, return a numpy array of (row_idx, col_idx, val) triplets. This function is analogous to numpy.ravel or ndarray.flatten for arrays, with the addition of the row and column indices for each value

@@ -229,60 +232,38 @@ def flatten_dataframe(data: pd.DataFrame) -> list:

def unflatten_dataframe(
data: np.ndarray,
columns=None,
index=None,
num_rows=None,
num_cols=None,
index_name=None,
columns_name=None,
like_dataframe: pd.DataFrame,
) -> pd.DataFrame:
"""
Reverse a flatten_dataframe operation to reconstruct the original unflattened dataframe

Args:
data (np.ndarray): n_items x 3 numpy array where columns represent row_idx, col_idx, and val at the location.
columns (list, optional): column names of new dataframe. Defaults to None.
index (list, optional): row names of new dataframe. Defaults to None.
num_rows (int, optional): total number of rows. Useful if the flattened dataframe had a non-numerical non-ordered index. Default None which uses the max(row_idx)
num_cols (int, optional): total number of cols. Useful if the flattened dataframe had a non-numerical non-ordered index. Default None which uses the max(col_idx)
index_name (str; optional): Name of rows; Default None
columns_name (str; optional): Name of columns; Default None
column_order (iterable; optional): Order of columns; Default lexicographic order based on input array
row_order (iterable; optional): Order of rows; Default lexicographic order based on input array
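like_dataframe (pd.DataFrame): dataframe whose shape, index, columns, and axis names the reconstructed output should match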

Returns:
pd.DataFrame: original unflattened dataframe
"""

if not isinstance(data, np.ndarray):
raise TypeError("input should be a numpy array")
if index is None and num_rows is None:
index = list(dict.fromkeys(data[:, 0]))
num_rows = len(index)
elif index is not None and num_rows is None:
num_rows = len(index)
elif index is None and num_rows is not None:
index = list(dict.fromkeys(data[:, 0]))
if len(index) != num_rows:
raise ValueError(
"num_rows does not match the number of unique row_idx values in data"
)
if columns is None and num_cols is None:
columns = list(dict.fromkeys(data[:, 1]))
num_cols = len(columns)
elif columns is not None and num_cols is None:
num_cols = len(columns)
elif columns is None and num_cols is not None:
columns = list(dict.fromkeys(data[:, 1]))
if len(columns) != num_cols:
raise ValueError(
"num_cols does not match the number of unique col_idx values in data"
)
out = np.empty((num_rows, num_cols))
if not isinstance(like_dataframe, pd.DataFrame):
raise TypeError("like_dataframe should be a pandas dataframe")

out = np.empty((like_dataframe.shape[0], like_dataframe.shape[1]))
out[:] = np.nan
out = pd.DataFrame(out, index=index, columns=columns)
out = pd.DataFrame(out, index=like_dataframe.index, columns=like_dataframe.columns)

for elem in data:
out.loc[elem[0], elem[1]] = np.float(elem[2])
out.index.name = index_name
out.columns.name = columns_name
row = elem[0].astype(type(like_dataframe.index[0]))
col = elem[1].astype(type(like_dataframe.columns[0]))
out.loc[row, col] = float(elem[2])
out.index.name = like_dataframe.index.name
out.columns.name = like_dataframe.columns.name

return out
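For context on the simplified signature, a minimal round-trip sketch (labels are illustrative; the round-trip guarantee itself is what the updated test_unflatten_dataframe checks):

```python
import numpy as np
import pandas as pd
from neighbors.utils import flatten_dataframe, unflatten_dataframe

wide = pd.DataFrame(
    np.random.rand(3, 4),
    index=["user_1", "user_2", "user_3"],
    columns=["item_1", "item_2", "item_3", "item_4"],
)

# flatten_dataframe returns an array of (row_label, col_label, value) rows
flat = flatten_dataframe(wide)

# Shape, labels, and axis names are now all taken from like_dataframe, replacing
# the old index/columns/num_rows/num_cols/index_name/columns_name arguments
rebuilt = unflatten_dataframe(flat, like_dataframe=wide)
assert rebuilt.shape == wide.shape
assert list(rebuilt.columns) == list(wide.columns)
```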


@@ -345,7 +326,6 @@ def split_train_test(
"""

random_state = check_random_state(random_state)
num_rows, num_cols = data.shape
flat = flatten_dataframe(data)
if shuffle:
random_state.shuffle(flat)
@@ -359,22 +339,8 @@
train = np.array([elem for elem in chain(flat[:start], flat[stop:])])
test = np.array([elem for elem in flat[start:stop]])

yield unflatten_dataframe(
train,
num_rows=num_rows,
num_cols=num_cols,
index=data.index,
columns=data.columns,
index_name=data.index.name,
columns_name=data.columns.name,
), unflatten_dataframe(
test,
num_rows=num_rows,
num_cols=num_cols,
index=data.index,
columns=data.columns,
index_name=data.index.name,
columns_name=data.columns.name,
yield unflatten_dataframe(train, like_dataframe=data), unflatten_dataframe(
test, like_dataframe=data
)
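With reconstruction delegated to like_dataframe, each fold comes back shaped and labeled like the input, which is what the new shape assertions in the tests rely on. A short usage sketch with a hypothetical ratings frame:

```python
import numpy as np
import pandas as pd
from neighbors.utils import split_train_test

ratings = pd.DataFrame(np.random.rand(10, 20))  # hypothetical users x items matrix

# split_train_test is a generator: each of the n_folds iterations yields a
# (train, test) pair shaped like the input, with held-out cells set to NaN
for train, test in split_train_test(ratings, n_folds=5):
    assert train.shape == ratings.shape
    assert test.shape == ratings.shape
```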


@@ -403,7 +369,7 @@ def estimate_performance(
data (pd.DataFrame): a users x item dataframe
n_iter (int, optional): number of repetitions for dense data. Defaults to 10.
n_folds (int, optional): number of folds for CV on sparse data. Defaults to 10.
n_mask_items (int/float, optional): how much randomly sparsify dense data each iteration; Defaults to masking out 20% of observed values
n_mask_items (int/float, optional): how much to randomly sparsify dense data on each iteration. Defaults to masking out 20% of observed values. **Ignored if input data is already sparse.**
return_agg (bool, optional): Return mean and std over repetitions rather than the repetitions themselves. Defaults to True.
return_full_performance (bool, optional): return the performance against both "observed" and "missing" or just "missing" values if using dense data and `n_iter`. Likewise return performance of both "train" and "test" or just "test" splits if using sparse data and `n_folds`; Default False
agg_stats (list): string names of statistics to compute over repetitions. Must be accepted by `pd.DataFrame.agg`; Default ('mean', 'std')
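Putting the pieces together, here is a sketch of the high-level workflow from the README snippet touched in this PR. The exact call pattern for `estimate_performance` is an assumption (the hunk above starts at its `data` parameter, so the full signature is not visible here), and the long-format dataframe is made up; check neighbors/utils.py before copying this:

```python
import numpy as np
import pandas as pd
from neighbors.models import NNMF_sgd
from neighbors.utils import create_user_item_matrix, estimate_performance

# Hypothetical long-format ratings with the 'User', 'Item', 'Rating' columns the
# README example assumes; keep ~half the pairs so the resulting matrix is sparse
users = [f"user_{i}" for i in range(20)]
items = [f"item_{j}" for j in range(30)]
df = pd.DataFrame(
    [(u, i, np.random.rand()) for u in users for i in items],
    columns=["User", "Item", "Rating"],
).sample(frac=0.5, random_state=0)

# Convert to a (possibly sparse) user x item matrix, as in the README snippet
ratings = create_user_item_matrix(df)

# Assumed call pattern: the model class first, then the user x item dataframe.
# Since the input is already sparse, n_mask_items is ignored and n_folds CV is used.
results = estimate_performance(NNMF_sgd, ratings, n_folds=5)
```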
2 changes: 1 addition & 1 deletion neighbors/version.py
@@ -1 +1 @@
__version__ = "0.1.0"
__version__ = "0.1.1"
6 changes: 5 additions & 1 deletion requirements-dev.txt
@@ -3,4 +3,8 @@ black
mkdocs
mkdocs-material
mkdocstrings
mkdocs-jupyter
mkdocs-jupyter
pycodestyle
pytest
pytest-sugar
pytest-xdist