diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..331d387 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,2 @@ +.git +Dockerfile diff --git a/.travis.yml b/.travis.yml index b3220eb..2d97b0b 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,79 +1,21 @@ language: c + env: - TRAVIS_PYTHON_VERSION="2.7" - - TRAVIS_PYTHON_VERSION="3.4" - - TRAVIS_PYTHON_VERSION="3.5" - -matrix: - include: - - os: linux - dist: trusty - # - os: osx - # osx_image: xcode8.3 + - TRAVIS_PYTHON_VERSION="3.6" git: submodules: false before_install: - # fastFM-core depends on cblas - - if [ "$TRAVIS_OS_NAME" = "linux" ]; then sudo apt-get update -qq; sudo apt-get install -y libopenblas-dev; fi - - if [[ "$TRAVIS_PYTHON_VERSION" =~ "^2" ]]; then - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - wget https://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh; - else - wget https://repo.continuum.io/miniconda/Miniconda-latest-MacOSX-x86_64.sh -O miniconda.sh; - fi - else - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; - else - wget https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -O miniconda.sh; - fi - fi - - bash miniconda.sh -b -p $HOME/miniconda - - export PATH="$HOME/miniconda/bin:$PATH" - - hash -r - - conda config --set always_yes yes --set changeps1 no - - conda update -q conda - # Useful for debugging any issues with conda - - conda info -a - - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION cython numpy pandas scipy scikit-learn nose - - source activate test-environment - # use credentials - sed -i -e "s|git@github.com:|https://$CI_TOKEN@github.com/|" .gitmodules install: - git submodule update --init --recursive - - cd fastFM-core2 - - | - if [ "$TRAVIS_OS_NAME" = "linux" ]; then - cmake -H. -B_lib -DCMAKE_BUILD_TYPE=Debug -DFASTFM_MINIMAL=ON -DCMAKE_DEBUG_POSTFIX=d - cmake --build _lib - else - cmake -H. 
-B_lib -DFASTFM_MINIMAL=ON -GXcode - cmake --build _lib --config Debug - mv _lib/fastFM/Debug/libfastFMd.a _lib/fastFM/ - fi - - ls _lib/fastFM - - cd .. - - make - - python setup.py bdist_wheel - - pip install dist/*.whl + - docker build --build-arg TRAVIS_PYTHON_VERSION=$TRAVIS_PYTHON_VERSION -t fastfm-test . script: - - nosetests - -before_deploy: - - export RELEASE_PKG_FILE=$(ls dist/*.whl) - - echo "deploying $RELEASE_PKG_FILE to GitHub releases" + - docker run --rm -i -v $(pwd):/fastfm/ fastfm-test /bin/bash -s < docker_run_tests.sh -deploy: - provider: releases - api_key: - secure: AJcZoe2+OiMJ4VlSkASAeMc/ii0ZRnj2PFaaL7zlSbx1THMpY/49U5BSyqX1PQioPSlTV3ZsIXI3u7KyqoXIQSXWzAuaBzpLTLS85fGSuTvUuexmaJtKU92OC143tuVVLCPnjC992+1uyctjrxMSqgoaUolfYkEftt5RGrMIKl2duGfDXrPXIueHSl8FQGXkmlY6NqkRx2v5kxsAjFcurvwTNU8ptJ84jVKjrE6t1IB61vp2eUcqVR/z6Lwau6mdvIybglnbH4lCMXP98zEIibLA8vbn3XxrC+0uU7Kjz37K6/CsJEPNL5tujJDMRKAupnrkgPsAGTpsAn6O6uLUz0ISgcen8R6KJ7cBli+cq08OZ3JLLoJpqkni62YVSQV+uYkQk9b5Pu09vUTOozJMnOqLSj9hVIswyxGiFPcTFskMgqMdx15M59gd0YpXH633YqwBgRmWNsctp4BKnTaE3iGW6aZc8lrXxpL7qcVAosjmpjLp3jiPXVSRdYf0yHl6pDUj5ZVyu27kAn1/I9JL0nH19zjXF2tUlEjuT9ydHwnhmsgBN/V+JhZxi7ZeEbOZfY1MfekKM/NwSRehVEp/J0XWqWg+kIXRU/rqY1/w0vLVNFeQirpEjUp39eCBydXeS3Bik8uANW2UTxojJo3LBfLLoAT8ZWFb3YrIBAYkzjc= - file: "${RELEASE_PKG_FILE}" - skip_cleanup: true - on: - tags: true diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..af6af14 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,31 @@ +FROM ubuntu:16.04 + +MAINTAINER Immanuel Bayer + +ENV LANG=C.UTF-8 LC_ALL=C.UTF-8 + +USER root + +RUN apt-get update --fix-missing && apt-get install -y wget bzip2 ca-certificates \ + libglib2.0-0 libxext6 libsm6 libxrender1 \ + build-essential cmake git + +# Download and install miniconda. 
+RUN echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \ + wget --quiet https://repo.continuum.io/miniconda/Miniconda2-4.3.27-Linux-x86_64.sh -O ~/miniconda.sh && \ + /bin/bash ~/miniconda.sh -b -p /opt/conda && \ + rm ~/miniconda.sh + +ENV PATH /opt/conda/bin:$PATH + +RUN conda config --set always_yes yes --set changeps1 no +RUN conda update -q conda + +# Setup test virtual env +ARG TRAVIS_PYTHON_VERSION=3 +ENV PY_VERSION=$TRAVIS_PYTHON_VERSION + +RUN conda update -q conda && \ + conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION \ + cython numpy pandas scipy scikit-learn nose + diff --git a/Makefile b/Makefile index 613a534..40d5f8a 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,11 @@ +PYTHON ?= python + all: + ( cd fastFM-core2 ; \ + cmake -H. -B_lib -DCMAKE_BUILD_TYPE=Debug -DCMAKE_DEBUG_POSTFIX=d; \ + cmake --build _lib; ) ( cd fastFM-core ; $(MAKE) lib ) - python setup.py build_ext --inplace + $(PYTHON) setup.py build_ext --inplace .PHONY : clean clean: diff --git a/docker_run_tests.sh b/docker_run_tests.sh new file mode 100644 index 0000000..1e3aa0c --- /dev/null +++ b/docker_run_tests.sh @@ -0,0 +1,6 @@ +source activate test-environment +# Build fastFM-core +cd /fastfm/ +make +pip install . 
+nosetests diff --git a/fastFM-core2 b/fastFM-core2 index 832fa76..d28a678 160000 --- a/fastFM-core2 +++ b/fastFM-core2 @@ -1 +1 @@ -Subproject commit 832fa76f2677e401710d6ac7512390e6b8f8a0ad +Subproject commit d28a678de422d3fb5593df114072764a53607b41 diff --git a/fastFM/als.py b/fastFM/als.py index 659076d..f24744c 100644 --- a/fastFM/als.py +++ b/fastFM/als.py @@ -2,11 +2,13 @@ # License: BSD 3 clause import ffm +import ffm2 import numpy as np from sklearn.base import RegressorMixin from .validation import check_consistent_length, check_array from .base import (FactorizationMachine, BaseFMClassifier, - _validate_class_labels, _check_warm_start) + _validate_class_labels, _check_warm_start, + _init_parameter, _settings_factory) class FMRegression(FactorizationMachine, RegressorMixin): @@ -63,9 +65,11 @@ def __init__(self, n_iter=100, init_stdev=0.1, rank=8, random_state=123, self.l2_reg_w = l2_reg_w self.l2_reg_V = l2_reg_V self.l2_reg = l2_reg - self.task = "regression" + self.loss = "squared" + self.solver = "cd" + self.iter_count = 0 - def fit(self, X_train, y_train, n_more_iter=0): + def fit(self, X, y, n_more_iter=0): """ Fit model with specified loss. Parameters @@ -78,27 +82,24 @@ def fit(self, X_train, y_train, n_more_iter=0): Number of iterations to continue from the current Coefficients. 
""" + check_consistent_length(X, y) + y = check_array(y, ensure_2d=False, dtype=np.float64) - check_consistent_length(X_train, y_train) - y_train = check_array(y_train, ensure_2d=False, dtype=np.float64) + X = check_array(X, accept_sparse="csc", dtype=np.float64) + n_features = X.shape[1] - X_train = check_array(X_train, accept_sparse="csc", dtype=np.float64, - order="F") - self.n_iter = self.n_iter + n_more_iter + if self.iter_count == 0: + self.w0_, self.w_, self.V_ = _init_parameter(self, n_features) - if n_more_iter > 0: - _check_warm_start(self, X_train) - self.warm_start = True + if n_more_iter != 0: + _check_warm_start(self, X) + self.n_iter = n_more_iter - self.w0_, self.w_, self.V_ = ffm.ffm_als_fit(self, X_train, y_train) + settings_dict = _settings_factory(self) + ffm2.ffm_fit(self.w0_, self.w_, self.V_, X, y, self.rank, + settings_dict) - if self.iter_count != 0: - self.iter_count = self.iter_count + n_more_iter - else: - self.iter_count = self.n_iter - - # reset to default setting - self.warm_start = False + self.iter_count += self.n_iter return self @@ -158,9 +159,11 @@ def __init__(self, n_iter=100, init_stdev=0.1, rank=8, random_state=123, self.l2_reg_w = l2_reg_w self.l2_reg_V = l2_reg_V self.l2_reg = l2_reg - self.task = "classification" + self.loss = "squared" + self.solver = "cd" + self.iter_count = 0 - def fit(self, X_train, y_train): + def fit(self, X, y): """ Fit model with specified loss. Parameters @@ -168,24 +171,25 @@ def fit(self, X_train, y_train): X : scipy.sparse.csc_matrix, (n_samples, n_features) y : float | ndarray, shape = (n_samples, ) - the targets have to be encodes as {-1, 1}. + the targets have to be encoded as {-1, 1}. 
""" - check_consistent_length(X_train, y_train) - X_train = check_array(X_train, accept_sparse="csc", dtype=np.float64, - order="F") - y_train = _validate_class_labels(y_train) + check_consistent_length(X, y) + + X = check_array(X, accept_sparse="csc", dtype=np.float64, + order="F") + y = _validate_class_labels(y) + + self.classes_ = np.unique(y) - self.classes_ = np.unique(y_train) if len(self.classes_) != 2: raise ValueError("This solver only supports binary classification" " but the data contains" " class: %r" % self.classes_) - # fastFM-core expects labels to be in {-1,1} - y_train = y_train.copy() - i_class1 = (y_train == self.classes_[0]) - y_train[i_class1] = -1 - y_train[~i_class1] = 1 + self.w0_, self.w_, self.V_ = _init_parameter(self, X.shape[1]) + + settings_dict = _settings_factory(self) + ffm2.ffm_fit(self.w0_, self.w_, self.V_, X, y, self.rank, + settings_dict) - self.w0_, self.w_, self.V_ = ffm.ffm_als_fit(self, X_train, y_train) return self diff --git a/fastFM/base.py b/fastFM/base.py index ae7604c..19aaef2 100644 --- a/fastFM/base.py +++ b/fastFM/base.py @@ -4,17 +4,40 @@ import numpy as np import scipy.sparse as sp from scipy.stats import norm +from scipy.special import expit as sigmoid from sklearn.base import BaseEstimator, ClassifierMixin +from sklearn.utils import check_random_state +import ffm2 from .validation import check_array -import ffm + + +def _init_parameter(fm, n_features): + generator = check_random_state(fm.random_state) + w0 = np.zeros(1, dtype=np.float64) + w = np.zeros(n_features, dtype=np.float64) + V = generator.normal(loc=0.0, scale=fm.init_stdev, + size=(fm.rank, n_features)) + return w0, w, V + + +def _settings_factory(fm): + settings_dict = fm.get_params() + settings_dict['loss'] = fm.loss + settings_dict['solver'] = fm.solver + + # TODO align naming + settings_dict['iter'] = int(settings_dict['n_iter']) + del settings_dict['n_iter'] + + return settings_dict def _validate_class_labels(y): - assert len(set(y)) == 2 - 
assert y.min() == -1 - assert y.max() == 1 - return check_array(y, ensure_2d=False, dtype=np.float64) + assert len(set(y)) == 2 + assert y.min() == -1 + assert y.max() == 1 + return check_array(y, ensure_2d=False, dtype=np.float64) def _check_warm_start(fm, X): @@ -82,6 +105,7 @@ def __init__(self, n_iter=100, init_stdev=0.1, rank=8, random_state=123, self.step_size = 0 self.copy_X = copy_X + def predict(self, X_test): """ Return predictions @@ -99,12 +123,12 @@ def predict(self, X_test): order="F") assert sp.isspmatrix_csc(X_test) assert X_test.shape[1] == len(self.w_) - return ffm.ffm_predict(self.w0_, self.w_, self.V_, X_test) + return ffm2.ffm_predict(self.w0_, self.w_, self.V_, X_test) class BaseFMClassifier(FactorizationMachine, ClassifierMixin): - def predict(self, X_test): + def predict(self, X_test, threshold=0.5): """ Return predictions Parameters @@ -117,6 +141,13 @@ def predict(self, X_test): y : array, shape (n_samples) Class labels """ + + if self.loss == "logistic": + y_proba = self.predict_proba(X_test) + y_binary = np.ones_like(y_proba, dtype=np.float64) + y_binary[y_proba < threshold] = -1 + return y_binary + y_proba = norm.cdf(super(BaseFMClassifier, self).predict(X_test)) # convert probs to labels y_pred = np.zeros_like(y_proba, dtype=np.float64) + self.classes_[0] @@ -136,5 +167,10 @@ def predict_proba(self, X_test): y : array, shape (n_samples) Class Probability for the class with smaller label. 
""" + + if self.loss == "logistic": + pred = super(BaseFMClassifier, self).predict(X_test) + return sigmoid(pred) + pred = super(BaseFMClassifier, self).predict(X_test) return norm.cdf(pred) diff --git a/fastFM/bpr.py b/fastFM/bpr.py index 2805fdc..be092fc 100644 --- a/fastFM/bpr.py +++ b/fastFM/bpr.py @@ -92,4 +92,5 @@ def fit(self, X, pairs): assert pairs.max() <= X.shape[1] assert pairs.min() >= 0 self.w0_, self.w_, self.V_ = ffm.ffm_fit_sgd_bpr(self, X, pairs) + self.w0_ = np.array([self.w0_], dtype=np.float64) return self diff --git a/fastFM/cffm.pxd b/fastFM/cffm.pxd index 9e600ea..655d100 100644 --- a/fastFM/cffm.pxd +++ b/fastFM/cffm.pxd @@ -33,15 +33,9 @@ cdef extern from "../fastFM-core/include/ffm.h": void ffm_predict(double *w_0, double * w, double * V, cs_di *X, double *y_pred, int k) - void ffm_als_fit(double *w_0, double *w, double *V, - cs_di *X, double *y, ffm_param *param) - void ffm_mcmc_fit_predict(double *w_0, double *w, double *V, cs_di *X_train, cs_di *X_test, double *y_train, double *y_pred, ffm_param *param) - void ffm_sgd_fit(double *w_0, double *w, double *V, - cs_di *X, double *y, ffm_param *param) - void ffm_sgd_bpr_fit(double *w_0, double *w, double *V, cs_di *X, double *pairs, int n_pairs, ffm_param *param) diff --git a/fastFM/cpp_ffm.pxd b/fastFM/cpp_ffm.pxd index 8d8af41..14b29ad 100644 --- a/fastFM/cpp_ffm.pxd +++ b/fastFM/cpp_ffm.pxd @@ -2,11 +2,14 @@ # License: BSD 3 clause #distutils: language=c++ +from libcpp.string cimport string +from libcpp cimport bool cdef extern from "../fastFM-core2/fastFM/fastfm.h" namespace "fastfm": cdef cppclass Settings: Settings() + Settings(string settings) cdef cppclass Model: Model() @@ -19,7 +22,8 @@ cdef extern from "../fastFM-core2/fastFM/fastfm.h" namespace "fastfm": cdef cppclass Data: Data() void add_design_matrix(int n_samples, int n_features, int nnz, - int* outer_ptr, int* inter_ptr, double* data) + int* outer_ptr, int* inter_ptr, double* data, + bool is_col_major) void 
add_target(const int n_samples, double *data) void add_prediction(const int n_samples, double* data) diff --git a/fastFM/datasets.py b/fastFM/datasets.py index 5832155..2402069 100644 --- a/fastFM/datasets.py +++ b/fastFM/datasets.py @@ -43,7 +43,7 @@ def make_user_item_regression(random_state=123, n_user=20, n_item=20, if __name__ == '__main__': X, y, coef = make_user_item_regression(n_user=5, n_item=5, rank=2, label_stdev=2) - from sklearn.cross_validation import train_test_split + from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42) diff --git a/fastFM/ffm.pyx b/fastFM/ffm.pyx index 344ceb3..26ef1d2 100644 --- a/fastFM/ffm.pyx +++ b/fastFM/ffm.pyx @@ -88,57 +88,6 @@ def ffm_predict(double w_0, double[:] w, return y -def ffm_als_fit(fm, X, double[:] y): - assert X.shape[0] == len(y) # test shapes - n_features = X.shape[1] - X_ = CsMatrix(X) - pt_X = PyCapsule_GetPointer(X_, "CsMatrix") - param = FFMParam(fm) - pt_param = PyCapsule_GetPointer(param, "FFMParam") - cdef double w_0 - cdef np.ndarray[np.float64_t, ndim=1, mode='c'] w - cdef np.ndarray[np.float64_t, ndim=2, mode='c'] V - - if fm.warm_start: - w_0 = 0 if fm.ignore_w_0 else fm.w0_ - w = np.zeros(n_features, dtype=np.float64) if fm.ignore_w else fm.w_ - V = np.zeros((fm.rank, n_features), dtype=np.float64)\ - if fm.rank == 0 else fm.V_ - else: - w_0 = 0 - w = np.zeros(n_features, dtype=np.float64) - V = np.zeros((fm.rank, n_features), dtype=np.float64) - - cffm.ffm_als_fit(&w_0, w.data, V.data, - pt_X, &y[0], pt_param) - return w_0, w, V - - -def ffm_sgd_fit(fm, X, double[:] y): - """ - The sgd solver expects a transposed design matrix in column major order - (csc_matrix) Samples are stored in columns, this allows fast sample by - sample access. 
- """ - assert X.shape[1] == len(y) # test shapes - n_features = X.shape[0] - X_ = CsMatrix(X) - pt_X = PyCapsule_GetPointer(X_, "CsMatrix") - param = FFMParam(fm) - pt_param = PyCapsule_GetPointer(param, "FFMParam") - - # allocate the coefs - cdef double w_0 = 0 - cdef np.ndarray[np.float64_t, ndim=1, mode='c'] w =\ - np.zeros(n_features, dtype=np.float64) - cdef np.ndarray[np.float64_t, ndim=2, mode='c'] V =\ - np.zeros((fm.rank, n_features), dtype=np.float64) - - cffm.ffm_sgd_fit(&w_0, w.data, V.data, - pt_X, &y[0], pt_param) - return w_0, w, V - - def ffm_fit_sgd_bpr(fm, X, np.ndarray[np.float64_t, ndim=2, mode='c'] pairs): n_features = X.shape[0] X_ = CsMatrix(X) @@ -208,4 +157,4 @@ def ffm_mcmc_fit_predict(fm, X_train, X_test, double[:] y): &y[0], y_pred.data, pt_param) fm.hyper_param_ = hyper_param - return (w_0, w, V), y_pred \ No newline at end of file + return (w_0, w, V), y_pred diff --git a/fastFM/ffm2.pyx b/fastFM/ffm2.pyx index 20af616..4d2cc0d 100644 --- a/fastFM/ffm2.pyx +++ b/fastFM/ffm2.pyx @@ -1,46 +1,74 @@ # Author: Immanuel Bayer # License: BSD 3 clause +import json + cimport cpp_ffm from cpp_ffm cimport Settings, Data, Model, predict, fit from libcpp.memory cimport nullptr +from libcpp.string cimport string + +import scipy.sparse as sp cimport numpy as np import numpy as np -def ffm_predict(double w_0, double[:] w, - np.ndarray[np.float64_t, ndim = 2] V, X): - assert X.shape[1] == len(w) - assert X.shape[1] == V.shape[1] +cdef Model* _model_factory(np.ndarray[np.float64_t, ndim = 1] w_0, + np.ndarray[np.float64_t, ndim = 1] w, + np.ndarray[np.float64_t, ndim = 2] V): + + cdef Model *m = new Model() + rank = V.shape[0] + n_features = V.shape[1] + + m.add_parameter( w_0.data) + m.add_parameter( w.data, n_features) + m.add_parameter( V.data, rank, n_features, 2) + + return m + + +cdef Data* _data_factory(X, np.ndarray[np.float64_t, ndim = 1] y_pred): # get attributes from csc scipy n_features = X.shape[1] n_samples = X.shape[0] nnz = 
X.count_nonzero() + if not (sp.isspmatrix_csc(X) or sp.isspmatrix_csr(X)): + raise ValueError("matrix format is not supported") + cdef np.ndarray[int, ndim=1, mode='c'] inner = X.indices cdef np.ndarray[int, ndim=1, mode='c'] outer = X.indptr cdef np.ndarray[np.float64_t, ndim=1, mode='c'] data = X.data - assert n_features == w.shape[0] - assert n_features == V.shape[1] + cdef Data *d = new Data() + d.add_design_matrix(n_samples, n_features, nnz, &outer[0], &inner[0], + &data[0], sp.isspmatrix_csc(X)) + d.add_prediction(n_samples, &y_pred[0]) + return d - rank = V.shape[0] - cdef np.ndarray[np.float64_t, ndim=1, mode='c'] y =\ - np.zeros(X.shape[0], dtype=np.float64) +# cython doesn't support function overloading +cdef Data* _data_factory_fit(X, np.ndarray[np.float64_t, ndim = 1] y_true, + np.ndarray[np.float64_t, ndim = 1] y_pred): + d = _data_factory(X, y_pred) + d.add_target(X.shape[0], &y_true[0]) + return d - cdef Model *m = new Model() - cdef Data *d = new Data() - m.add_parameter(&w_0) - m.add_parameter(&w[0], n_features) - m.add_parameter( V.data, rank, n_features, 2) +def ffm_predict(np.ndarray[np.float64_t, ndim = 1] w_0, + np.ndarray[np.float64_t, ndim = 1] w, + np.ndarray[np.float64_t, ndim = 2] V, X): + assert X.shape[1] == len(w) + assert X.shape[1] == V.shape[1] - d.add_target(n_samples, &y[0]) - d.add_prediction(n_samples, &y[0]) - d.add_design_matrix(n_samples, n_features, nnz, &outer[0], &inner[0], - &data[0]) + # allocate memory for predictions + cdef np.ndarray[np.float64_t, ndim=1, mode='c'] y =\ + np.zeros(X.shape[0], dtype=np.float64) + + m = _model_factory(w_0, w, V) + d = _data_factory(X, y) cpp_ffm.predict(m, d) @@ -49,29 +77,22 @@ def ffm_predict(double w_0, double[:] w, return y -def ffm_als_fit(double w_0, double[:] w, np.ndarray[np.float64_t, ndim = 2] V, - X, double[:] y, int rank): - assert X.shape[0] == len(y) # test shapes - n_features = X.shape[1] - n_samples = X.shape[0] - nnz = X.count_nonzero() - cdef np.ndarray[int, ndim=1, mode='c'] 
inner = X.indices - cdef np.ndarray[int, ndim=1, mode='c'] outer = X.indptr - cdef np.ndarray[np.float64_t, ndim=1, mode='c'] data = X.data - cdef np.ndarray[np.float64_t, ndim=1, mode='c'] y_pred = np.zeros(n_samples, dtype=np.float64) +def ffm_fit(np.ndarray[np.float64_t, ndim = 1] w_0, + np.ndarray[np.float64_t, ndim = 1] w, + np.ndarray[np.float64_t, ndim = 2] V, + X, np.ndarray[np.float64_t, ndim = 1] y, int rank, dict settings): + assert isinstance(settings, dict) + assert X.shape[0] == len(y) # test shapes - cdef Data* d = new Data() - d.add_design_matrix(n_samples, n_features, nnz, &outer[0], &inner[0], &data[0]) - d.add_target(n_samples, &y[0]) - d.add_prediction(n_samples, &y_pred[0]) + cdef Settings* s = new Settings(json.dumps(settings).encode()) + m = _model_factory(w_0, w, V) - cdef Model* m = new Model() - m.add_parameter(&w_0) - m.add_parameter(&w[0], n_features) - m.add_parameter( V.data, rank, n_features, 2) + # allocate memory for prediction + cdef np.ndarray[np.float64_t, ndim=1, mode='c'] y_pred = np.zeros( + X.shape[0], dtype=np.float64) - cdef Settings* s = new Settings() + d = _data_factory_fit(X, y, y_pred) cpp_ffm.fit(s, m, d) @@ -79,4 +100,4 @@ def ffm_als_fit(double w_0, double[:] w, np.ndarray[np.float64_t, ndim = 2] V, del m del s - return w_0, w, V \ No newline at end of file + return w_0, w, V diff --git a/fastFM/mcmc.py b/fastFM/mcmc.py index 5880864..dcf2129 100644 --- a/fastFM/mcmc.py +++ b/fastFM/mcmc.py @@ -122,6 +122,7 @@ def fit_predict(self, X_train, y_train, X_test, n_more_iter=0): coef, y_pred = ffm.ffm_mcmc_fit_predict(self, X_train, X_test, y_train) self.w0_, self.w_, self.V_ = coef + self.w0_ = np.array([self.w0_], dtype=np.float64) self.prediction_ = y_pred self.warm_start = False @@ -231,4 +232,5 @@ def fit_predict_proba(self, X_train, y_train, X_test): coef, y_pred = ffm.ffm_mcmc_fit_predict(self, X_train, X_test, y_train) self.w0_, self.w_, self.V_ = coef + self.w0_ = np.array([self.w0_], dtype=np.float64) return 
y_pred diff --git a/fastFM/sgd.py b/fastFM/sgd.py index 27f9810..74d179e 100644 --- a/fastFM/sgd.py +++ b/fastFM/sgd.py @@ -2,12 +2,15 @@ # License: BSD 3 clause -import ffm +import ffm2 import numpy as np +from scipy.special import expit as sigmoid from sklearn.base import RegressorMixin +from sklearn.utils import check_random_state from .validation import check_array, check_consistent_length from .base import (FactorizationMachine, BaseFMClassifier, - _validate_class_labels) + _validate_class_labels, + _init_parameter, _settings_factory) class FMRegression(FactorizationMachine, RegressorMixin): @@ -56,7 +59,7 @@ class FMRegression(FactorizationMachine, RegressorMixin): Coefficients of second order factor matrix. """ - def __init__(self, n_iter=100, init_stdev=0.1, rank=8, random_state=123, + def __init__(self, n_iter=0, n_epoch=10, init_stdev=0.1, rank=8, random_state=123, l2_reg_w=0.1, l2_reg_V=0.1, l2_reg=0, step_size=0.1): super(FMRegression, self).\ __init__(n_iter=n_iter, init_stdev=init_stdev, rank=rank, @@ -69,7 +72,9 @@ def __init__(self, n_iter=100, init_stdev=0.1, rank=8, random_state=123, self.l2_reg_V = l2_reg_V self.l2_reg = l2_reg self.step_size = step_size - self.task = "regression" + self.loss = 'squared' + self.solver = 'sgd' + self.n_epoch = n_epoch def fit(self, X, y): """ Fit model with specified loss. @@ -85,12 +90,12 @@ def fit(self, X, y): check_consistent_length(X, y) y = check_array(y, ensure_2d=False, dtype=np.float64) - # The sgd solver expects a transposed design matrix in column major - # order (csc_matrix). 
- X = X.T # creates a copy - X = check_array(X, accept_sparse="csc", dtype=np.float64) - - self.w0_, self.w_, self.V_ = ffm.ffm_sgd_fit(self, X, y) + X = check_array(X, accept_sparse="csr", dtype=np.float64) + n_features = X.shape[1] + settings_dict = _settings_factory(self) + self.w0_, self.w_, self.V_ = _init_parameter(self, n_features) + ffm2.ffm_fit(self.w0_, self.w_, self.V_, X, y, self.rank, + settings_dict) return self @@ -140,7 +145,8 @@ class FMClassification(BaseFMClassifier): Coefficients of second order factor matrix. """ - def __init__(self, n_iter=100, init_stdev=0.1, rank=8, random_state=123, + def __init__(self, n_iter=100, n_epoch=10, init_stdev=0.1, rank=8, + random_state=123, l2_reg_w=0, l2_reg_V=0, l2_reg=None, step_size=0.1): super(FMClassification, self).\ __init__(n_iter=n_iter, init_stdev=init_stdev, rank=rank, @@ -153,7 +159,9 @@ def __init__(self, n_iter=100, init_stdev=0.1, rank=8, random_state=123, self.l2_reg_V = l2_reg_V self.l2_reg = l2_reg self.step_size = step_size - self.task = "classification" + self.loss = 'logistic' + self.solver = 'sgd' + self.n_epoch = n_epoch def fit(self, X, y): """ Fit model with specified loss. @@ -173,19 +181,13 @@ def fit(self, X, y): " but the data contains" " class: %r" % self.classes_) - # fastFM-core expects labels to be in {-1,1} - y_train = y.copy() - i_class1 = (y_train == self.classes_[0]) - y_train[i_class1] = -1 - y_train[~i_class1] = 1 - check_consistent_length(X, y) y = y.astype(np.float64) - # The sgd solver expects a transposed design matrix in column major - # order (csc_matrix). 
- X = X.T # creates a copy - X = check_array(X, accept_sparse="csc", dtype=np.float64) - - self.w0_, self.w_, self.V_ = ffm.ffm_sgd_fit(self, X, y) + X = check_array(X, accept_sparse="csr", dtype=np.float64) + n_features = X.shape[1] + settings_dict = _settings_factory(self) + self.w0_, self.w_, self.V_ = _init_parameter(self, n_features) + ffm2.ffm_fit(self.w0_, self.w_, self.V_, X, y, self.rank, + settings_dict) return self diff --git a/fastFM/tests/test_als.py b/fastFM/tests/test_als.py index 804812e..2313ce3 100644 --- a/fastFM/tests/test_als.py +++ b/fastFM/tests/test_als.py @@ -7,6 +7,7 @@ from fastFM import als from fastFM.datasets import make_user_item_regression from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split from sklearn.utils.testing import assert_almost_equal @@ -20,7 +21,7 @@ def get_test_problem(task='regression'): V = np.array([[6, 0], [5, 8]], dtype=np.float64) w = np.array([9, 2], dtype=np.float64) - w0 = 2 + w0 = np.array([2], dtype=np.float64) if task == 'classification': y_labels = np.ones_like(y) y_labels[y < np.median(y)] = -1 @@ -95,7 +96,6 @@ def test_fm_classification(): def test_als_warm_start(): X, y, coef = make_user_item_regression(label_stdev=0) - from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42) X_train = sp.csc_matrix(X_train) @@ -123,50 +123,36 @@ def test_als_warm_start(): def test_warm_start_path(): - X, y, coef = make_user_item_regression(label_stdev=.4) - from sklearn.cross_validation import train_test_split - X_train, X_test, y_train, y_test = train_test_split( - X, y, test_size=0.33, random_state=42) - X_train = sp.csc_matrix(X_train) - X_test = sp.csc_matrix(X_test) - n_iter = 10 + X = sp.csc_matrix(X) rank = 4 seed = 333 step_size = 1 + n_iter = 10 l2_reg_w = 0 l2_reg_V = 0 fm = als.FMRegression(n_iter=0, l2_reg_w=l2_reg_w, l2_reg_V=l2_reg_V, rank=rank, random_state=seed) - # 
initalize coefs - fm.fit(X_train, y_train) - rmse_train = [] - rmse_test = [] - for i in range(1, n_iter): - fm.fit(X_train, y_train, n_more_iter=step_size) - rmse_train.append(np.sqrt(mean_squared_error( - fm.predict(X_train), y_train))) - rmse_test.append(np.sqrt(mean_squared_error( - fm.predict(X_test), y_test))) + rmse = [] + for _ in range(1, n_iter): + fm.fit(X, y, n_more_iter=step_size) + rmse.append(np.sqrt(mean_squared_error( + fm.predict(X), y))) print('------- restart ----------') - values = np.arange(1, n_iter) - rmse_test_re = [] - rmse_train_re = [] - for i in values: + rmse_re = [] + for i in range(1, n_iter): fm = als.FMRegression(n_iter=i, l2_reg_w=l2_reg_w, l2_reg_V=l2_reg_V, rank=rank, random_state=seed) - fm.fit(X_train, y_train) - rmse_test_re.append(np.sqrt(mean_squared_error( - fm.predict(X_test), y_test))) - rmse_train_re.append(np.sqrt(mean_squared_error( - fm.predict(X_train), y_train))) + fm.fit(X, y) + rmse_re.append(np.sqrt(mean_squared_error( + fm.predict(X), y))) - assert_almost_equal(rmse_train, rmse_train_re) - assert_almost_equal(rmse_test, rmse_test_re) + assert len(rmse) == len(rmse_re) + assert_almost_equal(rmse, rmse_re) def test_clone(): @@ -183,4 +169,5 @@ def test_clone(): if __name__ == '__main__': # test_fm_regression_only_w0() - test_fm_linear_regression() + # test_fm_linear_regression() + test_warm_start_path() diff --git a/fastFM/tests/test_datasets.py b/fastFM/tests/test_datasets.py index 34b4dfb..e4e3a0a 100644 --- a/fastFM/tests/test_datasets.py +++ b/fastFM/tests/test_datasets.py @@ -3,13 +3,13 @@ from fastFM.datasets import make_user_item_regression from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split import scipy.sparse as sp def test_make_user_item_regression(): from fastFM.mcmc import FMRegression X, y, coef = make_user_item_regression(label_stdev=0) - from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split( X, y, 
test_size=0.33, random_state=42) @@ -19,7 +19,6 @@ def test_make_user_item_regression(): # generate data with noisy lables X, y, coef = make_user_item_regression(label_stdev=2) - from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42) diff --git a/fastFM/tests/test_ffm.py b/fastFM/tests/test_ffm.py index 3a1cde2..7ebcb5d 100644 --- a/fastFM/tests/test_ffm.py +++ b/fastFM/tests/test_ffm.py @@ -19,7 +19,7 @@ def get_test_problem(): V = np.array([[6, 0], [5, 8]], dtype=np.float64) w = np.array([9, 2], dtype=np.float64) - w0 = 2 + w0 = np.array([2], dtype=np.float64) return w0, w, V, y, X def test_ffm_predict(): @@ -37,29 +37,63 @@ def test_ffm2_predict_w0(): w[:] = 0 V[:, :] = 0 y_pred = ffm2.ffm_predict(w0, w, V, X) - assert_equal(y_pred, w0) + assert_equal(y_pred[0], w0) -def test_ffm2_fit(): +def test_ffm2_fit_als(): w0, w, V, y, X = get_test_problem() - w0 = 0 + w0[:] = 0 w[:] = 0 - V = np.random.normal(loc=0.0, scale=1.0, size=(2, 2)) + np.random.seed(123) + V = np.random.normal(loc=0.0, scale=1.0, + size=(2, 2)) + rank = 2 + + y_pred = ffm2.ffm_predict(w0, w, V, X) + msqr_before = mean_squared_error(y, y_pred) + + settings = {'solver': 'cd', + 'loss': 'squared', + 'iter': 500, + 'l2_reg_w': 0.01, + 'l2_reg_V': 0.02} + + ffm2.ffm_fit(w0, w, V, X, y, rank, settings) + + y_pred = ffm2.ffm_predict(w0, w, V, X) + msqr_after = mean_squared_error(y, y_pred) + + assert w0 != 0 + assert(msqr_before > msqr_after) + +def test_ffm2_fit_sgd(): + w0, w, V, y, X = get_test_problem() + w0[:] = 0 + w[:] = 0 + np.random.seed(123) + V = np.random.normal(loc=0.0, scale=0.01, + size=(2, 2)) - w0_init = w0 - w_init = np.copy(w) - V_init = np.copy(V) rank = 2 y_pred = ffm2.ffm_predict(w0, w, V, X) msqr_before = mean_squared_error(y, y_pred) - w0, w, V = ffm2.ffm_als_fit(w0, w, V, X, y, rank) + settings = {'solver': 'sgd', + 'loss': 'squared', + 'step_size': 0.0001, + 'n_epoch': 5, + 'l2_reg_w': 
0.01, + 'l2_reg_V': 0.02} + + w0, w, V = ffm2.ffm_fit(w0, w, V, sp.csr_matrix(X), y, rank, settings) y_pred = ffm2.ffm_predict(w0, w, V, X) msqr_after = mean_squared_error(y, y_pred) - assert(w0 != w0_init) - # FIXME: use np.all instead np.any after we can set solver params from python - assert (np.any(w != w_init)) - assert (np.any(V != V_init)) + assert w0 != 0 assert(msqr_before > msqr_after) + + +if __name__ == "__main__": + # test_ffm2_fit_sgd() + test_ffm2_fit_als() diff --git a/fastFM/tests/test_mcmc.py b/fastFM/tests/test_mcmc.py index be6e8ef..503455f 100644 --- a/fastFM/tests/test_mcmc.py +++ b/fastFM/tests/test_mcmc.py @@ -7,6 +7,7 @@ from fastFM import mcmc from fastFM.datasets import make_user_item_regression from sklearn.metrics import mean_squared_error +from sklearn.model_selection import train_test_split from sklearn.utils.testing import assert_almost_equal, assert_array_equal @@ -83,7 +84,6 @@ def test_fm_classification_proba(): def test_mcmc_warm_start(): X, y, coef = make_user_item_regression(label_stdev=0) - from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=44) X_train = sp.csc_matrix(X_train) @@ -106,7 +106,6 @@ def test_mcmc_warm_start(): def test_find_init_stdev(): X, y, coef = make_user_item_regression(label_stdev=.5) - from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=44) X_train = sp.csc_matrix(X_train) diff --git a/fastFM/tests/test_sgd.py b/fastFM/tests/test_sgd.py index 949414f..a7ab349 100644 --- a/fastFM/tests/test_sgd.py +++ b/fastFM/tests/test_sgd.py @@ -6,6 +6,7 @@ from sklearn import metrics from sklearn.datasets import make_regression from sklearn.utils.testing import assert_almost_equal +from numpy.testing import assert_equal from fastFM import sgd from fastFM import als @@ -20,7 +21,7 @@ def get_test_problem(task='regression'): V = np.array([[6, 
0], [5, 8]], dtype=np.float64) w = np.array([9, 2], dtype=np.float64) - w0 = 2 + w0 = np.array([2], dtype=np.float64) if task == 'classification': y_labels = np.ones_like(y) y_labels[y < np.median(y)] = -1 @@ -28,27 +29,44 @@ def get_test_problem(task='regression'): return w0, w, V, y, X -def test_sgd_regression_small_example(): +def test_sgd_predict(): w0, w, V, y, X = get_test_problem() - X_test = X.copy() X_train = sp.csc_matrix(X) - fm = sgd.FMRegression(n_iter=10000, - init_stdev=0.01, l2_reg_w=0.5, l2_reg_V=50.5, rank=2, - step_size=0.0001) + fm = sgd.FMRegression(rank=V.shape[0]) + + # set model parameter + fm.w0_ = w0 + fm.w_ = w + fm.V_ = V + y_pred = fm.predict(X_train) + assert_equal(y_pred, y) + + +def test_sgd_regression_small_example(): + w0, w, V, y, X = get_test_problem() + X_test = sp.csr_matrix(X) + X_train = sp.csr_matrix(X) + + fm = sgd.FMRegression(n_epoch=31, + init_stdev=0.01, l2_reg_w=0.01, + l2_reg_V=0.002, rank=2, + step_size=0.0001, random_state=123) + + assert X_train.shape[0] == y.shape[0] fm.fit(X_train, y) y_pred = fm.predict(X_test) - assert metrics.r2_score(y_pred, y) > 0.99 + assert metrics.r2_score(y_pred, y) > 0.95 def test_first_order_sgd_vs_als_regression(): X, y = make_regression(n_samples=100, n_features=50, random_state=123) X = sp.csc_matrix(X) - fm_sgd = sgd.FMRegression(n_iter=900, init_stdev=0.01, l2_reg_w=0.0, - l2_reg_V=50.5, rank=0, step_size=0.01) - fm_als = als.FMRegression(n_iter=10, l2_reg_w=0, l2_reg_V=0, rank=0) + fm_sgd = sgd.FMRegression(n_epoch=20, init_stdev=0.01, l2_reg_w=0.01, + l2_reg_V=0.02, rank=0, step_size=0.01) + fm_als = als.FMRegression(n_iter=10, l2_reg_w=0.01, l2_reg_V=0.02, rank=0) y_pred_sgd = fm_sgd.fit(X, y).predict(X) y_pred_als = fm_als.fit(X, y).predict(X) @@ -63,11 +81,12 @@ def test_second_order_sgd_vs_als_regression(): X, y = make_regression(n_samples=100, n_features=50, random_state=123) X = sp.csc_matrix(X) - fm_sgd = sgd.FMRegression(n_iter=50000, init_stdev=0.00, l2_reg_w=0.0, - 
l2_reg_V=50.5, rank=2, step_size=0.0002) - fm_als = als.FMRegression(n_iter=10, l2_reg_w=0, l2_reg_V=0, rank=2) + fm_sgd = sgd.FMRegression(n_epoch=300, init_stdev=0.1, l2_reg_w=0.01, + l2_reg_V=0.05, rank=2, step_size=0.00001) + fm_als = als.FMRegression(n_iter=10, l2_reg_w=0.01, l2_reg_V=0.05, + rank=2) - y_pred_als = fm_als.fit(X, y).predict(X) + y_pred_als = fm_als.fit(sp.csr_matrix(X), y).predict(X) y_pred_sgd = fm_sgd.fit(X, y).predict(X) score_als = metrics.r2_score(y_pred_als, y) @@ -78,15 +97,17 @@ def test_second_order_sgd_vs_als_regression(): def test_sgd_classification_small_example(): w0, w, V, y, X = get_test_problem(task='classification') - X_test = X.copy() - X_train = sp.csc_matrix(X) + X_test = sp.csr_matrix(X) + X_train = sp.csr_matrix(X) - fm = sgd.FMClassification(n_iter=1000, - init_stdev=0.1, l2_reg_w=0, l2_reg_V=0, rank=2, - step_size=0.1) + fm = sgd.FMClassification(n_epoch=100, + init_stdev=0.01, l2_reg_w=0.02, l2_reg_V=0.02, + rank=2, step_size=0.01) fm.fit(X_train, y) + y_pred = fm.predict_proba(X_test) + assert metrics.roc_auc_score(y, y_pred) > 0.95 + y_pred = fm.predict(X_test) - print(y_pred) assert metrics.accuracy_score(y, y_pred) > 0.95 @@ -103,6 +124,9 @@ def test_clone(): if __name__ == '__main__': + # test_sgd_fit_small_example() + # test_sgd_fit_small_example() test_sgd_regression_small_example() - test_first_order_sgd_vs_als_regression() - test_second_order_sgd_vs_als_regression() + + # test_first_order_sgd_vs_als_regression() + # test_second_order_sgd_vs_als_regression()