From 9e252c5d67f3f69f17ca96bd5cd645990f424afa Mon Sep 17 00:00:00 2001
From: Jianjie Liu
Date: Wed, 22 Sep 2021 16:44:08 +0000
Subject: [PATCH 01/27] Mock Movielens schema v1

---
 recommenders/datasets/mock/__init__.py       |  0
 recommenders/datasets/mock/movielens.py      | 66 +++++++++++++++++
 recommenders/datasets/movielens.py           | 52 +++++++++-----
 setup.py                                     |  7 +-
 .../recommenders/datasets/mock/__init__.py   |  0
 .../datasets/mock/test_movielens.py          | 72 +++++++++++++++++++
 .../recommenders/datasets/test_movielens.py  | 21 ++++++
 7 files changed, 200 insertions(+), 18 deletions(-)
 create mode 100644 recommenders/datasets/mock/__init__.py
 create mode 100644 recommenders/datasets/mock/movielens.py
 create mode 100644 tests/unit/recommenders/datasets/mock/__init__.py
 create mode 100644 tests/unit/recommenders/datasets/mock/test_movielens.py
 create mode 100644 tests/unit/recommenders/datasets/test_movielens.py

diff --git a/recommenders/datasets/mock/__init__.py b/recommenders/datasets/mock/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/recommenders/datasets/mock/movielens.py b/recommenders/datasets/mock/movielens.py
new file mode 100644
index 0000000000..a9e73bf031
--- /dev/null
+++ b/recommenders/datasets/mock/movielens.py
@@ -0,0 +1,66 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""
+Mock dataset schema used to generate fake data for testing. It mimics the MovieLens dataset.
+"""
+try:
+    import pandera as pa
+except ImportError as e:
+    raise ImportError("Pandera not installed. Try `pip install recommenders[dev]`") from e
+
+import random
+from typing import Optional
+
+from pandera.typing import DateTime, Series
+from pandera import Field
+from pyspark.sql import SparkSession
+from pyspark.sql.types import StructType
+
+
+class MockMovielens100kSchema(pa.SchemaModel):
+    """
+    Mock dataset schema to generate fake data for testing purposes.
+    This schema is configured to mimic the MovieLens 100k dataset
+
+    http://files.grouplens.org/datasets/movielens/ml-100k/
+    """
+    # The 100k dataset has 943 total users
+    userID: Series[int] = Field(in_range={"min_value": 1, "max_value": 943})
+    # And 1682 total items
+    itemID: Series[int] = Field(in_range={"min_value": 1, "max_value": 1682})
+    # Rating is on the scale from 1 to 5
+    rating: Series[int] = Field(in_range={"min_value": 1, "max_value": 5})
+    timestamp: Series[DateTime]
+    title: Series[str] = Field(eq="foo")
+    genres: Series[str] = Field(eq="genreA|0")
+
+    @classmethod
+    def get_df(cls, size: int = 3, seed: int = 100):
+        """Return fake movielens dataset as a Pandas Dataframe with specified rows.
+
+        Args:
+            size (int): number of rows to generate
+            seed (int, optional): seeding the pseudo-number generation. Defaults to 100.
+
+        Returns:
+            pandas.DataFrame: a mock dataset
+        """
+        random.seed(seed)
+        return cls.example(size=size)
+
+    @classmethod
+    def get_spark_df(cls, spark: SparkSession, size: int = 3, seed: int = 100, schema: Optional[StructType] = None):
+        """Return fake movielens dataset as a Spark Dataframe with specified rows
+
+        Args:
+            spark (SparkSession): spark session to load the dataframe into
+            size (int): number of rows to generate
+            seed (int, optional): seeding the pseudo-number generation. Defaults to 100.
+            schema (pyspark.sql.types.StructType, optional): dataset schema. Defaults to None.
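+
+        Example (illustrative sketch only; assumes an active SparkSession named `spark`):
+            >>> df = MockMovielens100kSchema.get_spark_df(spark, size=5)
+            >>> df.count()  # 5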
+
+        Returns:
+            pyspark.sql.DataFrame: a mock dataset
+        """
+        pandas_df = cls.get_df(size=size, seed=seed)
+        return spark.createDataFrame(pandas_df, schema=schema)
diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py
index 73d7a58f1c..463bc00853 100644
--- a/recommenders/datasets/movielens.py
+++ b/recommenders/datasets/movielens.py
@@ -7,6 +7,7 @@
 import warnings
 import pandas as pd
 from zipfile import ZipFile
+from recommenders.datasets.mock.movielens import MockMovielens100kSchema
 from recommenders.datasets.download_utils import maybe_download, download_path
 from recommenders.utils.notebook_utils import is_databricks
 from recommenders.utils.constants import (
@@ -100,6 +101,11 @@ def item_has_header(self):
     "20m": _DataFormat(",", "ml-20m/ratings.csv", True, ",", "ml-20m/movies.csv", True),
 }
 
+# Fake data for testing only
+MOCK_DATA_FORMAT = {
+    "mock100": {"size": 100, "seed": 101}
+}
+
 # 100K data genres index to string mapper. For 1m, 10m, and 20m, the genres labels are already in the dataset.
 GENRES = (
     "unknown",
@@ -136,7 +142,7 @@
     Will only use the first four column names."""
 WARNING_HAVE_SCHEMA_AND_HEADER = """Both schema and header are provided.
     The header argument will be ignored."""
-ERROR_MOVIE_LENS_SIZE = "Invalid data size. Should be one of {100k, 1m, 10m, or 20m}"
+ERROR_MOVIE_LENS_SIZE = "Invalid data size. Should be one of {100k, 1m, 10m, 20m, or mock100}"
 ERROR_HEADER = "Header error. At least user and movie column names should be provided"
 
 
@@ -154,14 +160,16 @@ def load_pandas_df(
     To load movie information only, you can use load_item_df function.
 
     Args:
-        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m").
-        header (list or tuple or None): Rating dataset header.
-        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
+        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m", "mock100").
+        header* (list or tuple or None): Rating dataset header. 'DEFAULT_HEADER' is used for all mock data sizes ("mock*").
+        local_cache_path* (str): Path (directory or a zip file) to cache the downloaded zip file.
         If None, all the intermediate files will be stored in a temporary directory and removed after use.
-        title_col (str): Movie title column name. If None, the column will not be loaded.
-        genres_col (str): Genres column name. Genres are '|' separated string.
+        title_col* (str): Movie title column name. If None, the column will not be loaded.
+        genres_col* (str): Genres column name. Genres are '|' separated string.
         If None, the column will not be loaded.
-        year_col (str): Movie release year column name. If None, the column will not be loaded.
+        year_col* (str): Movie release year column name. If None, the column will not be loaded.
+
+        Arguments marked with (*) are not applicable when a mock dataset is specified (size = "mock*").
 
     Returns:
         pandas.DataFrame: Movie rating dataset.
@@ -185,9 +193,13 @@
     )
     """
     size = size.lower()
-    if size not in DATA_FORMAT:
+    if size not in DATA_FORMAT and size not in MOCK_DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)
 
+    if size in MOCK_DATA_FORMAT:
+        # generate fake data, passing the stored size/seed options as kwargs
+        return MockMovielens100kSchema.get_df(**MOCK_DATA_FORMAT[size])
+
     if header is None:
         header = DEFAULT_HEADER
     elif len(header) < 2:
@@ -349,17 +361,19 @@ def load_spark_df(
 
     Args:
         spark (pyspark.SparkSession): Spark session.
-        size (str): Size of the data to load. 
One of ("100k", "1m", "10m", "20m").
-        header (list or tuple): Rating dataset header.
+        size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m", "mock100").
+        header* (list or tuple): Rating dataset header. 'DEFAULT_HEADER' is used for all mock data sizes ("mock*").
         If schema is provided, this argument is ignored.
-        schema (pyspark.StructType): Dataset schema.
-        local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file.
+        schema* (pyspark.StructType): Dataset schema.
+        local_cache_path* (str): Path (directory or a zip file) to cache the downloaded zip file.
         If None, all the intermediate files will be stored in a temporary directory and removed after use.
-        dbutils (Databricks.dbutils): Databricks utility object
-        title_col (str): Title column name. If None, the column will not be loaded.
-        genres_col (str): Genres column name. Genres are '|' separated string.
+        dbutils* (Databricks.dbutils): Databricks utility object
+        title_col* (str): Title column name. If None, the column will not be loaded.
+        genres_col* (str): Genres column name. Genres are '|' separated string.
         If None, the column will not be loaded.
-        year_col (str): Movie release year column name. If None, the column will not be loaded.
+        year_col* (str): Movie release year column name. If None, the column will not be loaded.
+
+        Arguments marked with (*) are not applicable when a mock dataset is specified (size = "mock*").
 
     Returns:
         pyspark.sql.DataFrame: Movie rating dataset.
@@ -394,9 +408,13 @@
         spark_df = load_spark_df(spark, dbutils=dbutils)
     """
     size = size.lower()
-    if size not in DATA_FORMAT:
+    if size not in DATA_FORMAT and size not in MOCK_DATA_FORMAT:
        raise ValueError(ERROR_MOVIE_LENS_SIZE)
 
+    if size in MOCK_DATA_FORMAT:
+        # generate fake data, passing the stored size/seed options as kwargs
+        return MockMovielens100kSchema.get_spark_df(spark, **MOCK_DATA_FORMAT[size])
+
     schema = _get_schema(header, schema)
     if len(schema) < 2:
         raise ValueError(ERROR_HEADER)
diff --git a/setup.py b/setup.py
index 0aba6982be..7aef19fe52 100644
--- a/setup.py
+++ b/setup.py
@@ -73,7 +73,12 @@
         "cmake>=3.18.4.post1",
         "xlearn==0.40a1",
     ],
-    "dev": ["black>=18.6b4,<21", "pytest>=3.6.4", "pytest-cov>=2.12.1"],
+    "dev": [
+        "black>=18.6b4,<21",
+        "pytest>=3.6.4",
+        "pytest-cov>=2.12.1",
+        "pytest-lazy-fixture>=0.6.3",  # Allow using fixtures in pytest.mark.parametrize
+    ],
 }
 # for the brave of heart
 extras_require["all"] = list(set(sum([*extras_require.values()], [])))
diff --git a/tests/unit/recommenders/datasets/mock/__init__.py b/tests/unit/recommenders/datasets/mock/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/unit/recommenders/datasets/mock/test_movielens.py b/tests/unit/recommenders/datasets/mock/test_movielens.py
new file mode 100644
index 0000000000..446f5cd75e
--- /dev/null
+++ b/tests/unit/recommenders/datasets/mock/test_movielens.py
@@ -0,0 +1,72 @@
+from recommenders.datasets.mock.movielens import MockMovielens100kSchema
+from recommenders.datasets.movielens import DEFAULT_HEADER
+from recommenders.utils.constants import (
+    DEFAULT_USER_COL,
+    DEFAULT_ITEM_COL,
+    DEFAULT_RATING_COL,
+    DEFAULT_TIMESTAMP_COL,
+)
+
+import pytest
+import pandas
+import pyspark.sql
+from pyspark.sql import SparkSession
+from pyspark.sql.types import IntegerType, FloatType, LongType, StructField, StructType
+
+
+@pytest.fixture(scope="module")
+def default_schema():
+    return StructType([
+        StructField(DEFAULT_USER_COL, IntegerType()),
+        
StructField(DEFAULT_ITEM_COL, IntegerType()),
+        StructField(DEFAULT_RATING_COL, FloatType()),
+        StructField(DEFAULT_TIMESTAMP_COL, LongType()),
+    ])
+
+
+@pytest.fixture(scope="module")
+def custom_schema():
+    return StructType([
+        StructField("userID", IntegerType()),
+        StructField("itemID", IntegerType()),
+        StructField("rating", FloatType()),
+    ])
+
+
+@pytest.mark.parametrize("size", [10, 100])
+def test_mock_movielens_schema__has_default_col_names(size):
+    df = MockMovielens100kSchema.example(size=size)
+    for col_name in DEFAULT_HEADER:
+        assert col_name in df.columns
+
+
+@pytest.mark.parametrize("seed", [-1])  # seed for pseudo-random number generation
+@pytest.mark.parametrize("size", [0, 3, 10])
+def test_mock_movielens_schema__get_df__return_success(size, seed):
+    df = MockMovielens100kSchema.get_df(size, seed=seed)
+    assert type(df) == pandas.DataFrame
+    assert len(df) == size
+
+
+@pytest.mark.parametrize("seed", [0, 101])  # seed for pseudo-random number generation
+@pytest.mark.parametrize("size", [3, 10])
+def test_mock_movielens_schema__get_spark_df__return_success(spark: SparkSession, size, seed):
+    df = MockMovielens100kSchema.get_spark_df(spark, size, seed=seed)
+    assert type(df) == pyspark.sql.DataFrame
+    assert df.count() == size
+
+
+@pytest.mark.parametrize("schema", [
+    None,
+    pytest.lazy_fixture('default_schema'),
+    pytest.lazy_fixture('custom_schema')
+])
+def test_mock_movielens_schema__get_spark_df__with_custom_schema_return_success(spark: SparkSession, schema):
+    df = MockMovielens100kSchema.get_spark_df(spark, schema=schema)
+    assert type(df) == pyspark.sql.DataFrame
+    assert df.count() >= 0
+
+
+def test_mock_movielens_schema__get_spark_df__fail_on_empty_rows(spark: SparkSession):
+    with pytest.raises(ValueError, match="can not infer schema from empty dataset.*"):
+        MockMovielens100kSchema.get_spark_df(spark, 0)
diff --git a/tests/unit/recommenders/datasets/test_movielens.py b/tests/unit/recommenders/datasets/test_movielens.py
new file mode 100644
index 0000000000..f05cc24882
--- /dev/null
+++ b/tests/unit/recommenders/datasets/test_movielens.py
@@ -0,0 +1,21 @@
+from recommenders.datasets.movielens import DATA_FORMAT, MOCK_DATA_FORMAT
+from recommenders.datasets.movielens import load_pandas_df, load_spark_df
+
+import pyspark.sql
+from pyspark.sql import SparkSession
+
+
+def test_mock_movielens_data__no_name_collision():
+    """
+    Ensure that no names are shared between the mock and real dataset sizes
+    """
+    dataset_name = set(DATA_FORMAT.keys())
+    dataset_name_mock = set(MOCK_DATA_FORMAT.keys())
+    collision = dataset_name.intersection(dataset_name_mock)
+    assert not collision
+
+
+def test_mock_movielens_data_generation_succeed(spark: SparkSession):
+    df = load_spark_df(spark, "mock100")
+    assert type(df) == pyspark.sql.DataFrame
+    assert df.count() == 100

From feef435f6e6948c019b0b4add72c965f9fce4945 Mon Sep 17 00:00:00 2001
From: Jianjie Liu
Date: Wed, 22 Sep 2021 19:58:12 +0000
Subject: [PATCH 02/27] Mock schema experiment

---
 .../als_movielens_diversity_metrics.ipynb     | 622 +++++++++---------
 recommenders/datasets/mock/movielens.py       |  47 +-
 recommenders/evaluation/spark_evaluation.py   |   3 +-
 tests/unit/examples/test_notebooks_pyspark.py |   6 +-
 4 files changed, 362 insertions(+), 316 deletions(-)

diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb
index bf684cf9d5..31e998bbb6 100644
--- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb
+++ 
b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -2,16 +2,15 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "# Apply Diversity Metrics \n", "## -- Compare ALS and Random Recommenders on MovieLens (PySpark)\n", @@ -41,11 +40,11 @@ "The comparision results show that the ALS recommender outperforms the random recommender on ranking metrics (Precision@k, Recall@k, NDCG@k, and\tMean average precision), while the random recommender outperforms ALS recommender on diversity metrics. This is because ALS is optimized for estimating the item rating as accurate as possible, therefore it performs well on accuracy metrics including rating and ranking metrics. As a side effect, the items being recommended tend to be popular items, which are the items mostly sold or viewed. It leaves the [long-tail items](https://github.com/microsoft/recommenders/blob/main/GLOSSARY.md) having less chance to get introduced to the users. This is the reason why ALS is not performing as well as a random recommender on diversity metrics. \n", "\n", "From the algorithmic point of view, items in the tail suffer from the cold-start problem, making them hard for recommendation systems to use. However, from the business point of view, oftentimes the items in the tail can be highly profitable, since, depending on supply, business can apply a higher margin to them. Recommendation systems that optimize metrics like novelty and diversity, can help to find users willing to get these long tail items. Usually there is a trade-off between one type of metric vs. another. One should decide which set of metrics to optimize based on business scenarios." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "**Coverage**\n", "\n", @@ -65,11 +64,11 @@ "p(i|R) = \\frac{|M_r (i)|}{|\\textrm{reco_df}|}\n", "$$\n", "and $M_r (i)$ denotes the users who are recommended item $i$.\n" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "\n", "**Diversity**\n", @@ -89,11 +88,11 @@ "$$\n", "\\textrm{diversity} = 1 - \\textrm{IL}\n", "$$\n" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "\n", "**Novelty**\n", @@ -112,11 +111,11 @@ "$$\n", "\\textrm{novelty} = \\sum_{i \\in N_r} \\frac{|M_r (i)|}{|\\textrm{reco_df}|} \\textrm{novelty}(i)\n", "$$\n" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "**Serendipity**\n", "\n", @@ -131,30 +130,19 @@ "\\textrm{serendipity} = \\frac{1}{|M|} \\sum_{u \\in M_r}\n", "\\frac{1}{|N_r (u)|} \\sum_{i \\in N_r (u)} \\big(1 - \\textrm{expectedness}(i|u) \\big) \\, \\textrm{relevance}(i)\n", "$$\n" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "**Note**: This notebook requires a PySpark environment to run properly. Please follow the steps in [SETUP.md](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md#dependencies-setup) to install the PySpark environment." 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.6.13 |Anaconda, Inc.| (default, Jun 4 2021, 14:25:59) \n", - "[GCC 7.5.0]\n", - "Spark version: 2.4.8\n" - ] - } - ], "source": [ "# set the environment path to find Recommenders\n", "%load_ext autoreload\n", @@ -185,52 +173,62 @@ "\n", "print(\"System version: {}\".format(sys.version))\n", "print(\"Spark version: {}\".format(pyspark.__version__))\n" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "System version: 3.6.9 (default, Jan 26 2021, 15:33:00) \n", + "[GCC 8.4.0]\n", + "Spark version: 2.4.8\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "\n", "Set the default parameters." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], "source": [ "# top k items to recommend\n", - "TOP_K = 10\n", + "TOP_K = 1\n", "\n", "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = '100k'\n", + "MOVIELENS_DATA_SIZE = 'mock100'\n", "\n", "# user, item column names\n", - "COL_USER=\"UserId\"\n", - "COL_ITEM=\"MovieId\"\n", - "COL_RATING=\"Rating\"" - ] + "COL_USER=\"userId\"\n", + "COL_ITEM=\"itemID\"\n", + "COL_RATING=\"rating\"" + ], + "outputs": [], + "metadata": { + "tags": [ + "parameters" + ] + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 1. Set up Spark context\n", "\n", "The following settings work well for debugging locally on VM - change when running on a cluster. We set up a giant single executor with many threads and specify memory cap. " - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 3, - "metadata": {}, - "outputs": [], "source": [ "# the following settings work well for debugging locally on VM - change when running on a cluster\n", "# set up a giant single executor with many threads and specify memory cap\n", @@ -238,66 +236,26 @@ "spark = start_or_get_spark(\"ALS PySpark\", memory=\"16g\")\n", "\n", "spark.conf.set(\"spark.sql.crossJoin.enabled\", \"true\")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 2. 
Download the MovieLens dataset" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 4.81k/4.81k [00:00<00:00, 17.1kKB/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------+------+------+---------+--------------------+------+\n", - "|MovieId|UserId|Rating|Timestamp| title|genres|\n", - "+-------+------+------+---------+--------------------+------+\n", - "| 26| 138| 5.0|879024232|Brothers McMullen...|Comedy|\n", - "| 26| 224| 3.0|888104153|Brothers McMullen...|Comedy|\n", - "| 26| 18| 4.0|880129731|Brothers McMullen...|Comedy|\n", - "| 26| 222| 3.0|878183043|Brothers McMullen...|Comedy|\n", - "| 26| 43| 5.0|883954901|Brothers McMullen...|Comedy|\n", - "| 26| 201| 4.0|884111927|Brothers McMullen...|Comedy|\n", - "| 26| 299| 4.0|878192601|Brothers McMullen...|Comedy|\n", - "| 26| 95| 3.0|880571951|Brothers McMullen...|Comedy|\n", - "| 26| 89| 3.0|879459909|Brothers McMullen...|Comedy|\n", - "| 26| 361| 3.0|879440941|Brothers McMullen...|Comedy|\n", - "| 26| 194| 3.0|879522240|Brothers McMullen...|Comedy|\n", - "| 26| 391| 5.0|877399745|Brothers McMullen...|Comedy|\n", - "| 26| 345| 3.0|884993555|Brothers McMullen...|Comedy|\n", - "| 26| 303| 4.0|879468307|Brothers McMullen...|Comedy|\n", - "| 26| 401| 3.0|891033395|Brothers McMullen...|Comedy|\n", - "| 26| 429| 3.0|882386333|Brothers McMullen...|Comedy|\n", - "| 26| 293| 3.0|888907015|Brothers McMullen...|Comedy|\n", - "| 26| 270| 5.0|876954995|Brothers McMullen...|Comedy|\n", - "| 26| 442| 3.0|883388576|Brothers McMullen...|Comedy|\n", - "| 26| 342| 2.0|875320037|Brothers McMullen...|Comedy|\n", - "+-------+------+------+---------+--------------------+------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], "source": [ "# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n", "schema = StructType(\n", " (\n", " StructField(COL_USER, IntegerType()),\n", - " StructField(COL_ITEM, IntegerType()),\n", + " StructField(COL_ITEM, LongType()),\n", " StructField(COL_RATING, FloatType()),\n", " StructField(\"Timestamp\", LongType()),\n", " )\n", @@ -305,75 +263,108 @@ "\n", "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=\"title\", genres_col=\"genres\")\n", "data.show()" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+------+------+------+--------------------+-----+--------+\n", + "|userID|itemID|rating| timestamp|title| genres|\n", + "+------+------+------+--------------------+-----+--------+\n", + "| 6| 4| 4|2200-06-19 12:21:...| foo|genreA|0|\n", + "| 8| 4| 1|1970-01-01 00:00:...| foo|genreA|0|\n", + "| 8| 4| 4|2109-02-14 15:31:...| foo|genreA|0|\n", + "| 9| 2| 2|1969-12-31 23:59:...| foo|genreA|0|\n", + "| 9| 4| 3|2210-04-25 01:58:...| foo|genreA|0|\n", + "| 3| 5| 3| 1970-01-01 00:00:00| foo|genreA|0|\n", + "| 1| 2| 1|1970-01-01 00:00:...| foo|genreA|0|\n", + "| 8| 3| 2| 1970-01-01 00:00:00| foo|genreA|0|\n", + "| 3| 10| 4|1970-01-01 00:00:...| foo|genreA|0|\n", + "| 7| 10| 2|1969-12-31 23:59:...| foo|genreA|0|\n", + "| 8| 9| 5|1970-01-01 00:00:...| foo|genreA|0|\n", + "| 4| 2| 3|1969-12-31 23:59:...| foo|genreA|0|\n", + "| 5| 8| 5|1970-01-01 00:00:...| foo|genreA|0|\n", + "| 2| 7| 1|1969-12-31 23:59:...| foo|genreA|0|\n", + "| 4| 6| 2| 1970-01-01 00:00:00| foo|genreA|0|\n", + "| 2| 5| 3|1969-12-31 23:59:...| foo|genreA|0|\n", + "| 7| 2| 
1|1970-01-01 00:00:...| foo|genreA|0|\n", + "| 8| 4| 5|1969-12-31 23:59:...| foo|genreA|0|\n", + "| 7| 8| 1|1969-12-31 23:59:...| foo|genreA|0|\n", + "| 9| 4| 1|1970-01-01 00:00:...| foo|genreA|0|\n", + "+------+------+------+--------------------+-----+--------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Split the data using the Spark random splitter provided in utilities" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "source": [ + "train_df, test_df = spark_random_split(data.select(COL_USER, COL_ITEM, COL_RATING), ratio=0.75, seed=123)\n", + "print (\"N train_df\", train_df.cache().count())\n", + "print (\"N test_df\", test_df.cache().count())" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "N train_df 75066\n", - "N test_df 24934\n" + "N train_df 73\n", + "N test_df 27\n" ] } ], - "source": [ - "train_df, test_df = spark_random_split(data.select(COL_USER, COL_ITEM, COL_RATING), ratio=0.75, seed=123)\n", - "print (\"N train_df\", train_df.cache().count())\n", - "print (\"N test_df\", test_df.cache().count())" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Get all possible user-item pairs" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Note: We assume that training data contains all users and all catalog items. " - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 6, - "metadata": {}, - "outputs": [], "source": [ "users = train_df.select(COL_USER).distinct()\n", "items = train_df.select(COL_ITEM).distinct()\n", "user_item = users.crossJoin(items)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3. Train the ALS model on the training data, and get the top-k recommendations for our testing data\n", "\n", "To predict movie ratings, we use the rating data in the training set as users' explicit feedback. The hyperparameters used in building the model are referenced from [here](http://mymedialite.net/examples/datasets.html). We do not constrain the latent factors (`nonnegative = False`) in order to allow for both positive and negative preferences towards movies.\n", "Timing will vary depending on the machine being used to train." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 7, - "metadata": {}, - "outputs": [], "source": [ "header = {\n", " \"userCol\": COL_USER,\n", @@ -392,51 +383,42 @@ " seed=42,\n", " **header\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "source": [ + "with Timer() as train_time:\n", + " model = als.fit(train_df)\n", + "\n", + "print(\"Took {} seconds for training.\".format(train_time.interval))" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ - "Took 4.012367556002573 seconds for training.\n" + "Took 2.5952707109972835 seconds for training.\n" ] } ], - "source": [ - "with Timer() as train_time:\n", - " model = als.fit(train_df)\n", - "\n", - "print(\"Took {} seconds for training.\".format(train_time.interval))" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "In the movie recommendation use case, recommending movies that have been rated by the users does not make sense. 
Therefore, the rated movies are removed from the recommended items.\n", "\n", "In order to achieve this, we recommend all movies to all users, and then remove the user-movie pairs that exist in the training dataset." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1464853\n", - "9430\n" - ] - } - ], "source": [ "# Score all user-item pairs\n", "dfs_pred = model.transform(user_item)\n", @@ -457,22 +439,31 @@ "top_k_reco = top_all.select(\"*\", F.row_number().over(window).alias(\"rank\")).filter(F.col(\"rank\") <= TOP_K).drop(\"rank\")\n", " \n", "print(top_k_reco.count())" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "48\n", + "10\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 4. Random Recommender\n", "\n", "We define a recommender which randomly recommends unseen items to each user. " - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 10, - "metadata": {}, - "outputs": [], "source": [ "# random recommender\n", "window = Window.partitionBy(COL_USER).orderBy(F.rand())\n", @@ -493,20 +484,20 @@ " .filter(F.col(\"score\") <= TOP_K)\n", " .drop(COL_RATING)\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 5. ALS vs Random Recommenders Performance Comparison" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 11, - "metadata": {}, - "outputs": [], "source": [ "def get_ranking_results(ranking_eval):\n", " metrics = {\n", @@ -527,13 +518,13 @@ " \"serendipity\": diversity_eval.serendipity()\n", " }\n", " return metrics " - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 12, - "metadata": {}, - "outputs": [], "source": [ "def generate_summary(data, algo, k, ranking_metrics, diversity_metrics):\n", " summary = {\"Data\": data, \"Algo\": algo, \"K\": k}\n", @@ -548,40 +539,40 @@ " summary.update(ranking_metrics)\n", " summary.update(diversity_metrics)\n", " return summary" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### ALS Recommender Performance Results" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 13, - "metadata": {}, - "outputs": [], "source": [ "als_ranking_eval = SparkRankingEvaluation(\n", " test_df, \n", " top_all, \n", " k = TOP_K, \n", - " col_user=\"UserId\", \n", - " col_item=\"MovieId\",\n", - " col_rating=\"Rating\", \n", + " col_user=COL_USER, \n", + " col_item=COL_ITEM,\n", + " col_rating=COL_RATING, \n", " col_prediction=\"prediction\",\n", " relevancy_method=\"top_k\"\n", ")\n", "\n", "als_ranking_metrics = get_ranking_results(als_ranking_eval)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], + "execution_count": 14, "source": [ "als_diversity_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -591,29 +582,29 @@ ")\n", "\n", "als_diversity_metrics = get_diversity_results(als_diversity_eval)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], + "execution_count": 15, "source": [ "als_results = generate_summary(MOVIELENS_DATA_SIZE, \"als\", TOP_K, als_ranking_metrics, als_diversity_metrics)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - 
"metadata": {}, "source": [ "#### Random Recommender Performance Results" - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], + "execution_count": 16, "source": [ "random_ranking_eval = SparkRankingEvaluation(\n", " test_df,\n", @@ -626,13 +617,13 @@ ")\n", "\n", "random_ranking_metrics = get_ranking_results(random_ranking_eval)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], + "execution_count": 17, "source": [ "random_diversity_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -642,43 +633,48 @@ ")\n", " \n", "random_diversity_metrics = get_diversity_results(random_diversity_eval)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], + "execution_count": 18, "source": [ "random_results = generate_summary(MOVIELENS_DATA_SIZE, \"random\", TOP_K, random_ranking_metrics, random_diversity_metrics)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Result Comparison" - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], + "execution_count": 19, "source": [ "cols = [\"Data\", \"Algo\", \"K\", \"Precision@k\", \"Recall@k\", \"NDCG@k\", \"Mean average precision\",\"catalog_coverage\", \"distributional_coverage\",\"novelty\", \"diversity\", \"serendipity\" ]\n", "df_results = pd.DataFrame(columns=cols)\n", "\n", "df_results.loc[1] = als_results \n", "df_results.loc[2] = random_results " - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 23, - "metadata": {}, + "execution_count": 20, + "source": [ + "df_results" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/html": [ "
\n", @@ -716,98 +712,89 @@ " \n", " \n", " 1\n", - " 100k\n", + " mock100\n", " als\n", - " 10\n", - " 0.047296\n", - " 0.016015\n", - " 0.043097\n", - " 0.004579\n", - " 0.385793\n", - " 7.967257\n", - " 11.659776\n", - " 0.892277\n", - " 0.878733\n", + " 1\n", + " 0.4\n", + " 0.150000\n", + " 0.4\n", + " 0.17\n", + " 0.4\n", + " 1.685475\n", + " 3.624421\n", + " None\n", + " 0.405009\n", " \n", " \n", " 2\n", - " 100k\n", + " mock100\n", " random\n", - " 10\n", - " 0.016755\n", - " 0.005883\n", - " 0.017849\n", - " 0.001890\n", - " 0.996326\n", - " 10.540834\n", - " 12.133664\n", - " 0.922288\n", - " 0.893001\n", + " 1\n", + " 0.3\n", + " 0.116667\n", + " 0.3\n", + " 0.12\n", + " 0.6\n", + " 2.446439\n", + " 3.644061\n", + " None\n", + " 0.396229\n", " \n", " \n", "\n", "
" ], "text/plain": [ - " Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n", - "1 100k als 10 0.047296 0.016015 0.043097 0.004579 \n", - "2 100k random 10 0.016755 0.005883 0.017849 0.001890 \n", - "\n", - " catalog_coverage distributional_coverage novelty diversity \\\n", - "1 0.385793 7.967257 11.659776 0.892277 \n", - "2 0.996326 10.540834 12.133664 0.922288 \n", + " Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n", + "1 mock100 als 1 0.4 0.150000 0.4 0.17 \n", + "2 mock100 random 1 0.3 0.116667 0.3 0.12 \n", "\n", - " serendipity \n", - "1 0.878733 \n", - "2 0.893001 " + " catalog_coverage distributional_coverage novelty diversity serendipity \n", + "1 0.4 1.685475 3.624421 None 0.405009 \n", + "2 0.6 2.446439 3.644061 None 0.396229 " ] }, - "execution_count": 23, "metadata": {}, - "output_type": "execute_result" + "execution_count": 20 } ], - "source": [ - "df_results" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Conclusion\n", "The comparision results show that the ALS recommender outperforms the random recommender on ranking metrics (Precision@k, Recall@k, NDCG@k, and\tMean average precision), while the random recommender outperforms ALS recommender on diversity metrics. This is because ALS is optimized for estimating the item rating as accurate as possible, therefore it performs well on accuracy metrics including rating and ranking metrics. As a side effect, the items being recommended tend to be popular items, which are the items mostly sold or viewed. It leaves the long-tail less popular items having less chance to get introduced to the users. This is the reason why ALS is not performing as well as a random recommender on diversity metrics. " - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 6. Calculate diversity metrics using item feature vector based item-item similarity\n", "In the above section we calculate diversity metrics using item co-occurrence count based item-item similarity. In the scenarios when item features are available, we may want to calculate item-item similarity based on item feature vectors. In this section, we show how to calculate diversity metrics using item feature vector based item-item similarity." 
- ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], + "execution_count": 21, "source": [ "# Get movie features \"title\" and \"genres\"\n", "movies = (\n", - " data.groupBy(\"MovieId\", \"title\", \"genres\").count()\n", + " data.groupBy(COL_ITEM, \"title\", \"genres\").count()\n", " .na.drop() # remove rows with null values\n", " .withColumn(\"genres\", F.split(F.col(\"genres\"), \"\\|\")) # convert to array of genres\n", " .withColumn(\"title\", F.regexp_replace(F.col(\"title\"), \"[\\(),:^0-9]\", \"\")) # remove year from title\n", " .drop(\"count\") # remove unused columns\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], + "execution_count": 22, "source": [ "# tokenize \"title\" column\n", "title_tokenizer = Tokenizer(inputCol=\"title\", outputCol=\"title_words\")\n", @@ -816,41 +803,18 @@ "# remove stop words\n", "remover = StopWordsRemover(inputCol=\"title_words\", outputCol=\"text\")\n", "clean_data = remover.transform(tokenized_data).drop(\"title\", \"title_words\")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "+-------+---------------------------------------------+\n", - "|MovieId|features |\n", - "+-------+---------------------------------------------+\n", - "|167 |(1043,[128,544,1025],[1.0,1.0,1.0]) |\n", - "|1343 |(1043,[38,300,1024],[1.0,1.0,1.0]) |\n", - "|1607 |(1043,[592,821,1024],[1.0,1.0,1.0]) |\n", - "|966 |(1043,[389,502,1028],[1.0,1.0,1.0]) |\n", - "|9 |(1043,[11,342,1014,1024],[1.0,1.0,1.0,1.0]) |\n", - "|1230 |(1043,[597,740,902,1025],[1.0,1.0,1.0,1.0]) |\n", - "|1118 |(1043,[702,1025],[1.0,1.0]) |\n", - "|673 |(1043,[169,690,1027,1040],[1.0,1.0,1.0,1.0]) |\n", - "|879 |(1043,[909,1026,1027,1034],[1.0,1.0,1.0,1.0])|\n", - "|66 |(1043,[256,1025,1028],[1.0,1.0,1.0]) |\n", - "+-------+---------------------------------------------+\n", - "only showing top 10 rows\n", - "\n" - ] - } - ], + "execution_count": 23, "source": [ "# convert text input into feature vectors\n", "\n", "# step 1: perform HashingTF on column \"text\"\n", - "text_hasher = HashingTF(inputCol=\"text\", outputCol=\"text_features\", numFeatures=1024)\n", + "text_hasher = HashingTF(inputCol=\"text\", outputCol=\"text_features\", numFeatures=3)\n", "hashed_data = text_hasher.transform(clean_data)\n", "\n", "# step 2: fit a CountVectorizerModel from column \"genres\".\n", @@ -863,32 +827,54 @@ " inputCols=[\"text_features\", \"genres_features\"],\n", " outputCol=\"features\",\n", ")\n", - "feature_data = assembler.transform(vectorized_data).select(\"MovieId\", \"features\")\n", + "feature_data = assembler.transform(vectorized_data).select(COL_ITEM, \"features\")\n", "\n", "feature_data.show(10, False)" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+------+---------------------+\n", + "|itemID|features |\n", + "+------+---------------------+\n", + "|6 |[0.0,1.0,0.0,1.0,1.0]|\n", + "|2 |[0.0,1.0,0.0,1.0,1.0]|\n", + "|5 |[0.0,1.0,0.0,1.0,1.0]|\n", + "|7 |[0.0,1.0,0.0,1.0,1.0]|\n", + "|1 |[0.0,1.0,0.0,1.0,1.0]|\n", + "|4 |[0.0,1.0,0.0,1.0,1.0]|\n", + "|3 |[0.0,1.0,0.0,1.0,1.0]|\n", + "|10 |[0.0,1.0,0.0,1.0,1.0]|\n", + "|8 |[0.0,1.0,0.0,1.0,1.0]|\n", + "|9 |[0.0,1.0,0.0,1.0,1.0]|\n", + "+------+---------------------+\n", + "\n" + ] + } + ], + "metadata": {} }, { "cell_type": 
"markdown", - "metadata": {}, "source": [ "The *features* column is represented with a SparseVector object. For example, in the feature vector (1043,[128,544,1025],[1.0,1.0,1.0]), 1043 is the vector length, indicating the vector consisting of 1043 item features. The values at index positions 128,544,1025 are 1.0, and the values at other positions are all 0. " - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.8738984131037538\n", - "0.8873467159479473\n" - ] - } + "execution_count": null, + "source": [ + "feature_data.count()" ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": null, "source": [ "als_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -903,22 +889,13 @@ "als_serendipity=als_eval.serendipity()\n", "print(als_diversity)\n", "print(als_serendipity)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0.8978120851519519\n", - "0.8937850286817351\n" - ] - } - ], + "execution_count": null, "source": [ "random_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -933,18 +910,48 @@ "random_serendipity=random_eval.serendipity()\n", "print(random_diversity)\n", "print(random_serendipity)" - ] + ], + "outputs": [], + "metadata": {} + }, + { + "cell_type": "code", + "execution_count": 27, + "source": [ + "import cProfile, pstats, io\n", + "\n", + "pr = cProfile.Profile()\n", + "pr.enable()\n", + "# ... do something ...\n", + "als_eval = SparkDiversityEvaluation(\n", + " train_df = train_df, \n", + " reco_df = top_k_reco,\n", + " item_feature_df = feature_data, \n", + " item_sim_measure=\"item_feature_vector\",\n", + " col_user = COL_USER, \n", + " col_item = COL_ITEM\n", + ")\n", + "als_diversity=als_eval.diversity()\n", + "als_serendipity=als_eval.serendipity()\n", + "\n", + "pr.disable()\n", + "s = io.StringIO()\n", + "ps = pstats.Stats(pr, stream=s).sort_stats(\"cumulative\")\n", + "ps.print_stats()\n", + "print(s.getvalue())" + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "It's interesting that the value of diversity and serendipity changes when using different item-item similarity calculation approach, for both ALS algorithm and random recommender. The diversity and serendipity of random recommender are still higher than ALS algorithm. " - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### References\n", "The metric definitions / formulations are based on the following references:\n", @@ -952,24 +959,24 @@ "- G. Shani and A. Gunawardana, Evaluating recommendation systems, Recommender Systems Handbook pp. 257-297, 2010.\n", "- E. Yan, Serendipity: Accuracy’s unpopular best friend in recommender Systems, eugeneyan.com, April 2020\n", "- Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. 
Jambor, Auralist: introducing serendipity into music recommendation, WSDM 2012\n" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "# cleanup spark instance\n", "spark.stop()" - ] + ], + "outputs": [], + "metadata": {} } ], "metadata": { "kernelspec": { - "display_name": "Python (reco_pyspark)", - "language": "python", - "name": "reco_pyspark" + "name": "python3", + "display_name": "Python 3.6.9 64-bit ('.env': venv)" }, "language_info": { "codemirror_mode": { @@ -981,7 +988,10 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.13" + "version": "3.6.9" + }, + "interpreter": { + "hash": "7ec2189bea0434770dca7423a25e631e1cca9c4e2b4ff137a82f4dff32ac9607" } }, "nbformat": 4, diff --git a/recommenders/datasets/mock/movielens.py b/recommenders/datasets/mock/movielens.py index a9e73bf031..1c72e2a97f 100644 --- a/recommenders/datasets/mock/movielens.py +++ b/recommenders/datasets/mock/movielens.py @@ -9,11 +9,19 @@ except ImportError as e: raise ImportError("Pandera not installed. Try `pip install recommender['dev']`") from e +from recommenders.utils.constants import ( + DEFAULT_USER_COL, + DEFAULT_ITEM_COL, + DEFAULT_RATING_COL, + DEFAULT_TIMESTAMP_COL, +) + import random from typing import Optional from pandera.typing import DateTime, Series -from pandera import Field +from pandera import Field, Check +from pandera.schemas import DataFrameSchema from pyspark.sql import SparkSession from pyspark.sql.types import StructType @@ -26,9 +34,9 @@ class MockMovielens100kSchema(pa.SchemaModel): http://files.grouplens.org/datasets/movielens/ml-100k/ """ # The 100k dataset has 943 total users - userID: Series[int] = Field(in_range={"min_value": 1, "max_value": 943}) + userID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) # And 1682 total items - itemID: Series[int] = Field(in_range={"min_value": 1, "max_value": 1682}) + itemID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) # Rating is on the scale from 1 to 5 rating: Series[int] = Field(in_range={"min_value": 1, "max_value": 5}) timestamp: Series[DateTime] @@ -36,12 +44,18 @@ class MockMovielens100kSchema(pa.SchemaModel): genres: Series[str] = Field(eq="genreA|0") @classmethod - def get_df(cls, size: int = 3, seed: int = 100): + def get_df( + cls, + size: int = 3, seed: int = 100, + # title_col: Optional[str] = None, genres_col: Optional[str] = None + ): """Return fake movielens dataset as a Pandas Dataframe with specified rows. Args: size (int): number of rows to generate seed (int, optional): seeding the pseudo-number generation. Defaults to 100. + title_col (str, optional): if not None, append a title column. Defaults to None. + genres_col (str, optional): if not None, append a genre column. Defaults to None. 
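+
+        Example (illustrative sketch only; generated values depend on the seed):
+            >>> df = MockMovielens100kSchema.get_df(size=2)
+            >>> len(df)  # 2; columns include userID, itemID, rating, timestamp, title, genres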
Returns: pandas.DataFrame: a mock dataset @@ -50,17 +64,36 @@ def get_df(cls, size: int = 3, seed: int = 100): return cls.example(size=size) @classmethod - def get_spark_df(cls, spark: SparkSession, size: int = 3, seed: int = 100, schema: Optional[StructType] = None): + def get_spark_df( + cls, + spark: SparkSession, + size: int = 3, seed: int = 100, + # title_col: Optional[str] = None, genres_col: Optional[str] = None, + # schema: Optional[StructType] = None + ): """Return fake movielens dataset as a Spark Dataframe with specified rows Args: spark (SparkSession): spark session to load the dataframe into size (int): number of rows to generate seed (int, optional): seeding the pseudo-number generation. Defaults to 100. - schema (pyspark.sql.types.StructType optional): [description]. Defaults to None. + title_col (str, optional): if not None, append a title column. Defaults to None. + genres_col (str, optional): if not None, append a genre column. Defaults to None. + schema (pyspark.sql.types.StructType, optional): dataset schema. Defaults to None. Returns: pyspark.sql.DataFrame: a mock dataset """ pandas_df = cls.get_df(size=size, seed=seed) - return spark.createDataFrame(pandas_df, schema=schema) + return spark.createDataFrame(pandas_df) + + # @classmethod + # def _get_item_df(cls, size, title_col: Optional[str] = None, genres_col: Optional[str] = None): + # schema = DataFrameSchema() # create an empty schema + # if title_col is not None: + # # adds a title column with random alphabets + # schema = schema.add_columns({title_col: pa.Column(str, Check.str_matches(r'^[a-z]+$'))}) + # if genres_col is not None: + # # adds a genre column with '|' separated string + # schema = schema.add_columns({genres_col: pa.Column(str, Check.str_matches(r'^[a-z]+\|[0-9]$'))}) + # schema.example() \ No newline at end of file diff --git a/recommenders/evaluation/spark_evaluation.py b/recommenders/evaluation/spark_evaluation.py index 37a73778ea..875e404519 100644 --- a/recommenders/evaluation/spark_evaluation.py +++ b/recommenders/evaluation/spark_evaluation.py @@ -3,6 +3,7 @@ import numpy as np +from pyspark.sql.types import LongType try: from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics @@ -574,7 +575,7 @@ def __init__( self.col_item_features = DEFAULT_ITEM_FEATURES_COL required_schema = StructType( ( - StructField(self.col_item, IntegerType()), + StructField(self.col_item, LongType()), StructField(self.col_item_features, VectorUDT()), ) ) diff --git a/tests/unit/examples/test_notebooks_pyspark.py b/tests/unit/examples/test_notebooks_pyspark.py index e4ae1d9464..c8a916c45c 100644 --- a/tests/unit/examples/test_notebooks_pyspark.py +++ b/tests/unit/examples/test_notebooks_pyspark.py @@ -48,9 +48,11 @@ def test_evaluation_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks @pytest.mark.spark -def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name): +@pytest.mark.parametrize("data_size", ["100k", "mock100"]) +def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["evaluation_diversity"] - pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name) + pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, + parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=data_size)) @pytest.mark.notebooks From e4f41e7b943cbab0fa7aa0e0e10148d14709c556 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Wed, 22 Sep 2021 23:37:50 +0000 Subject: [PATCH 03/27] use csv and change 
datetime to int --- recommenders/datasets/mock/movielens.py | 15 ++++++++++++--- recommenders/evaluation/spark_evaluation.py | 4 ++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/recommenders/datasets/mock/movielens.py b/recommenders/datasets/mock/movielens.py index 1c72e2a97f..41d2eaec6f 100644 --- a/recommenders/datasets/mock/movielens.py +++ b/recommenders/datasets/mock/movielens.py @@ -23,7 +23,7 @@ from pandera import Field, Check from pandera.schemas import DataFrameSchema from pyspark.sql import SparkSession -from pyspark.sql.types import StructType +from pyspark.sql.types import StructField, StructType, LongType, IntegerType, StringType, FloatType class MockMovielens100kSchema(pa.SchemaModel): @@ -39,7 +39,7 @@ class MockMovielens100kSchema(pa.SchemaModel): itemID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) # Rating is on the scale from 1 to 5 rating: Series[int] = Field(in_range={"min_value": 1, "max_value": 5}) - timestamp: Series[DateTime] + timestamp: Series[int] title: Series[str] = Field(eq="foo") genres: Series[str] = Field(eq="genreA|0") @@ -85,7 +85,16 @@ def get_spark_df( pyspark.sql.DataFrame: a mock dataset """ pandas_df = cls.get_df(size=size, seed=seed) - return spark.createDataFrame(pandas_df) + pandas_df.to_csv('test.csv', header=False, index=False) + default_schema = StructType([ + StructField(DEFAULT_USER_COL, IntegerType()), + StructField(DEFAULT_ITEM_COL, IntegerType()), + StructField(DEFAULT_RATING_COL, FloatType()), + StructField(DEFAULT_TIMESTAMP_COL, LongType()), + StructField("title", StringType()), + StructField("genres", StringType()), + ]) + return spark.read.csv('test.csv', schema=default_schema) # @classmethod # def _get_item_df(cls, size, title_col: Optional[str] = None, genres_col: Optional[str] = None): diff --git a/recommenders/evaluation/spark_evaluation.py b/recommenders/evaluation/spark_evaluation.py index 875e404519..5110d72e82 100644 --- a/recommenders/evaluation/spark_evaluation.py +++ b/recommenders/evaluation/spark_evaluation.py @@ -575,7 +575,7 @@ def __init__( self.col_item_features = DEFAULT_ITEM_FEATURES_COL required_schema = StructType( ( - StructField(self.col_item, LongType()), + StructField(self.col_item, IntegerType()), StructField(self.col_item_features, VectorUDT()), ) ) @@ -618,7 +618,7 @@ def _get_pairwise_items(self, df): .select(self.col_user, "i1", "i2") ) - def _get_cosine_similarity(self, n_partitions=200): + def _get_cosine_similarity(self, n_partitions=10): if self.item_sim_measure == "item_cooccurrence_count": # calculate item-item similarity based on item co-occurrence count From 772bbc6a3b96add2c6a335249213bd61957ea45a Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Thu, 23 Sep 2021 15:29:29 +0000 Subject: [PATCH 04/27] Try more experiment with 10 rows and another NB --- recommenders/datasets/mock/movielens.py | 4 ++-- recommenders/datasets/movielens.py | 3 ++- tests/unit/examples/test_notebooks_pyspark.py | 8 +++++--- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/recommenders/datasets/mock/movielens.py b/recommenders/datasets/mock/movielens.py index 41d2eaec6f..4344de7e42 100644 --- a/recommenders/datasets/mock/movielens.py +++ b/recommenders/datasets/mock/movielens.py @@ -39,7 +39,7 @@ class MockMovielens100kSchema(pa.SchemaModel): itemID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) # Rating is on the scale from 1 to 5 rating: Series[int] = Field(in_range={"min_value": 1, "max_value": 5}) - timestamp: Series[int] + timestamp: Series[str] = 
Field(eq="2022-2-22") title: Series[str] = Field(eq="foo") genres: Series[str] = Field(eq="genreA|0") @@ -90,7 +90,7 @@ def get_spark_df( StructField(DEFAULT_USER_COL, IntegerType()), StructField(DEFAULT_ITEM_COL, IntegerType()), StructField(DEFAULT_RATING_COL, FloatType()), - StructField(DEFAULT_TIMESTAMP_COL, LongType()), + StructField(DEFAULT_TIMESTAMP_COL, StringType()), StructField("title", StringType()), StructField("genres", StringType()), ]) diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py index 463bc00853..863578902b 100644 --- a/recommenders/datasets/movielens.py +++ b/recommenders/datasets/movielens.py @@ -103,7 +103,8 @@ def item_has_header(self): # Fake data for testing only MOCK_DATA_FORMAT = { - "mock100": {"size": 100, "seed": 101} + "mock100": {"size": 100, "seed": 0}, + "mock10": {"size": 10, "seed": 6} } # 100K data genres index to string mapper. For 1m, 10m, and 20m, the genres labels are already in the dataset. diff --git a/tests/unit/examples/test_notebooks_pyspark.py b/tests/unit/examples/test_notebooks_pyspark.py index c8a916c45c..d96e5c2ca9 100644 --- a/tests/unit/examples/test_notebooks_pyspark.py +++ b/tests/unit/examples/test_notebooks_pyspark.py @@ -31,9 +31,11 @@ def test_data_split_runs(notebooks, output_notebook, kernel_name): @pytest.mark.skipif( sys.platform == "win32", reason="Takes 2764.50s in Windows, while in Linux 124.35s" ) -def test_als_deep_dive_runs(notebooks, output_notebook, kernel_name): +@pytest.mark.parametrize("data_size", ["100k", "mock100", "mock10"]) +def test_als_deep_dive_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["als_deep_dive"] - pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name) + pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, + parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=data_size)) @pytest.mark.notebooks @@ -48,7 +50,7 @@ def test_evaluation_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks @pytest.mark.spark -@pytest.mark.parametrize("data_size", ["100k", "mock100"]) +@pytest.mark.parametrize("data_size", ["100k", "mock100", "mock10"]) def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["evaluation_diversity"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, From 49f874d53f3612bba81f0c9819094631ee245b81 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Thu, 23 Sep 2021 16:08:08 +0000 Subject: [PATCH 05/27] Try mock100 dataset on other NBs --- .../als_deep_dive.ipynb | 406 +++--- .../als_movielens_diversity_metrics.ipynb | 241 +--- .../tuning_spark_als.ipynb | 1281 ++++------------- tests/unit/examples/test_notebooks_pyspark.py | 8 +- tests/unit/examples/test_notebooks_python.py | 12 +- 5 files changed, 522 insertions(+), 1426 deletions(-) diff --git a/examples/02_model_collaborative_filtering/als_deep_dive.ipynb b/examples/02_model_collaborative_filtering/als_deep_dive.ipynb index b633257bff..ce825152fc 100644 --- a/examples/02_model_collaborative_filtering/als_deep_dive.ipynb +++ b/examples/02_model_collaborative_filtering/als_deep_dive.ipynb @@ -2,32 +2,31 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." 
- ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "# Spark Collaborative Filtering (ALS) Deep Dive" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Spark MLlib provides a collaborative filtering algorithm that can be used for training a matrix factorization model, which predicts explicit or implicit ratings of users on items for recommendations.\n", "\n", "This notebook presents a deep dive into the Spark collaborative filtering algorithm." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 1 Matrix factorization algorithm\n", "\n", @@ -54,11 +53,11 @@ "Owing to the term of $q_{i}^{T}p_{u}$ the loss function is non-convex. Gradient descent method can be applied but this will incur expensive computations. An Alternating Least Square (ALS) algorithm was therefore developed to overcome this issue. \n", "\n", "The basic idea of ALS is to learn one of $q$ and $p$ at a time for optimization while keeping the other as constant. This makes the objective at each iteration convex and solvable. The alternating between $q$ and $p$ stops when there is convergence to the optimal. It is worth noting that this iterative computation can be parallelised and/or distributed, which makes the algorithm desirable for use cases where the dataset is large and thus the user-item rating matrix is super sparse (as is typical in recommendation scenarios). A comprehensive discussion of ALS and its distributed computation can be found [here](http://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf)." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 2 Spark Mllib implementation\n", "\n", @@ -67,40 +66,28 @@ "* The uniqueness of ALS implementation is that it distributes the matrix factorization model training by using \"Alternating Least Square\" method. \n", "* In the training method, there are parameters that can be selected to control the model performance.\n", "* Both explicit and implicit ratings are supported by Spark ALS model." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 3 Spark ALS based MovieLens recommender\n", "\n", "In the following code, the MovieLens-100K dataset is used to illustrate the ALS algorithm in Spark." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "**Note**: This notebook requires a PySpark environment to run properly. Please follow the steps in [SETUP.md](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md#dependencies-setup) to install the PySpark environment." 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.5.5 |Anaconda custom (64-bit)| (default, May 13 2018, 21:12:35) \n", - "[GCC 7.2.0]\n", - "Pandas version: 0.23.0\n", - "PySpark version: 2.3.1\n" - ] - } - ], "source": [ "# set the environment path to find Recommenders\n", "import sys\n", @@ -129,21 +116,34 @@ "print(\"System version: {}\".format(sys.version))\n", "print(\"Pandas version: {}\".format(pd.__version__))\n", "print(\"PySpark version: {}\".format(pyspark.__version__))" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "System version: 3.5.5 |Anaconda custom (64-bit)| (default, May 13 2018, 21:12:35) \n", + "[GCC 7.2.0]\n", + "Pandas version: 0.23.0\n", + "PySpark version: 2.3.1\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Data column names" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 20, - "metadata": {}, - "outputs": [], "source": [ + "MOVIELENS_DATA_SIZE = \"100k\"\n", + "\n", "COL_USER = \"UserId\"\n", "COL_ITEM = \"MovieId\"\n", "COL_RATING = \"Rating\"\n", @@ -158,80 +158,84 @@ " StructField(COL_TIMESTAMP, LongType()),\n", " )\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Model hyper parameters - these parameters are selected with reference to the benchmarking results [here](http://mymedialite.net/examples/datasets.html)." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 21, - "metadata": {}, - "outputs": [], "source": [ "RANK = 10\n", "MAX_ITER = 15\n", "REG_PARAM = 0.05" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Number of recommended items" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 22, - "metadata": {}, - "outputs": [], "source": [ "K = 10" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Initialize a Spark session." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 23, - "metadata": {}, - "outputs": [], "source": [ "spark = start_or_get_spark(\"ALS Deep Dive\", memory=\"16g\")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.1 Load and prepare data" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Data is read from csv into a Spark DataFrame." 
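Because the data size is now a notebook parameter, the same loader call should also accept the fake-data keys introduced in this patch series for fast smoke runs. A sketch, reusing the `spark` session and `schema` defined above and assuming the mock path honors the `schema` argument:

```python
# Hypothetical smoke-test override: identical call, mock size key.
dfs_mock = movielens.load_spark_df(spark=spark, size="mock100", schema=schema)
dfs_mock.show(5)
```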
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 24,
- "metadata": {},
+ "source": [
+ "dfs = movielens.load_spark_df(spark=spark, size=MOVIELENS_DATA_SIZE, schema=schema)"
+ ],
 "outputs": [
 {
- "name": "stderr",
 "output_type": "stream",
+ "name": "stderr",
 "text": [
 "../../recommenders/dataset/movielens.py:471: UserWarning: Both schema and header are provided.\n",
 "    The header argument will be ignored.\n",
@@ -240,20 +244,18 @@
 ]
 }
 ],
- "source": [
- "dfs = movielens.load_spark_df(spark=spark, size=\"100k\", schema=schema)"
- ]
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 25,
- "metadata": {
- "scrolled": true
- },
+ "source": [
+ "dfs.show(5)"
+ ],
 "outputs": [
 {
- "name": "stdout",
 "output_type": "stream",
+ "name": "stdout",
 "text": [
 "+------+-------+------+---------+\n",
 "|UserId|MovieId|Rating|Timestamp|\n",
@@ -269,45 +271,43 @@
 ]
 }
 ],
- "source": [
- "dfs.show(5)"
- ]
+ "metadata": {
+ "scrolled": true
+ }
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Data is then randomly split into training and testing sets with a 75-25 ratio."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 26,
- "metadata": {},
- "outputs": [],
 "source": [
 "dfs_train, dfs_test = spark_random_split(dfs, ratio=0.75, seed=42)"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "### 3.2 Train a MovieLens model"
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "It is worth noting that the Spark ALS implementation allows dropping cold users, i.e., users in the testing data that were not seen during training, in order to make sure evaluations on the prediction results are sound."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 27,
- "metadata": {},
- "outputs": [],
 "source": [
 "als = ALS(\n",
 "    maxIter=MAX_ITER, \n",
@@ -320,49 +320,38 @@
 ")\n",
 "\n",
 "model = als.fit(dfs_train)"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "### 3.3 Prediction with the model\n",
 "\n",
 "The trained model can be used to predict ratings for given test data."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 28,
- "metadata": {},
- "outputs": [],
 "source": [
 "dfs_pred = model.transform(dfs_test).drop(COL_RATING)"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "With the prediction results, the model performance can be evaluated."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 29,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "RMSE score = 0.9697095550242029\n",
- "MAE score = 0.7554838330206419\n",
- "R2 score = 0.24874053010909036\n",
- "Explained variance score = 0.2547961843833687\n"
- ]
- }
- ],
 "source": [
 "evaluations = SparkRatingEvaluation(\n",
 "    dfs_test, \n",
@@ -380,23 +369,54 @@
 "    \"Explained variance score = {}\".format(evaluations.exp_var()),\n",
 "    sep=\"\\n\"\n",
 ")"
- ]
+ ],
+ "outputs": [
+ {
+ "output_type": "stream",
+ "name": "stdout",
+ "text": [
+ "RMSE score = 0.9697095550242029\n",
+ "MAE score = 0.7554838330206419\n",
+ "R2 score = 0.24874053010909036\n",
+ "Explained variance score = 0.2547961843833687\n"
+ ]
+ }
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Oftentimes ranking metrics are also of interest to data scientists. 
Note usually ranking metrics apply to the scenario of recommending a list of items. In our case, the recommended items should be different from those that have been rated by the users. " - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 30, - "metadata": {}, + "source": [ + "# Get the cross join of all user-item pairs and score them.\n", + "users = dfs_train.select('UserId').distinct()\n", + "items = dfs_train.select('MovieId').distinct()\n", + "user_item = users.crossJoin(items)\n", + "dfs_pred = model.transform(user_item)\n", + "\n", + "# Remove seen items.\n", + "dfs_pred_exclude_train = dfs_pred.alias(\"pred\").join(\n", + " dfs_train.alias(\"train\"),\n", + " (dfs_pred['UserId'] == dfs_train['UserId']) & (dfs_pred['MovieId'] == dfs_train['MovieId']),\n", + " how='outer'\n", + ")\n", + "\n", + "dfs_pred_final = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[\"train.Rating\"].isNull()) \\\n", + " .select('pred.' + 'UserId', 'pred.' + 'MovieId', 'pred.' + \"prediction\")\n", + "\n", + "dfs_pred_final.show()" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+-------+----------+\n", "|UserId|MovieId|prediction|\n", @@ -427,42 +447,11 @@ ] } ], - "source": [ - "# Get the cross join of all user-item pairs and score them.\n", - "users = dfs_train.select('UserId').distinct()\n", - "items = dfs_train.select('MovieId').distinct()\n", - "user_item = users.crossJoin(items)\n", - "dfs_pred = model.transform(user_item)\n", - "\n", - "# Remove seen items.\n", - "dfs_pred_exclude_train = dfs_pred.alias(\"pred\").join(\n", - " dfs_train.alias(\"train\"),\n", - " (dfs_pred['UserId'] == dfs_train['UserId']) & (dfs_pred['MovieId'] == dfs_train['MovieId']),\n", - " how='outer'\n", - ")\n", - "\n", - "dfs_pred_final = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[\"train.Rating\"].isNull()) \\\n", - " .select('pred.' + 'UserId', 'pred.' + 'MovieId', 'pred.' + \"prediction\")\n", - "\n", - "dfs_pred_final.show()" - ] + "metadata": {} }, { "cell_type": "code", "execution_count": 31, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Precision@k = 0.04061505832449631\n", - "Recall@k = 0.013571438145917577\n", - "NDCG@k = 0.03699684800440573\n", - "Mean average precision = 0.003702411260039904\n" - ] - } - ], "source": [ "evaluations = SparkRankingEvaluation(\n", " dfs_test, \n", @@ -481,11 +470,23 @@ " \"Mean average precision = {}\".format(evaluations.map_at_k()),\n", " sep=\"\\n\"\n", ")" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Precision@k = 0.04061505832449631\n", + "Recall@k = 0.013571438145917577\n", + "NDCG@k = 0.03699684800440573\n", + "Mean average precision = 0.003702411260039904\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.4 Fine tune the model\n", "\n", @@ -498,48 +499,47 @@ "|`maxIters`|Maximum number of iterations|10|The more iterations the better the model converges to the optimal point.|\n", "\n", "It is always a good practice to start model building with default parameter values and then sweep the parameter in a range to find the optimal combination of parameters. The following parameter set is used for training ALS models for comparison study purposes." 
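For intuition, the parameter grid used below is just the Cartesian product of the value lists. A hypothetical equivalent of the repository's `generate_param_grid` helper could look like this (an illustrative stand-in, not the actual implementation):

```python
from itertools import product

def expand_param_grid(param_dict):
    # Expand a dict of value lists into one kwargs dict per combination.
    keys = list(param_dict)
    return [dict(zip(keys, combo)) for combo in product(*param_dict.values())]

grid = expand_param_grid({"rank": [10, 15, 20], "regParam": [0.001, 0.1, 1.0]})
assert len(grid) == 9  # 3 ranks x 3 regularization values
assert grid[0] == {"rank": 10, "regParam": 0.001}
```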
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 32, - "metadata": {}, - "outputs": [], "source": [ "param_dict = {\n", " \"rank\": [10, 15, 20],\n", " \"regParam\": [0.001, 0.1, 1.0]\n", "}" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Generate a dictionary for each parameter combination which can then be fed into model training." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 33, - "metadata": {}, - "outputs": [], "source": [ "param_grid = generate_param_grid(param_dict)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Train models with parameters specified in the parameter grid. Evaluate the model with, for example, the RMSE metric, and then record the metrics for visualization." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 34, - "metadata": {}, - "outputs": [], "source": [ "rmse_score = []\n", "\n", @@ -569,94 +569,98 @@ "\n", "rmse_score = [float('%.4f' % x) for x in rmse_score]\n", "rmse_score_array = np.reshape(rmse_score, (len(param_dict[\"rank\"]), len(param_dict[\"regParam\"]))) " - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 35, - "metadata": {}, - "outputs": [], "source": [ "rmse_df = pd.DataFrame(data=rmse_score_array, index=pd.Index(param_dict[\"rank\"], name=\"rank\"), \n", " columns=pd.Index(param_dict[\"regParam\"], name=\"reg. parameter\"))" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 36, - "metadata": {}, + "source": [ + "fig, ax = plt.subplots()\n", + "sns.heatmap(rmse_df, cbar=False, annot=True, fmt=\".4g\")" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "" ] }, - "execution_count": 36, "metadata": {}, - "output_type": "execute_result" + "execution_count": 36 }, { + "output_type": "display_data", "data": { - "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX8AAAEKCAYAAAD6q1UVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XeYU1X+x/H3yUwywzAFZqjSUVSKCD8BCwsMyoLiosgKLBaKAhYWbCC4soC6riIKu6KsIChFBUR2F8Te2MUyUpQiTao4OCK9OS3J+f2REBhghqySBOZ+Xs+Th+Sec5Pvmct8cnNykjHWWkRExFlcsS5ARESiT+EvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHCg+1gUUZ/e1bfTR47NU5Q82xroE+RX2D8+MdQnyK5Qd8aoJp5/O/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxoPhYF1AalL1nKJ4Wl+Pft5f9A/qc0O6+rCVJt9wO1g8+H4cnPYd3zSoAEq7qQJnuPQHInT2d/I/eA8DTqi1lut8KLheFS7L4+eUXojcgB+jQPpOxYx8lzuXipZdn8tSY54u016xZjcmTxlKhYjp79+yjZ+9BbN+ew8UXN+T58U+QkpqMz+fjiSfHM2fOfADaZrZk9Og/4/G4+eqrVfTr/wA+n49OndrzyKgh+P0Wr9fLAw+M5LPPl8Ri2KWOp1M/4s9vij18gNwXhp3QHnf+JXja3oi1Fvw+Ct6bgf/7b3HVboCn/S2hfq4KVcmf+xy+9ctw1WmIp10PMC4oyCN/3kTs3h3RHFZUGGttrGs4qd3XtjkzCzuJ+IaNsXm5JN//p5OGP4llIC8XgLjadUkZNop9d/bEJKeQ9vdJ7L+nP2BJ+/uL7L+nHxgXac9OZv89/bAH9lP2vofI//g9vCu+iu7AfqHKH2yMdQklcrlcrF29iKs79iA7O4esL97mllvvZu3aDaE+s2ZO5K23P2TGjDm0zWxJr17d6d1nEPXq1cVay8aNW6hatTKLs96hUeNMDhw4yOaNi2l/dXc2bNjMqJGD+e67bF6eOouyZZM4fPhnAC66qD4zX3uBRhe1idXwT2n/8MxYlxA2V80LoSCPhM53njT8cSdAYT4AplINEm8cRO6EIUX7JJYlaeBYfh43ELwFlBnwNHmzx2J3/UB8s3a4zjmXgvkTozCa06PsiFdNOP007XMaeFevxB48WHyHYPADmMQyHHlWc1/SgsKvl2IPHcQeOkTh10txX3Iprirn4Pvhe+yB/QAULl9GQsszNyzONi2aN2XTpq1s2bKNwsJCXn99Htd16lCkT/369fj4408B+GThZ1zXqT0AGzZsZuPGLQDk5Ozgp527qVgxg4yM8uTn57Nhw2YAPvzwv3S5oSNAKPgByiYlcaaecJ2N/NvWYXMPFd8hGPwAxpMAJ/nZxzdogW/jCvAWBDZYi0koE9gnIQl7aO9prflMoWmfKPFc3oqkXv0w5cpzcFTgDMWVUQH/zp9Cffy7duLKqEDhsi+Jq14TV6Uq+HftxHP5bzDx7liVXuqcU60K32f/ELqdvT2HFs2bFumzcuUautzQkfHPTaFz52tITU0hPb08e/YcDYLmzZrg8bjZtGkr1lrcbjeX/F9jln21ki5drqV6jXNCfa+//moe/8tDVKqYwXXX94r8ICUk7oJmeK7qjimbSt7MMSe0xze8nMKsd0K38xdMJrHHEKy3EPJzyZ0yMprlRo3O/KOk4ItF7LuzJwcfe5gyt94W3HqSV2fWYg8d4vDz40geNpLUp8bj3/Ej+HxRrbc0M+bEn/vxZ+MPDn2M1q0vY8ni92jd6jKys3Pwer2h9ipVKjF16rP07Xt/aN+bb7mbZ54exRefLeDQocN4vUeP2bx579Loojb8/sbbeWTUcdMOElG+9UvJnTCEvNnj8GR2LdJmksvhqlQD36aVoW3uS68hb+YYcv82EO/y/+Bpf3O0S46KiJz5G2PSgIeAzkDF4OafgHnAk9bafcXs1x/oD/BMo3r0qlk1EuXFlHf1SuKqVMOkpuHfvRP3RU1Cba4KFSlctRyAwsWfU7j4cwASru4Efn9M6i2NtmfnUKP60bPy6tWqkpNT9A29nJwddO3WD4CyZZPocsO1HDgQmNpLSUlm/rzpjBj5FF8uPvo+TNaXy8i8sgsAv23Xmnr16p7w2Is+/ZK6dWuRkVGe3btL53TCmcq/bR2mfCUokwzBqaK4BpfiXbcU/MEn6qQUXJVr4t++CQDv6iwSbx4aq5IjKlJn/q8De4FMa22GtTYDaBvcNqe4nay1k6y1zay1zUpT8LuqVgtdjzu3HiY+HntgP4XLFuNu2hyTnIxJTsbdtDmFyxYDYNLKBf5NTibx2uvJe29BTGovjZYsXc5559Whdu0auN1uunW7njcXvF+kT0ZG+dArhGFDBzJ12iwA3G43c+dM4ZVX3mDu3KLHpGLFDAA8Hg9DBg9g0qQZAJx7bu1Qn6ZNGuHxuBX8UWLKVw5dd1WpDXHxoeAHiG90Bd7VXxzdIfcwJjEJk14FgLi6jfDv2h6tcqMqUnP+ta21o4/dYK39ERhtjLmtmH3OWskPjsB9URNMahrlps0h99WXA//JgPx35uNp2ZqEKzuAz4vNL+Dg6EcAsIcOkjtrOmnjAisJcmdOwx4KnF2WvWMQcXXODW33/5Adg5GVTj6fj3vuHc7bb71GnMvF1GmzWbPmW0aNHMzSZStYsOAD2rS5gscfewiLZdGiLAYOehiArl070arVpaRnlKdnz24A3N73PlasWM3g+++i47XtcLlcTJw4nU8WfgZAlxs6csstN1JY6CUvN4+bbr4rZmMvbRK6DMBVqz4mKYUy946ncOEbod8977KPiK/fnPjGrbB+H3gLyJ87PrSvSauASU3Hv3Xt0Tu0fvLfnExi13ux1g95h8mfPynaw4qKiCz1NMa8D3wITLPW7ghuqwz0Bn5rrW13qvs4m5Z6SlFn+lJPKdnZtNRTThTrpZ7dgQzgP8aYPcaYPcBCIB3oWtKOIiISeRGZ9rHW7gWGBi9FGGP6AC9H4nFFRCQ8sVjq+UgMHlNERI4RqaWeK4trAioX0yYiIlESqdU+lYEOBJZ2HssAn0foMUVEJEyRCv8FQLK1dvnxDcaYhRF6TBERCVOk3vC9vYS2myLxmCIiEj59t4+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQ
dS+IuIOJDCX0TEgRT+IiIOpPAXEXGg+FgXUJzUaS/HugT5pc5pFesKROQUdOYvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOFBY4W+MSTjJtvTTX46IiERDuGf+/zTGuI/cMMZUBT6ITEkiIhJp4Yb/v4E5xpg4Y0xt4D3goUgVJSIikRUfTidr7YvGGA+BJ4HawB3W2s8jWZiIiEROieFvjLn/2JtADWA5cJkx5jJr7dhIFne2GP7Xsfz3s8Wkly/Hv1954YT2xV+tZNCwR6hWtQoA7dpcwV233Ux+fgG9BgyhoLAQn9fHb9v+hj/2vRWArKVf88zzU/D7LUlJiTz+8APUrH5OVMdVmnVon8nYsY8S53Lx0sszeWrM80Xaa9asxuRJY6lQMZ29e/bRs/cgtm/P4eKLG/L8+CdISU3G5/PxxJPjmTNnPgBtM1syevSf8XjcfPXVKvr1fwCfz0e5cmlMfvEZ6tatRX5ePn37P8Dq1etjMexSx9OpH/HnN8UePkDuC8NOaI87/xI8bW/EWgt+HwXvzcD//be4ajfA0/6WUD9Xharkz30O3/pluOo0xNOuBxgXFOSRP28idu+OaA4rKoy1tvhGY0aWtLO19pHTXlFQ4a7NxRd2hlm6fBVJZcrwp8eeLjb8p86cy4QxRX9c1lpyc/NISipDoddLz7sGM+yeO7i4UX2u/UNfnn1yBOfWrsmsfy5g1Zr1PD78gWgN6Vcpc06rWJdQIpfLxdrVi7i6Yw+ys3PI+uJtbrn1btau3RDqM2vmRN56+0NmzJhD28yW9OrVnd59BlGvXl2stWzcuIWqVSuzOOsdGjXO5MCBg2zeuJj2V3dnw4bNjBo5mO++y+blqbMY/cRwDh0+zGN/GccFF5zL+L//lfZXd4/hT6Bk+4dnxrqEsLlqXggFeSR0vvOk4Y87AQrzATCVapB44yByJwwp2iexLEkDx/LzuIHgLaDMgKfJmz0Wu+sH4pu1w3XOuRTMnxiF0ZweZUe8asLpV+KZfyTDvTRp1uQituf872cGxhiSksoA4PV68Xq9GBM4bgY4fPhnAA4eOkzFChmnrV6na9G8KZs2bWXLlm0AvP76PK7r1KFI+NevX48HBo8C4JOFnzH3jSkAbNiwOdQnJ2cHP+3cTcWKGbjd8eTn54faP/zwvwx98I+8PHUW9eufz+inxgOwfv0matWqTqVKFfjpp13RGG6p5t+2DpNWofgOweAHMJ4EOMnJbnyDFvg2rgBvQWCDtZiEMljAJCRhD+09zVWfGcKa8zfGnA8MJjDfH9rHWntlZMoqfVZ8s5Yuve6mUoUMBg/oy3l1awHg8/nodtsgtm3/gR5dfkfjhhcC8Miwe7lr8AgSEzyULZvEa5PGxbL8UuWcalX4PvuH0O3s7Tm0aN60SJ+VK9fQ5YaOjH9uCp07X0Nqagrp6eXZs+doEDRv1gSPx82mTVux1uJ2u7nk/xqz7KuVdOlyLdVrBKbpVq5aww2dO/LZ50to3qwJtWpVp3q1qgr/KIm7oBmeq7pjyqaSN3PMCe3xDS+nMOud0O38BZNJ7DEE6y2E/Fxyp5Q4AXLWCne1zxzga2A4MOSYi4ShwQXn8sHcafxz2gRu+n0nBj30aKgtLi6OudOe56N/zWDVmm/ZsHkrANNn/4t/PP0oH/37FTp3bM9Tz74Yo+pLnyOvro51/PTng0Mfo3Xry1iy+D1at7qM7OwcvF5vqL1KlUpMnfosffveH9r35lvu5pmnR/HFZws4dOgwXq8PgNFPPUe58mksXfI+AwbcxtfLv8Hr80VwhHIs3/ql5E4YQt7scXgyuxZpM8nlcFWqgW/TytA296XXkDdzDLl/G4h3+X/wtL852iVHRbjh77XW/sNau9hau+zIpbjOxpirj7meZoyZYoxZaYx5zRhTuYT9+htjlhpjlk6ePvN/GMaZLbls2dD0TusrWuD1etm7b3+RPqkpyTT/v8Z8mrWUPXv3sX7j5tCrgGuuas3yb9ZEve7Sant2DjWOefO8erWq5Bw3bZeTs4Ou3frRvEUH/jxiNAAHDhwEICUlmfnzpjNi5FN8ufir0D5ZXy4j88ouXN7ydyxalMXGjVsAOHjwEH373U+z5u3p3WcQFStkhKacJHr829ZhyleCMsmhbXENLsW7bin4g0/GSSm4KtfEv30TAN7VWcTVOD8W5UZcuOH/pjHmbmNMVWNM+pFLCf3/esz1Z4AcoBOwBCj2nRNr7SRrbTNrbbO+PXuEWdqZb9fuPaGzw1Vr1uO3lnJpqezZu48DBw8BkJefT9aSr6lTqwapKSkcOvwzW7dlA/D5kq+pW6tmzOovbZYsXc5559Whdu0auN1uunW7njcXvF+kT0ZG+dArhGFDBzJ12iwA3G43c+dM4ZVX3mDu3AVF9qlYMfC+jMfjYcjgAUyaNAOAtLRU3O7AZyRvv+0mFn36JQeDx10iy5Q/eq7pqlIb4uIh9+jPPr7RFXhXf3F0h9zDmMQkTHpgZV5c3Ub4d22PVrlRFdacP9Ar+O+xUz0WqBvGvs2stU2C18cZY3qV2PssNGTkkyz5eiX79h3gqs63cPftt4amCLrfcC3vf/Ips//1FnHxcSR6PIx5ZBjGGHbu3svDf3kan9+P9Vs6XNmKzJaXAjBq6CDue/hxjMuQmpLMYw/dF8shlio+n4977h3O22+9RpzLxdRps1mz5ltGjRzM0mUrWLDgA9q0uYLHH3sIi2XRoiwGDnoYgK5dO9Gq1aWkZ5SnZ89uANze9z5WrFjN4PvvouO17XC5XEycOJ1PFn4GQP0L6/HyS3/H5/exdu239Os/OGZjL20SugzAVas+JimFMveOp3DhG4GAB7zLPiK+fnPiG7fC+n3gLSB/7vjQviatAiY1Hf/WtUfv0PrJf3MyiV3vxVo/5B0mf/6kaA8rKkpc6vmL79SYbGAsgUUrA4BzbfCBjDErrbWNT3UfZ9NSTynqTF/qKSU7m5Z6yolOy1LPYxljGgENgMQj26y104vp/iKQErw+DagA7DTGVCHwITEREYmhcJd6jgQyCYT/28A1wKfAScO/uM8HWGt/NMZ88osqFRGR0ybcN3xvBK4CfrTW9gEuBk74mucw6YNjIiIxFu60T5611m+M8RpjUoGfKOHNXmPMyuKagGKXeoqISHScMvxNYL3bSmNMOQJz+cuAQ8DiEnarDHQAjv9ctAH0baAiIjF2yvC31lpjTBNr7T7gBWPMu0Cqtba4s3uABUCytfaEN3eNMQt/cbUiInJahDvtk2WMaW6tXWKt3Xqqztba20touync4kREJDLCDf+2wB3GmO+AwwSmb2w46/VFROTME274XxPRKkREJKrC/TOO30W6EBERiZ5w1/mLiEgpovAXEXEghb+IiAMp/EVEH
EjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQeKj3UBxfL7Yl2BiEippTN/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBwoPtYFlAbDn/gb//18Cenl0/j39AkntC/+eiWDHvoL1apWBqBd6yu4q08P8vML6DVwKAUFhfh8fn6b2ZI/3n4zAA8/Po6lK74huWwSAI//6T4urFc3eoMq5Tq0z2Ts2EeJc7l46eWZPDXm+SLtNWtWY/KksVSomM7ePfvo2XsQ27fncPHFDXl+/BOkpCbj8/l44snxzJkzH4Ar2/6GJ58cjsvl4vChw9zW9z42bdrKvff057bbeuD1etm1cw99+9/Ptm3bYzHsUsfTqR/x5zfFHj5A7gvDTmiPO/8SPG1vxFoLfh8F783A//23uGo3wNP+llA/V4Wq5M99Dt/6ZbjqNMTTrgcYFxTkkT9vInbvjmgOKyqMtTbWNZxU4U8bzszCTmLp8m9IKpPInx4fW2z4T535LyY8NbLIdmstubl5JCWVodDrpefdDzLsnv5c3PBCHn58HG2uaE77tr+J1jBOmzLVM2NdQolcLhdrVy/i6o49yM7OIeuLt7nl1rtZu3ZDqM+smRN56+0PmTFjDm0zW9KrV3d69xlEvXp1sdayceMWqlatzOKsd2jUOJP9+w+wZvUiuvy+D+vWbeTOO3rRvHkTbu97H5ltruDLxV+Rm5vHHf170qbN5dx0810x/AmUbP/wzFiXEDZXzQuhII+EzneeNPxxJ0BhPgCmUg0SbxxE7oQhRfskliVp4Fh+HjcQvAWUGfA0ebPHYnf9QHyzdrjOOZeC+ROjMJrTo+yIV004/TTtcxo0a9KItNSU/3k/YwxJSWUA8Hq9eL0+DGEdN/kVWjRvyqZNW9myZRuFhYW8/vo8ruvUoUif+vXr8fHHnwLwycLPuK5TewA2bNjMxo1bAMjJ2cFPO3dTsWIGEHgyT00J/D9IS0shJydwtrjwP5+Tm5sHwJeLl1G9WtXID9Ih/NvWYXMPFd8hGPwAxpMAJznZjW/QAt/GFeAtCGywFpMQ+L00CUnYQ3tPa81nCk37RMmK1evo0vuPVKqQweABt3FenVoA+Hw+uvW9l23bc+hxw7U0bnhBaJ9nX5zBP6bO4rJLLua+O3vj8bhjVX6pck61Knyf/UPodvb2HFo0b1qkz8qVa+hyQ0fGPzeFzp2vITU1hfT08uzZczQImjdrgsfjZtOmrQDcccdg3pw/g9zcPA4cPEjL33Q64bH79O7Bu+99EpmByUnFXdAMz1XdMWVTyZs55oT2+IaXU5j1Tuh2/oLJJPYYgvUWQn4uuVNGnrBPaRCRM39jTJox5kljzDpjzO7gZW1wW7lIPOaZrMH55/HBnJf459TnuOn3v2PQn/4SaouLi2Puy+P5aO5UVq39lg2btwJw7x29ePPVF5j94jj2HzzIlFffiFH1pY8xJ766On7688Ghj9G69WUsWfwerVtdRnZ2Dl6vN9RepUolpk59lr597w/te889/eh03a3UrtuMadNm8/SYoqFx001daHbJxTz9zD8iMCopjm/9UnInDCFv9jg8mV2LtJnkcrgq1cC3aWVom/vSa8ibOYbcvw3Eu/w/eNrfHO2SoyJS0z6vA3uBTGtthrU2A2gb3DanuJ2MMf2NMUuNMUsnT58VodKiL7lsUmh6p/XlzfF6fezdt79In9SUZJo3vYhPv/wKgIoV0jHG4PG46dyxHavWfhv1ukur7dk51Kh+Tuh29WpVQ1M0R+Tk7KBrt340b9GBP48YDcCBAwcBSElJZv686YwY+RRfLg4crwoV0ml8UQMWL/kagNfnzOfyy5uF7u+qK1vx0LBBdO7Sm4KCgoiOT07Ov20dpnwlKJMc2hbX4FK865aC3xfYkJSCq3JN/Ns3AeBdnUVcjfNjUW7ERSr8a1trR1trfzyywVr7o7V2NFCzuJ2stZOstc2stc369vxDhEqLvl2794bODletWY/fbymXlsqevfs5cDAwX5mXn0/W0uXUqVkdgJ279gCBM9KPF2VRr26t2BRfCi1ZupzzzqtD7do1cLvddOt2PW8ueL9In4yM8qFXCMOGDmTqtMDJiNvtZu6cKbzyyhvMnbsg1H/v3v2kpaVSL7giq91VrVm3LvAGcpMmDZnw/JPc0KUPO3fujsYQJciUrxy67qpSG+Li4Zj3COIbXYF39RdHd8g9jElMwqRXASCubiP8u0rnyqxIzfl/Z4x5EJhmrd0BYIypDPQGvo/QY8bMkFFPseTrVezbf4CruvTi7ttuDk0RdO/ckfcXfsrsf79DXJyLxIQExox6EGMMO3fv4eG/jsPn82Otnw5tW5HZsgUAQx97mr379mOt5YLz6jJy8IBYDrFU8fl83HPvcN5+6zXiXC6mTpvNmjXfMmrkYJYuW8GCBR/Qps0VPP7YQ1gsixZlMXDQwwB07dqJVq0uJT2jPD17dgPg9r73sWLFau64awivz56E32/Zt3cfffs/AMDoJ/5McnJZZs0MrBj5/vvt3NClT2wGX8okdBmAq1Z9TFIKZe4dT+HCNwIBD3iXfUR8/ebEN26F9fvAW0D+3PGhfU1aBUxqOv6ta4/eofWT/+ZkErvei7V+yDtM/vxJ0R5WVERkqacxpjwwDLgeqAxYYAcwHxhtrd1zqvs4m5Z6SlFn+lJPKdnZtNRTThTuUs+InPlba/caY14GPgCyrLWh11nGmKuBdyPxuCIiEp5IrfYZBMwD/gh8Y4y5/pjmv0biMUVEJHyRmvPvB1xirT1kjKkNvGGMqW2t/TvoU0wiIrEWqfCPOzLVY63daozJJPAEUAuFv4hIzEVqqeePxpgmR24Enwh+B1QALorQY4qISJgiFf49gR+P3WCt9VprewKtI/SYIiISpkit9skuoe2zSDymiIiET9/qKSLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQMZaG+saHMkY099aOynWdcgvo+N39tKxC9CZf+z0j3UB8qvo+J29dOxQ+IuIOJLCX0TEgRT+seP4OceznI7f2UvHDr3hKyLiSDrzFxFxIIX/aWCMudoYs94Ys9EYM+wk7QnGmNnB9i+NMbWPaXsouH29MabDMdtfMsb8ZIz5JjqjkOOF
cVxbG2O+MsZ4jTE3xqJGOblT/f6YgGeDx3alMeb/ol1jrCn8fyVjTBzwPHAN0ADoYYxpcFy324G91trzgHHA6OC+DYA/AA2Bq4EJwfsDmBrcJjEQ5nHdBvQGXotudRKGqZT8+3MNUC946Q/8Iwo1nVEU/r9eC2CjtXaztbYAmAVcf1yf64FpwetvAFcZY0xw+yxrbb61dguwMXh/WGv/C+yJxgDkpE55XK21W621KwF/LAqU4oXx+3M9MN0GZAHljDFVo1PdmUHh/+tVA74/5nZ2cNtJ+1hrvcB+ICPMfSU2dGxKN8cfX4X/r2dOsu34JVTF9QlnX4kNHZvSzfHHV+H/62UDNY65XR34obg+xph4II3AS9Jw9pXY0LEp3Rx/fBX+v94SoJ4xpo4xxkPgDdz5x/WZD/QKXr8R+NgGPmAxH/hDcDVQHQJvPi2OUt1SsnCOq5xhr+LzAAADzklEQVS95gM9g6t+LgP2W2tzYl1UNMXHuoCznbXWa4z5I/AeEAe8ZK1dbYx5FFhqrZ0PTAFmGGM2Ejjj/0Nw39XGmNeBNYAXGGCt9QEYY2YCmUAFY0w2MNJaOyXKw3OscI6rMaY58C+gPNDJGPOItbZhDMuWoJP9/gBuAGvtC8DbQEcCiyx+BvrEptLY0Sd8RUQcSNM+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/kTOQMaa3MeacWNchpZfCX84awQ/knDH/Z4Of1o6U3sD/FP4RrkdKGa3zlzNa8G8fvAN8AlwOdAYuAB4BEoBNQB9r7SFjTEdgLLAL+Aqoa6393Snu+13gS6Ap8C3Q01r7szFmBNAJKAN8DtxhrbXGmIXB2y0JfEr0W2A44AF2Azdba3cYY0YBdYCqwPnA/cBlBL5KeDvQyVpbaIy5JFhzcrDu3sH7nhrslxscd4Pj+1lrc46vx1r7zP/y8xUHs9bqossZewFqE/jK5MuCtysA/wXKBm8PBUYAiQS+pbFOcPtMYEEY922BlsHbLwGDg9fTj+k3g0BYAywEJhzTVp6jJ1F9gWeC10cBnxL4VOnFBD5Fek2w7V8EnsTcBIK7YnB7dwKfJD7yOM2C10/Vb0JJ49RFl5Nd9DJRzgbf2cB3rkPg7LkB8FngTyLgAb4ALgQ228DfRYBA+PcP476/t9Z+Frz+CjAIeBpoa4x5EEgC0oHVwJvBfrOP2b86MDv4XfAeYMsxbe/YwNn9KgJfEfFucPsqAk88FwCNgA+CY4kDTvb9MqfqN/sk+4iUSOEvZ4PDx1w3wAfW2h7HdjDGNP2F9338vKc1xiQCEwiceX8fnMJJLKae8cBYG/iun0wCZ/xH5ANYa/3GmEJr7ZHH8hP43TPAamvt5aeo8VT9DhezXaRYZ8ybZyJhygJaGmPOAzDGJBljzgfWAXWP+fvI3cO8v5rGmCOh2oPAVM2RoN9ljEkm8E2sxUkjMDcPR7+5NVzrgYpHHt8Y4zbGHPliuINAShj9RH4Rhb+cVay1Owm8KTrTGLOSwJPBhdbaXOBu4F1jzKfADgJ/MQ1jTDNjzORi7nIt0Ct4X+nAP6y1+4AXCUzP/JvA1zsXZxQwxxiziMAbsf/LWAoIPLGMNsasAJYDVwSbpwIvGGOWE5jmKa6fyC+i1T5Sahhjkm1g1Y8h8MfXN1hrx5XQvzaBN4UbRalEkTOGzvylNOkXPFNeTWA6ZmKM6xE5Y+nMX0TEgXTmLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxoP8HkqdScClQ374AAAAASUVORK5CYII=\n", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAX8AAAEKCAYAAAD6q1UVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XeYU1X+x/H3yUwywzAFZqjSUVSKCD8BCwsMyoLiosgKLBaKAhYWbCC4soC6riIKu6KsIChFBUR2F8Te2MUyUpQiTao4OCK9OS3J+f2REBhghqySBOZ+Xs+Th+Sec5Pvmct8cnNykjHWWkRExFlcsS5ARESiT+EvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHCg+1gUUZ/e1bfTR47NU5Q82xroE+RX2D8+MdQnyK5Qd8aoJp5/O/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxoPhYF1AalL1nKJ4Wl+Pft5f9A/qc0O6+rCVJt9wO1g8+H4cnPYd3zSoAEq7qQJnuPQHInT2d/I/eA8DTqi1lut8KLheFS7L4+eUXojcgB+jQPpOxYx8lzuXipZdn8tSY54u016xZjcmTxlKhYjp79+yjZ+9BbN+ew8UXN+T58U+QkpqMz+fjiSfHM2fOfADaZrZk9Og/4/G4+eqrVfTr/wA+n49OndrzyKgh+P0Wr9fLAw+M5LPPl8Ri2KWOp1M/4s9vij18gNwXhp3QHnf+JXja3oi1Fvw+Ct6bgf/7b3HVboCn/S2hfq4KVcmf+xy+9ctw1WmIp10PMC4oyCN/3kTs3h3RHFZUGGttrGs4qd3XtjkzCzuJ+IaNsXm5JN//p5OGP4llIC8XgLjadUkZNop9d/bEJKeQ9vdJ7L+nP2BJ+/uL7L+nHxgXac9OZv89/bAH9lP2vofI//g9vCu+iu7AfqHKH2yMdQklcrlcrF29iKs79iA7O4esL97mllvvZu3aDaE+s2ZO5K23P2TGjDm0zWxJr17d6d1nEPXq1cVay8aNW6hatTKLs96hUeNMDhw4yOaNi2l/dXc2bNjMqJGD+e67bF6eOouyZZM4fPhnAC66qD4zX3uBRhe1idXwT2n/8MxYlxA2V80LoSCPhM53njT8cSdAYT4AplINEm8cRO6EIUX7JJYlaeBYfh43ELwFlBnwNHmzx2J3/UB8s3a4zjmXgvkTozCa06PsiFdNOP007XMaeFevxB48WHyHYPADmMQyHHlWc1/SgsKvl2IPHcQeOkTh10txX3Iprirn4Pvhe+yB/QAULl9GQsszNyzONi2aN2XTpq1s2bKNwsJCXn99Htd16lCkT/369fj4408B+GThZ1zXqT0AGzZsZuPGLQDk5Ozgp527qVgxg4yM8uTn57Nhw2YAPvzwv3S5oSNAKPgByiYlcaaecJ2N/NvWYXMPFd8hGPwAxpMAJ/nZxzdogW/jCvAWBDZYi0koE9gnIQl7aO9prflMoWmfKPFc3oqkXv0w5cpzcFTgDMWVUQH/zp9Cffy7duLKqEDhsi+Jq14TV6Uq+HftxHP5bzDx7liVXuqcU60K32f/ELqdvT2HFs2bFumzcuUautzQkfHPTaFz52tITU0hPb08e/YcDYLmzZrg8bjZtGkr1lrcbjeX/F9jln21ki5drqV6jXNCfa+//moe/8tDVKqYwXXX94r8ICUk7oJmeK7qjimbSt7MMSe0xze8nMKsd0K38xdMJrHHEKy3EPJzyZ0yMprlRo3O/KOk4ItF7LuzJwcfe5gyt94W3HqSV2fWYg8d4vDz40geNpLUp8bj3/Ej+HxRrbc0M+bEn/vxZ+MPDn2M1q0vY8ni92jd6jKys3Pwer2h9ipVKjF16rP07Xt/aN+bb7mbZ54exRefLeDQocN4vUeP2bx579Loojb8/sbbeWTUcdMOElG+9UvJnTCEvNnj8GR2LdJmksvhqlQD36aVoW3uS68hb+YYcv82EO/y/+Bpf3O0S46KiJz5G2PSgIeAzkDF4OafgHnAk9bafcXs1x/oD/BMo3r0qlk1EuXFlHf1SuKqVMOkpuHfvRP3RU1Cba4KFSlctRyAwsWfU7j4cwASru4Efn9M6i2NtmfnUKP60bPy6tWqkpNT9A29nJwddO3WD4CyZZPocsO1HDgQmNpLSUlm/rzpjBj5FF8uPvo+TNaXy8i8sgsAv23Xmnr16p7w2Is+/ZK6dWuRkVGe3btL53TCmcq/bR2mfCUokwzBqaK4BpfiXbcU/MEn6qQUXJVr4t++CQDv6iwSbx4aq5IjKlJn/q8De4FMa22GtTYDaBvcNqe4nay1k6y1zay1zUpT8LuqVgtdjzu3HiY+HntgP4XLFuNu2hyTnIxJTsbdtDmFyxYDYNLKBf5NTibx2uvJe29BTGovjZYsXc5559Whdu0auN1uunW7njcXvF+kT0ZG+dArhGFDBzJ12iwA3G43c+dM4ZVX3mDu3KLHpGLFDAA8Hg9DBg9g0qQZAJx7bu1Qn6ZNGuHxuBX8UWLKVw5dd1WpDXHxoeAHiG90Bd7VXxzdIfcwJjEJk14FgLi6jfDv2h6tcqMqUnP+ta21o4/dYK39ERhtjLmtmH3OWskPjsB9URNMahrlps0h99WXA//JgPx35uNp2ZqEKzuAz4vNL+Dg6EcAsIcOkjtrOmnjAisJcmdOwx4KnF2WvWMQcXXODW33/5Adg5GVTj6fj3vuHc7bb71GnMvF1GmzWbPmW0aNHMzSZStYsOAD2rS5gscfewiLZdGiLAYOehiArl070arVpaRnlKdnz24A3N73PlasWM3g+++i47XtcLlcTJw4nU8WfgZAlxs6csstN1JY6CUvN4+bbr4rZmMvbRK6DMBVqz4mKYUy946ncOEbod8977KPiK/fnPjGrbB+H3gLyJ87PrSvSauASU3Hv3Xt0Tu0fvLfnExi13ux1g95h8mfPynaw4qKiCz1NMa8D3wITLPW7ghuqwz0Bn5rrW13qvs4m5Z6SlFn+lJPKdnZtNRTThTrpZ7dgQzgP8aYPcaYPcBCIB3oWtKOIiISeRGZ9rHW7gWGBi9FGGP6AC9H4nFFRCQ8sVjq+UgMHlNERI4RqaWeK4trAioX0yYiIlESqdU+lYEOBJZ2HssAn0foMUVEJEyRCv8FQLK1dvnxDcaYhRF6TBERCVOk3vC9vYS2myLxmCIiEj59t4+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQ
dS+IuIOJDCX0TEgRT+IiIOpPAXEXGg+FgXUJzUaS/HugT5pc5pFesKROQUdOYvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOFBY4W+MSTjJtvTTX46IiERDuGf+/zTGuI/cMMZUBT6ITEkiIhJp4Yb/v4E5xpg4Y0xt4D3goUgVJSIikRUfTidr7YvGGA+BJ4HawB3W2s8jWZiIiEROieFvjLn/2JtADWA5cJkx5jJr7dhIFne2GP7Xsfz3s8Wkly/Hv1954YT2xV+tZNCwR6hWtQoA7dpcwV233Ux+fgG9BgyhoLAQn9fHb9v+hj/2vRWArKVf88zzU/D7LUlJiTz+8APUrH5OVMdVmnVon8nYsY8S53Lx0sszeWrM80Xaa9asxuRJY6lQMZ29e/bRs/cgtm/P4eKLG/L8+CdISU3G5/PxxJPjmTNnPgBtM1syevSf8XjcfPXVKvr1fwCfz0e5cmlMfvEZ6tatRX5ePn37P8Dq1etjMexSx9OpH/HnN8UePkDuC8NOaI87/xI8bW/EWgt+HwXvzcD//be4ajfA0/6WUD9Xharkz30O3/pluOo0xNOuBxgXFOSRP28idu+OaA4rKoy1tvhGY0aWtLO19pHTXlFQ4a7NxRd2hlm6fBVJZcrwp8eeLjb8p86cy4QxRX9c1lpyc/NISipDoddLz7sGM+yeO7i4UX2u/UNfnn1yBOfWrsmsfy5g1Zr1PD78gWgN6Vcpc06rWJdQIpfLxdrVi7i6Yw+ys3PI+uJtbrn1btau3RDqM2vmRN56+0NmzJhD28yW9OrVnd59BlGvXl2stWzcuIWqVSuzOOsdGjXO5MCBg2zeuJj2V3dnw4bNjBo5mO++y+blqbMY/cRwDh0+zGN/GccFF5zL+L//lfZXd4/hT6Bk+4dnxrqEsLlqXggFeSR0vvOk4Y87AQrzATCVapB44yByJwwp2iexLEkDx/LzuIHgLaDMgKfJmz0Wu+sH4pu1w3XOuRTMnxiF0ZweZUe8asLpV+KZfyTDvTRp1uQituf872cGxhiSksoA4PV68Xq9GBM4bgY4fPhnAA4eOkzFChmnrV6na9G8KZs2bWXLlm0AvP76PK7r1KFI+NevX48HBo8C4JOFnzH3jSkAbNiwOdQnJ2cHP+3cTcWKGbjd8eTn54faP/zwvwx98I+8PHUW9eufz+inxgOwfv0matWqTqVKFfjpp13RGG6p5t+2DpNWofgOweAHMJ4EOMnJbnyDFvg2rgBvQWCDtZiEMljAJCRhD+09zVWfGcKa8zfGnA8MJjDfH9rHWntlZMoqfVZ8s5Yuve6mUoUMBg/oy3l1awHg8/nodtsgtm3/gR5dfkfjhhcC8Miwe7lr8AgSEzyULZvEa5PGxbL8UuWcalX4PvuH0O3s7Tm0aN60SJ+VK9fQ5YaOjH9uCp07X0Nqagrp6eXZs+doEDRv1gSPx82mTVux1uJ2u7nk/xqz7KuVdOlyLdVrBKbpVq5aww2dO/LZ50to3qwJtWpVp3q1qgr/KIm7oBmeq7pjyqaSN3PMCe3xDS+nMOud0O38BZNJ7DEE6y2E/Fxyp5Q4AXLWCne1zxzga2A4MOSYi4ShwQXn8sHcafxz2gRu+n0nBj30aKgtLi6OudOe56N/zWDVmm/ZsHkrANNn/4t/PP0oH/37FTp3bM9Tz74Yo+pLnyOvro51/PTng0Mfo3Xry1iy+D1at7qM7OwcvF5vqL1KlUpMnfosffveH9r35lvu5pmnR/HFZws4dOgwXq8PgNFPPUe58mksXfI+AwbcxtfLv8Hr80VwhHIs3/ql5E4YQt7scXgyuxZpM8nlcFWqgW/TytA296XXkDdzDLl/G4h3+X/wtL852iVHRbjh77XW/sNau9hau+zIpbjOxpirj7meZoyZYoxZaYx5zRhTuYT9+htjlhpjlk6ePvN/GMaZLbls2dD0TusrWuD1etm7b3+RPqkpyTT/v8Z8mrWUPXv3sX7j5tCrgGuuas3yb9ZEve7Sant2DjWOefO8erWq5Bw3bZeTs4Ou3frRvEUH/jxiNAAHDhwEICUlmfnzpjNi5FN8ufir0D5ZXy4j88ouXN7ydyxalMXGjVsAOHjwEH373U+z5u3p3WcQFStkhKacJHr829ZhyleCMsmhbXENLsW7bin4g0/GSSm4KtfEv30TAN7VWcTVOD8W5UZcuOH/pjHmbmNMVWNM+pFLCf3/esz1Z4AcoBOwBCj2nRNr7SRrbTNrbbO+PXuEWdqZb9fuPaGzw1Vr1uO3lnJpqezZu48DBw8BkJefT9aSr6lTqwapKSkcOvwzW7dlA/D5kq+pW6tmzOovbZYsXc5559Whdu0auN1uunW7njcXvF+kT0ZG+dArhGFDBzJ12iwA3G43c+dM4ZVX3mDu3AVF9qlYMfC+jMfjYcjgAUyaNAOAtLRU3O7AZyRvv+0mFn36JQeDx10iy5Q/eq7pqlIb4uIh9+jPPr7RFXhXf3F0h9zDmMQkTHpgZV5c3Ub4d22PVrlRFdacP9Ar+O+xUz0WqBvGvs2stU2C18cZY3qV2PssNGTkkyz5eiX79h3gqs63cPftt4amCLrfcC3vf/Ips//1FnHxcSR6PIx5ZBjGGHbu3svDf3kan9+P9Vs6XNmKzJaXAjBq6CDue/hxjMuQmpLMYw/dF8shlio+n4977h3O22+9RpzLxdRps1mz5ltGjRzM0mUrWLDgA9q0uYLHH3sIi2XRoiwGDnoYgK5dO9Gq1aWkZ5SnZ89uANze9z5WrFjN4PvvouO17XC5XEycOJ1PFn4GQP0L6/HyS3/H5/exdu239Os/OGZjL20SugzAVas+JimFMveOp3DhG4GAB7zLPiK+fnPiG7fC+n3gLSB/7vjQviatAiY1Hf/WtUfv0PrJf3MyiV3vxVo/5B0mf/6kaA8rKkpc6vmL79SYbGAsgUUrA4BzbfCBjDErrbWNT3UfZ9NSTynqTF/qKSU7m5Z6yolOy1LPYxljGgENgMQj26y104vp/iKQErw+DagA7DTGVCHwITEREYmhcJd6jgQyCYT/28A1wKfAScO/uM8HWGt/NMZ88osqFRGR0ybcN3xvBK4CfrTW9gEuBk74mucw6YNjIiIxFu60T5611m+M8RpjUoGfKOHNXmPMyuKagGKXeoqISHScMvxNYL3bSmNMOQJz+cuAQ8DiEnarDHQAjv9ctAH0baAiIjF2yvC31lpjTBNr7T7gBWPMu0Cqtba4s3uABUCytfaEN3eNMQt/cbUiInJahDvtk2WMaW6tXWKt3Xqqztba20touync4kREJDLCDf+2wB3GmO+AwwSmb2w46/VFROTME274XxPRKkREJKrC/TOO30W6EBERiZ5w1/mLiEgpovAXEXEghb+IiAMp/EVEH
EjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQeKj3UBxfL7Yl2BiEippTN/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBwoPtYFlAbDn/gb//18Cenl0/j39AkntC/+eiWDHvoL1apWBqBd6yu4q08P8vML6DVwKAUFhfh8fn6b2ZI/3n4zAA8/Po6lK74huWwSAI//6T4urFc3eoMq5Tq0z2Ts2EeJc7l46eWZPDXm+SLtNWtWY/KksVSomM7ePfvo2XsQ27fncPHFDXl+/BOkpCbj8/l44snxzJkzH4Ar2/6GJ58cjsvl4vChw9zW9z42bdrKvff057bbeuD1etm1cw99+9/Ptm3bYzHsUsfTqR/x5zfFHj5A7gvDTmiPO/8SPG1vxFoLfh8F783A//23uGo3wNP+llA/V4Wq5M99Dt/6ZbjqNMTTrgcYFxTkkT9vInbvjmgOKyqMtTbWNZxU4U8bzszCTmLp8m9IKpPInx4fW2z4T535LyY8NbLIdmstubl5JCWVodDrpefdDzLsnv5c3PBCHn58HG2uaE77tr+J1jBOmzLVM2NdQolcLhdrVy/i6o49yM7OIeuLt7nl1rtZu3ZDqM+smRN56+0PmTFjDm0zW9KrV3d69xlEvXp1sdayceMWqlatzOKsd2jUOJP9+w+wZvUiuvy+D+vWbeTOO3rRvHkTbu97H5ltruDLxV+Rm5vHHf170qbN5dx0810x/AmUbP/wzFiXEDZXzQuhII+EzneeNPxxJ0BhPgCmUg0SbxxE7oQhRfskliVp4Fh+HjcQvAWUGfA0ebPHYnf9QHyzdrjOOZeC+ROjMJrTo+yIV004/TTtcxo0a9KItNSU/3k/YwxJSWUA8Hq9eL0+DGEdN/kVWjRvyqZNW9myZRuFhYW8/vo8ruvUoUif+vXr8fHHnwLwycLPuK5TewA2bNjMxo1bAMjJ2cFPO3dTsWIGEHgyT00J/D9IS0shJydwtrjwP5+Tm5sHwJeLl1G9WtXID9Ih/NvWYXMPFd8hGPwAxpMAJznZjW/QAt/GFeAtCGywFpMQ+L00CUnYQ3tPa81nCk37RMmK1evo0vuPVKqQweABt3FenVoA+Hw+uvW9l23bc+hxw7U0bnhBaJ9nX5zBP6bO4rJLLua+O3vj8bhjVX6pck61Knyf/UPodvb2HFo0b1qkz8qVa+hyQ0fGPzeFzp2vITU1hfT08uzZczQImjdrgsfjZtOmrQDcccdg3pw/g9zcPA4cPEjL33Q64bH79O7Bu+99EpmByUnFXdAMz1XdMWVTyZs55oT2+IaXU5j1Tuh2/oLJJPYYgvUWQn4uuVNGnrBPaRCRM39jTJox5kljzDpjzO7gZW1wW7lIPOaZrMH55/HBnJf459TnuOn3v2PQn/4SaouLi2Puy+P5aO5UVq39lg2btwJw7x29ePPVF5j94jj2HzzIlFffiFH1pY8xJ766On7688Ghj9G69WUsWfwerVtdRnZ2Dl6vN9RepUolpk59lr597w/te889/eh03a3UrtuMadNm8/SYoqFx001daHbJxTz9zD8iMCopjm/9UnInDCFv9jg8mV2LtJnkcrgq1cC3aWVom/vSa8ibOYbcvw3Eu/w/eNrfHO2SoyJS0z6vA3uBTGtthrU2A2gb3DanuJ2MMf2NMUuNMUsnT58VodKiL7lsUmh6p/XlzfF6fezdt79In9SUZJo3vYhPv/wKgIoV0jHG4PG46dyxHavWfhv1ukur7dk51Kh+Tuh29WpVQ1M0R+Tk7KBrt340b9GBP48YDcCBAwcBSElJZv686YwY+RRfLg4crwoV0ml8UQMWL/kagNfnzOfyy5uF7u+qK1vx0LBBdO7Sm4KCgoiOT07Ov20dpnwlKJMc2hbX4FK865aC3xfYkJSCq3JN/Ns3AeBdnUVcjfNjUW7ERSr8a1trR1trfzyywVr7o7V2NFCzuJ2stZOstc2stc369vxDhEqLvl2794bODletWY/fbymXlsqevfs5cDAwX5mXn0/W0uXUqVkdgJ279gCBM9KPF2VRr26t2BRfCi1ZupzzzqtD7do1cLvddOt2PW8ueL9In4yM8qFXCMOGDmTqtMDJiNvtZu6cKbzyyhvMnbsg1H/v3v2kpaVSL7giq91VrVm3LvAGcpMmDZnw/JPc0KUPO3fujsYQJciUrxy67qpSG+Li4Zj3COIbXYF39RdHd8g9jElMwqRXASCubiP8u0rnyqxIzfl/Z4x5EJhmrd0BYIypDPQGvo/QY8bMkFFPseTrVezbf4CruvTi7ttuDk0RdO/ckfcXfsrsf79DXJyLxIQExox6EGMMO3fv4eG/jsPn82Otnw5tW5HZsgUAQx97mr379mOt5YLz6jJy8IBYDrFU8fl83HPvcN5+6zXiXC6mTpvNmjXfMmrkYJYuW8GCBR/Qps0VPP7YQ1gsixZlMXDQwwB07dqJVq0uJT2jPD17dgPg9r73sWLFau64awivz56E32/Zt3cfffs/AMDoJ/5McnJZZs0MrBj5/vvt3NClT2wGX8okdBmAq1Z9TFIKZe4dT+HCNwIBD3iXfUR8/ebEN26F9fvAW0D+3PGhfU1aBUxqOv6ta4/eofWT/+ZkErvei7V+yDtM/vxJ0R5WVERkqacxpjwwDLgeqAxYYAcwHxhtrd1zqvs4m5Z6SlFn+lJPKdnZtNRTThTuUs+InPlba/caY14GPgCyrLWh11nGmKuBdyPxuCIiEp5IrfYZBMwD/gh8Y4y5/pjmv0biMUVEJHyRmvPvB1xirT1kjKkNvGGMqW2t/TvoU0wiIrEWqfCPOzLVY63daozJJPAEUAuFv4hIzEVqqeePxpgmR24Enwh+B1QALorQY4qISJgiFf49gR+P3WCt9VprewKtI/SYIiISpkit9skuoe2zSDymiIiET9/qKSLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQMZaG+saHMkY099aOynWdcgvo+N39tKxC9CZf+z0j3UB8qvo+J29dOxQ+IuIOJLCX0TEgRT+seP4OceznI7f2UvHDr3hKyLiSDrzFxFxIIX/aWCMudoYs94Ys9EYM+wk7QnGmNnB9i+NMbWPaXsouH29MabDMdtfMsb8ZIz5JjqjkOOF
cVxbG2O+MsZ4jTE3xqJGOblT/f6YgGeDx3alMeb/ol1jrCn8fyVjTBzwPHAN0ADoYYxpcFy324G91trzgHHA6OC+DYA/AA2Bq4EJwfsDmBrcJjEQ5nHdBvQGXotudRKGqZT8+3MNUC946Q/8Iwo1nVEU/r9eC2CjtXaztbYAmAVcf1yf64FpwetvAFcZY0xw+yxrbb61dguwMXh/WGv/C+yJxgDkpE55XK21W621KwF/LAqU4oXx+3M9MN0GZAHljDFVo1PdmUHh/+tVA74/5nZ2cNtJ+1hrvcB+ICPMfSU2dGxKN8cfX4X/r2dOsu34JVTF9QlnX4kNHZvSzfHHV+H/62UDNY65XR34obg+xph4II3AS9Jw9pXY0LEp3Rx/fBX+v94SoJ4xpo4xxkPgDdz5x/WZD/QKXr8R+NgGPmAxH/hDcDVQHQJvPi2OUt1SsnCOq5xhr+LzAAADzklEQVS95gM9g6t+LgP2W2tzYl1UNMXHuoCznbXWa4z5I/AeEAe8ZK1dbYx5FFhqrZ0PTAFmGGM2Ejjj/0Nw39XGmNeBNYAXGGCt9QEYY2YCmUAFY0w2MNJaOyXKw3OscI6rMaY58C+gPNDJGPOItbZhDMuWoJP9/gBuAGvtC8DbQEcCiyx+BvrEptLY0Sd8RUQcSNM+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/kTOQMaa3MeacWNchpZfCX84awQ/knDH/Z4Of1o6U3sD/FP4RrkdKGa3zlzNa8G8fvAN8AlwOdAYuAB4BEoBNQB9r7SFjTEdgLLAL+Aqoa6393Snu+13gS6Ap8C3Q01r7szFmBNAJKAN8DtxhrbXGmIXB2y0JfEr0W2A44AF2Azdba3cYY0YBdYCqwPnA/cBlBL5KeDvQyVpbaIy5JFhzcrDu3sH7nhrslxscd4Pj+1lrc46vx1r7zP/y8xUHs9bqossZewFqE/jK5MuCtysA/wXKBm8PBUYAiQS+pbFOcPtMYEEY922BlsHbLwGDg9fTj+k3g0BYAywEJhzTVp6jJ1F9gWeC10cBnxL4VOnFBD5Fek2w7V8EnsTcBIK7YnB7dwKfJD7yOM2C10/Vb0JJ49RFl5Nd9DJRzgbf2cB3rkPg7LkB8FngTyLgAb4ALgQ228DfRYBA+PcP476/t9Z+Frz+CjAIeBpoa4x5EEgC0oHVwJvBfrOP2b86MDv4XfAeYMsxbe/YwNn9KgJfEfFucPsqAk88FwCNgA+CY4kDTvb9MqfqN/sk+4iUSOEvZ4PDx1w3wAfW2h7HdjDGNP2F9338vKc1xiQCEwiceX8fnMJJLKae8cBYG/iun0wCZ/xH5ANYa/3GmEJr7ZHH8hP43TPAamvt5aeo8VT9DhezXaRYZ8ybZyJhygJaGmPOAzDGJBljzgfWAXWP+fvI3cO8v5rGmCOh2oPAVM2RoN9ljEkm8E2sxUkjMDcPR7+5NVzrgYpHHt8Y4zbGHPliuINAShj9RH4Rhb+cVay1Owm8KTrTGLOSwJPBhdbaXOBu4F1jzKfADgJ/MQ1jTDNjzORi7nIt0Ct4X+nAP6y1+4AXCUzP/JvA1zsXZxQwxxiziMAbsf/LWAoIPLGMNsasAJYDVwSbpwIvGGOWE5jmKa6fyC+i1T5Sahhjkm1g1Y8h8MfXN1hrx5XQvzaBN4UbRalEkTOGzvylNOkXPFNeTWA6ZmKM6xE5Y+nMX0TEgXTmLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxoP8HkqdScClQ374AAAAASUVORK5CYII=", "text/plain": [ "
" ] }, - "metadata": {}, - "output_type": "display_data" + "metadata": {} } ], - "source": [ - "fig, ax = plt.subplots()\n", - "sns.heatmap(rmse_df, cbar=False, annot=True, fmt=\".4g\")" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "The calculated RMSE scores can be visualized to comparatively study how model performance is affected by different parameters." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "It can be seen from this visualization that RMSE first decreases and then increases as rank increases, due to overfitting. When the rank equals 20 and the regularization parameter equals 0.1, the model achieves the lowest RMSE score." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.5 Top K recommendation" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### 3.5.1 Top k for all users (items)" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 37, - "metadata": {}, - "outputs": [], "source": [ "dfs_rec = model.recommendForAllUsers(10)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 38, - "metadata": {}, + "source": [ + "dfs_rec.show(10)" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+--------------------+\n", "|UserId| recommendations|\n", @@ -677,36 +681,36 @@ ] } ], - "source": [ - "dfs_rec.show(10)" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### 3.5.2 Top k for a selected set of users (items)" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 39, - "metadata": {}, - "outputs": [], "source": [ "users = dfs_train.select(als.getUserCol()).distinct().limit(3)\n", "\n", "dfs_rec_subset = model.recommendForUserSubset(users, 10)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 40, - "metadata": {}, + "source": [ + "dfs_rec_subset.show(10)" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+--------------------+\n", "|UserId| recommendations|\n", @@ -719,13 +723,10 @@ ] } ], - "source": [ - "dfs_rec_subset.show(10)" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### 3.5.3 Run-time considerations for top-k recommendations\n", "\n", @@ -734,28 +735,28 @@ "* Inner products of user-item pairs are calculated individually instead of leveraging matrix block multiplication features which are available in certain contemporary computing acceleration libraries (e.g., BLAS).\n", "\n", "More details about possible optimizations of the top k recommendations in Spark can be found [here](https://engineeringblog.yelp.com/2018/05/scaling-collaborative-filtering-with-pyspark.html)." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 41, - "metadata": {}, - "outputs": [], "source": [ "# cleanup spark instance\n", "spark.stop()" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## References" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "1. Yehuda Koren, Robert Bell, and Chris Volinsky, \"Matrix Factorization Techniques for Recommender Systems\n", "\", ACM Computer, Vol. 42, Issue 8, pp 30-37, Aug., 2009.\n", @@ -765,14 +766,14 @@ "4. Seaborn. url: https://seaborn.pydata.org/\n", "5. Scaling collaborative filtering with PySpark. 
url: https://engineeringblog.yelp.com/2018/05/scaling-collaborative-filtering-with-pyspark.html\n", "6. Matrix Completion via Alternating Least Square (ALS). url: http://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf" - ] + ], + "metadata": {} } ], "metadata": { "kernelspec": { - "display_name": "Python 3 Spark - local", - "language": "python", - "name": "spark-3-python" + "name": "python3", + "display_name": "Python 3.6.9 64-bit ('.env': venv)" }, "language_info": { "codemirror_mode": { @@ -784,7 +785,10 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.5.5" + "version": "3.6.9" + }, + "interpreter": { + "hash": "7ec2189bea0434770dca7423a25e631e1cca9c4e2b4ff137a82f4dff32ac9607" } }, "nbformat": 4, diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index 31e998bbb6..aaba0a35d1 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -197,18 +197,20 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 10, "source": [ "# top k items to recommend\n", - "TOP_K = 1\n", + "TOP_K = 10\n", "\n", "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = 'mock100'\n", + "MOVIELENS_DATA_SIZE = 'mock10'\n", "\n", "# user, item column names\n", "COL_USER=\"userId\"\n", "COL_ITEM=\"itemID\"\n", - "COL_RATING=\"rating\"" + "COL_RATING=\"rating\"\n", + "COL_TITLE=\"title\"\n", + "COL_GENRE=\"genres\"" ], "outputs": [], "metadata": { @@ -255,13 +257,13 @@ "schema = StructType(\n", " (\n", " StructField(COL_USER, IntegerType()),\n", - " StructField(COL_ITEM, LongType()),\n", + " StructField(COL_ITEM, IntegerType()),\n", " StructField(COL_RATING, FloatType()),\n", - " StructField(\"Timestamp\", LongType()),\n", + " StructField(\"Timestamp\", StringType()),\n", " )\n", ")\n", "\n", - "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=\"title\", genres_col=\"genres\")\n", + "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=COL_TITLE, genres_col=\"genres\")\n", "data.show()" ], "outputs": [ @@ -269,31 +271,20 @@ "output_type": "stream", "name": "stdout", "text": [ - "+------+------+------+--------------------+-----+--------+\n", - "|userID|itemID|rating| timestamp|title| genres|\n", - "+------+------+------+--------------------+-----+--------+\n", - "| 6| 4| 4|2200-06-19 12:21:...| foo|genreA|0|\n", - "| 8| 4| 1|1970-01-01 00:00:...| foo|genreA|0|\n", - "| 8| 4| 4|2109-02-14 15:31:...| foo|genreA|0|\n", - "| 9| 2| 2|1969-12-31 23:59:...| foo|genreA|0|\n", - "| 9| 4| 3|2210-04-25 01:58:...| foo|genreA|0|\n", - "| 3| 5| 3| 1970-01-01 00:00:00| foo|genreA|0|\n", - "| 1| 2| 1|1970-01-01 00:00:...| foo|genreA|0|\n", - "| 8| 3| 2| 1970-01-01 00:00:00| foo|genreA|0|\n", - "| 3| 10| 4|1970-01-01 00:00:...| foo|genreA|0|\n", - "| 7| 10| 2|1969-12-31 23:59:...| foo|genreA|0|\n", - "| 8| 9| 5|1970-01-01 00:00:...| foo|genreA|0|\n", - "| 4| 2| 3|1969-12-31 23:59:...| foo|genreA|0|\n", - "| 5| 8| 5|1970-01-01 00:00:...| foo|genreA|0|\n", - "| 2| 7| 1|1969-12-31 23:59:...| foo|genreA|0|\n", - "| 4| 6| 2| 1970-01-01 00:00:00| foo|genreA|0|\n", - "| 2| 5| 3|1969-12-31 23:59:...| foo|genreA|0|\n", - "| 7| 2| 1|1970-01-01 00:00:...| foo|genreA|0|\n", - "| 8| 4| 5|1969-12-31 23:59:...| foo|genreA|0|\n", - "| 7| 8| 1|1969-12-31 23:59:...| foo|genreA|0|\n", - "| 9| 4| 1|1970-01-01 00:00:...| 
foo|genreA|0|\n", - "+------+------+------+--------------------+-----+--------+\n", - "only showing top 20 rows\n", + "+------+------+------+---------+-----+--------+\n", + "|userID|itemID|rating|timestamp|title| genres|\n", + "+------+------+------+---------+-----+--------+\n", + "| 8| 3| 4.0|2022-2-22| foo|genreA|0|\n", + "| 8| 9| 5.0|2022-2-22| foo|genreA|0|\n", + "| 5| 1| 5.0|2022-2-22| foo|genreA|0|\n", + "| 9| 1| 1.0|2022-2-22| foo|genreA|0|\n", + "| 7| 5| 5.0|2022-2-22| foo|genreA|0|\n", + "| 3| 6| 5.0|2022-2-22| foo|genreA|0|\n", + "| 2| 6| 2.0|2022-2-22| foo|genreA|0|\n", + "| 5| 7| 4.0|2022-2-22| foo|genreA|0|\n", + "| 6| 9| 2.0|2022-2-22| foo|genreA|0|\n", + "| 5| 6| 3.0|2022-2-22| foo|genreA|0|\n", + "+------+------+------+---------+-----+--------+\n", "\n" ] } @@ -320,8 +311,8 @@ "output_type": "stream", "name": "stdout", "text": [ - "N train_df 73\n", - "N test_df 27\n" + "N train_df 6\n", + "N test_df 4\n" ] } ], @@ -401,7 +392,7 @@ "output_type": "stream", "name": "stdout", "text": [ - "Took 2.5952707109972835 seconds for training.\n" + "Took 2.296935658028815 seconds for training.\n" ] } ], @@ -445,8 +436,8 @@ "output_type": "stream", "name": "stdout", "text": [ - "48\n", - "10\n" + "30\n", + "30\n" ] } ], @@ -463,7 +454,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "source": [ "# random recommender\n", "window = Window.partitionBy(COL_USER).orderBy(F.rand())\n", @@ -497,7 +488,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "source": [ "def get_ranking_results(ranking_eval):\n", " metrics = {\n", @@ -524,7 +515,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "source": [ "def generate_summary(data, algo, k, ranking_metrics, diversity_metrics):\n", " summary = {\"Data\": data, \"Algo\": algo, \"K\": k}\n", @@ -552,7 +543,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "source": [ "als_ranking_eval = SparkRankingEvaluation(\n", " test_df, \n", @@ -572,7 +563,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "source": [ "als_diversity_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -588,7 +579,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "source": [ "als_results = generate_summary(MOVIELENS_DATA_SIZE, \"als\", TOP_K, als_ranking_metrics, als_diversity_metrics)" ], @@ -604,7 +595,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "source": [ "random_ranking_eval = SparkRankingEvaluation(\n", " test_df,\n", @@ -623,7 +614,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "source": [ "random_diversity_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -639,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "source": [ "random_results = generate_summary(MOVIELENS_DATA_SIZE, \"random\", TOP_K, random_ranking_metrics, random_diversity_metrics)" ], @@ -655,7 +646,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, "source": [ "cols = [\"Data\", \"Algo\", \"K\", \"Precision@k\", \"Recall@k\", \"NDCG@k\", \"Mean average precision\",\"catalog_coverage\", \"distributional_coverage\",\"novelty\", \"diversity\", \"serendipity\" ]\n", "df_results = pd.DataFrame(columns=cols)\n", @@ -668,96 +659,11 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, "source": [ "df_results" ], - 
"outputs": [ - { - "output_type": "execute_result", - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DataAlgoKPrecision@kRecall@kNDCG@kMean average precisioncatalog_coveragedistributional_coveragenoveltydiversityserendipity
1mock100als10.40.1500000.40.170.41.6854753.624421None0.405009
2mock100random10.30.1166670.30.120.62.4464393.644061None0.396229
\n", - "
" - ], - "text/plain": [ - " Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n", - "1 mock100 als 1 0.4 0.150000 0.4 0.17 \n", - "2 mock100 random 1 0.3 0.116667 0.3 0.12 \n", - "\n", - " catalog_coverage distributional_coverage novelty diversity serendipity \n", - "1 0.4 1.685475 3.624421 None 0.405009 \n", - "2 0.6 2.446439 3.644061 None 0.396229 " - ] - }, - "metadata": {}, - "execution_count": 20 - } - ], + "outputs": [], "metadata": {} }, { @@ -778,14 +684,14 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "source": [ "# Get movie features \"title\" and \"genres\"\n", "movies = (\n", - " data.groupBy(COL_ITEM, \"title\", \"genres\").count()\n", + " data.groupBy(COL_ITEM, COL_TITLE, \"genres\").count()\n", " .na.drop() # remove rows with null values\n", " .withColumn(\"genres\", F.split(F.col(\"genres\"), \"\\|\")) # convert to array of genres\n", - " .withColumn(\"title\", F.regexp_replace(F.col(\"title\"), \"[\\(),:^0-9]\", \"\")) # remove year from title\n", + " .withColumn(COL_TITLE, F.regexp_replace(F.col(COL_TITLE), \"[\\(),:^0-9]\", \"\")) # remove year from title\n", " .drop(\"count\") # remove unused columns\n", ")" ], @@ -794,22 +700,22 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, "source": [ "# tokenize \"title\" column\n", - "title_tokenizer = Tokenizer(inputCol=\"title\", outputCol=\"title_words\")\n", + "title_tokenizer = Tokenizer(inputCol=COL_TITLE, outputCol=\"title_words\")\n", "tokenized_data = title_tokenizer.transform(movies)\n", "\n", "# remove stop words\n", "remover = StopWordsRemover(inputCol=\"title_words\", outputCol=\"text\")\n", - "clean_data = remover.transform(tokenized_data).drop(\"title\", \"title_words\")" + "clean_data = remover.transform(tokenized_data).drop(COL_TITLE, \"title_words\")" ], "outputs": [], "metadata": {} }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, "source": [ "# convert text input into feature vectors\n", "\n", @@ -831,29 +737,7 @@ "\n", "feature_data.show(10, False)" ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+------+---------------------+\n", - "|itemID|features |\n", - "+------+---------------------+\n", - "|6 |[0.0,1.0,0.0,1.0,1.0]|\n", - "|2 |[0.0,1.0,0.0,1.0,1.0]|\n", - "|5 |[0.0,1.0,0.0,1.0,1.0]|\n", - "|7 |[0.0,1.0,0.0,1.0,1.0]|\n", - "|1 |[0.0,1.0,0.0,1.0,1.0]|\n", - "|4 |[0.0,1.0,0.0,1.0,1.0]|\n", - "|3 |[0.0,1.0,0.0,1.0,1.0]|\n", - "|10 |[0.0,1.0,0.0,1.0,1.0]|\n", - "|8 |[0.0,1.0,0.0,1.0,1.0]|\n", - "|9 |[0.0,1.0,0.0,1.0,1.0]|\n", - "+------+---------------------+\n", - "\n" - ] - } - ], + "outputs": [], "metadata": {} }, { @@ -914,35 +798,6 @@ "outputs": [], "metadata": {} }, - { - "cell_type": "code", - "execution_count": 27, - "source": [ - "import cProfile, pstats, io\n", - "\n", - "pr = cProfile.Profile()\n", - "pr.enable()\n", - "# ... 
do something ...\n",
- "als_eval = SparkDiversityEvaluation(\n",
- "    train_df = train_df, \n",
- "    reco_df = top_k_reco,\n",
- "    item_feature_df = feature_data, \n",
- "    item_sim_measure=\"item_feature_vector\",\n",
- "    col_user = COL_USER, \n",
- "    col_item = COL_ITEM\n",
- ")\n",
- "als_diversity=als_eval.diversity()\n",
- "als_serendipity=als_eval.serendipity()\n",
- "\n",
- "pr.disable()\n",
- "s = io.StringIO()\n",
- "ps = pstats.Stats(pr, stream=s).sort_stats(\"cumulative\")\n",
- "ps.print_stats()\n",
- "print(s.getvalue())"
- ],
- "outputs": [],
- "metadata": {}
- },
 {
 "cell_type": "markdown",
 "source": [
diff --git a/examples/04_model_select_and_optimize/tuning_spark_als.ipynb b/examples/04_model_select_and_optimize/tuning_spark_als.ipynb
index 0d8cf261ea..e0d839412c 100644
--- a/examples/04_model_select_and_optimize/tuning_spark_als.ipynb
+++ b/examples/04_model_select_and_optimize/tuning_spark_als.ipynb
@@ -2,23 +2,22 @@
 "cells": [
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Copyright (c) Microsoft Corporation. All rights reserved.\n",
 "\n",
 "Licensed under the MIT License."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "# Hyperparameter tuning (Spark-based recommender)"
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Hyperparameter tuning for a Spark-based recommender algorithm is important for selecting a model with optimal performance. This notebook introduces good practices in performing hyperparameter tuning for building recommender models with the utility functions provided in the [Microsoft/Recommenders](https://github.com/Microsoft/Recommenders.git) repository.\n",
 "\n",
@@ -26,31 +25,19 @@
 "* Spark native/custom constructs (`ParamGridBuilder`, `TrainValidationSplit`).\n",
 "* The `hyperopt` package with the Tree of Parzen Estimators (TPE) algorithm (a minimal sketch follows this list). \n",
 "* Brute-force random search of parameter values sampled from a pre-defined space. 
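As a reference for the TPE option named in the list above, here is a minimal `hyperopt` sketch. It is an illustration under assumptions: `train_and_score_als` is a hypothetical helper that would fit Spark ALS with the sampled parameters and return a validation RMSE.

```python
from hyperopt import STATUS_OK, fmin, hp, tpe

# Search space mirroring the RANK/REG grids used later in this notebook.
space = {
    "rank": hp.choice("rank", [10, 15, 20, 30, 40]),
    "regParam": hp.loguniform("regParam", -11.5, -2.3),  # roughly 1e-5 to 1e-1
}

def objective(params):
    # train_and_score_als is a hypothetical helper: fit ALS on the training
    # split with `params` and return RMSE on the validation split.
    rmse = train_and_score_als(**params)
    return {"loss": rmse, "status": STATUS_OK}

best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=25)
print(best)
```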
" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 0 Global settings and import" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.5.5 |Anaconda custom (64-bit)| (default, May 13 2018, 21:12:35) \n", - "[GCC 7.2.0]\n", - "Pandas version: 0.23.0\n", - "PySpark version: 2.3.1\n" - ] - } - ], "source": [ "# set the environment path to find Recommenders\n", "%matplotlib notebook\n", @@ -58,7 +45,6 @@ "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import sys\n", - "import pandas as pd\n", "import numpy as np\n", "\n", @@ -90,18 +76,27 @@ "print(\"System version: {}\".format(sys.version))\n", "print(\"Pandas version: {}\".format(pd.__version__))\n", "print(\"PySpark version: {}\".format(pyspark.__version__))" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "System version: 3.5.5 |Anaconda custom (64-bit)| (default, May 13 2018, 21:12:35) \n", + "[GCC 7.2.0]\n", + "Pandas version: 0.23.0\n", + "PySpark version: 2.3.1\n" + ] + } + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], "source": [ + "MOVIELENS_DATA_SIZE = \"100k\"\n", + "\n", "NUMBER_CORES = 1\n", "NUMBER_ITERATIONS = 25\n", "\n", @@ -128,138 +123,142 @@ "\n", "RANK = [10, 15, 20, 30, 40]\n", "REG = [ 0.1, 0.01, 0.001, 0.0001, 0.00001]" - ] + ], + "outputs": [], + "metadata": { + "tags": [ + "parameters" + ] + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 1 Data preparation" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "A Spark session is created. Note in this case, to study the running time for different approaches, the Spark session in local mode uses only one core for running. This eliminates the impact of parallelization of parameter tuning. " - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 3, - "metadata": {}, - "outputs": [], "source": [ "spark = start_or_get_spark(url=\"local[{}]\".format(NUMBER_CORES))" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "MovieLens 100k dataset is used for running the demonstration." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 4, - "metadata": {}, + "source": [ + "data = load_spark_df(spark, size=MOVIELENS_DATA_SIZE, header=(COL_USER, COL_ITEM, COL_RATING))" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "100%|██████████| 4.81k/4.81k [00:01<00:00, 2.47kKB/s]\n" ] } ], - "source": [ - "data = load_spark_df(spark, size='100k', header=(COL_USER, COL_ITEM, COL_RATING))" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "To reduce time spent on the comparitive study, 50% of the data is used for the experimentation below." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 5, - "metadata": {}, - "outputs": [], "source": [ "data, _ = spark_random_split(data, ratio=SUBSET_RATIO)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "The dataset is split into 3 subsets randomly with a given split ratio. The hyperparameter tuning is performed on the training and the validating data, and then the optimal recommender selected is evaluated on the testing dataset." 
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 6,
- "metadata": {},
- "outputs": [],
 "source": [
 "train, valid, test = spark_random_split(data, ratio=[3, 1, 1])"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "## 2 Hyperparameter tuning with Azure Machine Learning Services"
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "The `hyperdrive` module in the [Azure Machine Learning Services](https://azure.microsoft.com/en-us/services/machine-learning-service/) runs [hyperparameter tuning and optimizing for machine learning model selection](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters). At the moment, the service supports running hyperparameter tuning on heterogeneous computing targets such as clusters of commodity compute nodes with or without GPU devices (see the detailed documentation [here](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets)). It is feasible to run parameter tuning on a cluster of VM nodes. In this case, the service containerizes an individual, independent Spark session on each node of the cluster to run the parameter tuning job in parallel, instead of inside a single Spark session where the training is executed in a distributed manner. \n",
 "\n",
 "Detailed instructions for tuning hyperparameters of non-Spark workloads with Azure Machine Learning Services can be found in [this](./hypertune_aml_wide_and_deep_quickstart.ipynb) notebook. "
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "## 3 Hyperparameter tuning with Spark ML constructs"
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "### 3.1 Spark native construct"
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Spark MLlib implements modules such as `CrossValidator` and `TrainValidationSplit` for tuning hyperparameters (see [here](https://spark.apache.org/docs/2.2.0/ml-tuning.html)). However, by default, it does not support custom machine learning algorithms, data splitting methods, and evaluation metrics such as those offered as utility functions in the Recommenders repository. \n",
 "\n",
 "For example, the Spark native construct can be used for tuning a recommender against the `rmse` metric, which is one of the available regression metrics in Spark."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Firstly, a Spark ALS object needs to be created. In this case, for illustration purposes, it is an ALS model object."
- ]
+ ],
+ "metadata": {}
 },
 {
 "cell_type": "code",
 "execution_count": 7,
- "metadata": {},
- "outputs": [],
 "source": [
 "# NOTE the parameters of interest, rank and regParam, are left unset, \n",
 "# because their values will be assigned in the parameter grid builder.\n",
 "als = ALS(\n",
 "    maxIter=MAX_ITER,\n",
 "    seed=SEED,\n",
 "    coldStartStrategy=\"drop\",\n",
 "    nonnegative=False,\n",
 "    **HEADER_ALS\n",
 ")"
- ]
+ ],
+ "outputs": [],
+ "metadata": {}
 },
 {
 "cell_type": "markdown",
- "metadata": {},
 "source": [
 "Then, a parameter grid can be defined as follows. Without loss of generality, only `rank` and `regParam` are considered."
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 8, - "metadata": {}, - "outputs": [], "source": [ "paramGrid = ParamGridBuilder() \\\n", " .addGrid(als.rank, RANK) \\\n", " .addGrid(als.regParam, REG) \\\n", " .build()" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Given the settings above, a `TrainValidationSplit` constructor can be created for fitting the best model in the given parameter range. In this case, the `RegressionEvaluator` is using `RMSE`, by default, as an evaluation metric. \n", "\n", "Since the data splitter is embedded in the `TrainValidationSplit` object, to make sure the splitting ratio is consistent across different approaches, the split ratio is set to be 0.75 and in the model training the training dataset and validating dataset are combined. " - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 9, - "metadata": {}, - "outputs": [], "source": [ "tvs = TrainValidationSplit(\n", " estimator=als,\n", @@ -317,36 +316,44 @@ " # are therefore not available here. \n", " trainRatio=0.75\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 10, - "metadata": {}, - "outputs": [], "source": [ "with Timer() as time_spark:\n", " # Run TrainValidationSplit, and choose the best set of parameters.\n", " # NOTE train and valid is union because in Spark TrainValidationSplit does splitting by itself.\n", " model = tvs.fit(train.union(valid))\n", "\n" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "The model parameters in the grid and the best metrics can be then returned. " - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 11, - "metadata": {}, + "source": [ + "for idx, item in enumerate(model.getEstimatorParamMaps()):\n", + " print('Run {}:'.format(idx))\n", + " print('\\tValidation Metric: {}'.format(model.validationMetrics[idx]))\n", + " for key, value in item.items():\n", + " print('\\t{0}: {1}'.format(repr(key), value))" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Run 0:\n", "\tValidation Metric: 1.0505385750367227\n", @@ -451,20 +458,17 @@ ] } ], - "source": [ - "for idx, item in enumerate(model.getEstimatorParamMaps()):\n", - " print('Run {}:'.format(idx))\n", - " print('\\tValidation Metric: {}'.format(model.validationMetrics[idx]))\n", - " for key, value in item.items():\n", - " print('\\t{0}: {1}'.format(repr(key), value))" - ] + "metadata": {} }, { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "source": [ + "model.validationMetrics" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "[1.0505385750367227,\n", @@ -494,54 +498,49 @@ " 4.426604995574413]" ] }, - "execution_count": 12, "metadata": {}, - "output_type": "execute_result" + "execution_count": 12 } ], - "source": [ - "model.validationMetrics" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "To get the best model, just do" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 13, - "metadata": {}, - "outputs": [], "source": [ "model_best_spark = model.bestModel" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.2 Custom `Estimator`, `Transformer`, and `Evaluator` for Spark ALS\n", "\n", "One can also customize Spark modules to allow tuning hyperparameters for a desired model and evaluation metric, 
given that the native Spark ALS does not allow tuning hyperparameters for ranking metrics such as precision@k, recall@k, etc. This can be done by creating custom `Estimator`, `Transformer` and `Evaluator`. The benefit is that, after the customization, the tuning process can make use of `trainValidSplit` directly, which distributes the tuning in a Spark session." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Customized `Estimator` and `Transformer` for top k recommender based on Spark ALS\n", "\n", "The following shows how to implement a PySpark `Estimator` and `Transfomer` for recommending top k items from ALS model. The latter generates top k recommendations from the model object. Both of the two are designed by following the protocol of Spark APIs, to make sure that they can be run with the hyperparameter tuning constructs in Spark." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 14, - "metadata": {}, - "outputs": [], "source": [ "class ALSTopK(\n", " ALS,\n", @@ -696,22 +695,22 @@ " )\n", " \n", " return topk_recommendation_all.select(self.userCol, labelCol, predictionCol)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### Customized precision@k evaluation metric\n", "\n", "In addition to the custom `Estimator` and `Transformer`, it may also be desired to customize an `Evaluator` to allow \"beyond-rating\" metrics. The codes as following illustrates a precision@k evaluator. Other types of evaluators can be developed in a similar way." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 15, - "metadata": {}, - "outputs": [], "source": [ "# Define a custom Evaulator. Here precision@k is used.\n", "class PrecisionAtKEvaluator(Evaluator):\n", @@ -733,20 +732,20 @@ "\n", " def isLargerBetter(self):\n", " return True" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Then a new ALS top-k recommender can be created, and the Spark native construct, `TrainValidationSplit` module, can be used to find the optimal model w.r.t the precision@k metric." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 16, - "metadata": {}, - "outputs": [], "source": [ "alstopk = ALSTopK(\n", " userCol=COL_USER,\n", @@ -771,14 +770,23 @@ " # are therefore not available here. 
\n", " trainRatio=0.75\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 17, - "metadata": {}, + "source": [ + "# Run TrainValidationSplit, and choose the best set of parameters.\n", + "# NOTE train and valid is union because in Spark TrainValidationSplit does splitting by itself.\n", + "model_precision = tvs.fit(train.union(valid))\n", + "\n", + "model_precision.getEstimatorParamMaps()" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "[{Param(parent='ALSTopK_4f48b7cc6cf2badfcea7', name='rank', doc='rank of the factorization'): 10,\n", @@ -791,24 +799,15 @@ " Param(parent='ALSTopK_4f48b7cc6cf2badfcea7', name='regParam', doc='regularization parameter (>= 0).'): 0.01}]" ] }, - "execution_count": 17, "metadata": {}, - "output_type": "execute_result" + "execution_count": 17 } ], - "source": [ - "# Run TrainValidationSplit, and choose the best set of parameters.\n", - "# NOTE train and valid is union because in Spark TrainValidationSplit does splitting by itself.\n", - "model_precision = tvs.fit(train.union(valid))\n", - "\n", - "model_precision.getEstimatorParamMaps()" - ] + "metadata": {} }, { "cell_type": "code", "execution_count": 18, - "metadata": {}, - "outputs": [], "source": [ "def best_param(model, is_larger_better=True):\n", " if is_larger_better:\n", @@ -819,25 +818,35 @@ " parameters = model.getEstimatorParamMaps()[model.validationMetrics.index(best_metric)]\n", " \n", " return list(parameters.values())" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 19, - "metadata": {}, - "outputs": [], "source": [ "params = best_param(model_precision)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 20, - "metadata": {}, + "source": [ + "model_precision.bestModel.transform(valid).limit(5).show()\n", + "\n", + "for idx, item in enumerate(model_precision.getEstimatorParamMaps()):\n", + " print('Run {}:'.format(idx))\n", + " print('\\tValidation Metric: {}'.format(model_precision.validationMetrics[idx]))\n", + " for key, value in item.items():\n", + " print('\\t{0}: {1}'.format(repr(key), value))" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+--------------------+--------------------+\n", "|userID| label| prediction|\n", @@ -868,39 +877,29 @@ ] } ], - "source": [ - "model_precision.bestModel.transform(valid).limit(5).show()\n", - "\n", - "for idx, item in enumerate(model_precision.getEstimatorParamMaps()):\n", - " print('Run {}:'.format(idx))\n", - " print('\\tValidation Metric: {}'.format(model_precision.validationMetrics[idx]))\n", - " for key, value in item.items():\n", - " print('\\t{0}: {1}'.format(repr(key), value))" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 4 Hyperparameter tuning with `hyperopt`" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "`hyperopt` is an open source Python package that is designed for tuning parameters for generic function with any pre-defined loss. More information about `hyperopt` can be found [here](https://github.com/hyperopt/hyperopt). `hyperopt` supports parallelization on MongoDB but not Spark. In our case, the tuning is performed in a sequential mode on a local computer.\n", "\n", "In `hyperopt`, an *objective* function is defined for optimizing the hyper parameters. 
In this case, the objective is similar to that in the Spark native construct situation, which is *to the RMSE metric for an ALS recommender*. Parameters of `rank` and `regParam` are used as hyperparameters. \n", "\n", "The objective function shown below demonstrates a RMSE loss for an ALS recommender. " - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 21, - "metadata": {}, - "outputs": [], "source": [ "# Customize an objective function\n", "def objective(params):\n", @@ -946,11 +945,12 @@ " 'status': STATUS_OK,\n", " 'eval_time': time_run_start.interval\n", " }" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "A search space is usually defined for hyperparameter exploration. Design of search space is empirical, and depends on the understanding of how distribution of parameter of interest affects the model performance measured by the loss function. \n", "\n", @@ -959,13 +959,12 @@ "* The reg parameter prevents overfitting in certain way. \n", "\n", "Therefore, in this case, a uniform distribution and a lognormal distribution sampling spaces are used for rank and reg, respectively. A narrow search space is used for illustration purpose, that is, the range of rank is from 10 to 20, while that of reg is from $e^{-5}$ to $e^{-1}$. Together with the randomly sampled hyper parameters, other parameters use for building / evaluating the recommender, like `k`, column names, data, etc., are kept as constants." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 22, - "metadata": {}, - "outputs": [], "source": [ "# define a search space\n", "space = {\n", @@ -980,31 +979,31 @@ " 'k': 10,\n", " 'relevancy_method': \"top_k\"\n", "}" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 4.1 Hyperparameter tuning with TPE" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "`fmin` of `hyperopt` is used for running the trials for searching optimal hyper parameters. In `hyperopt`, there are different strategies for intelligently optimize hyper parameters. For example, `hyperopt` avails [Tree of Parzen Estimators (TPE) method](https://papers.nips.cc/paper/4443-algorithms-for-hyper-parameter-optimization.pdf) for searching optimal parameters. \n", "\n", "The TPE method models a surface response of $p(x|y)$ by transforming a generative process, replacing the distributions of the configuration prior with non-parametric densities, where $p$ is the probability of configuration space $x$ given the loss $y$. For different configuration space, the TPE method does different replacements. That is, uniform $\\to$ truncated Gaussian mixture, log-uniform $\\to$ exponentiated truncated Gaussian mixture, categorical $\\to$ re-weighted categorical, etc. Using different observations ${x(1), ..., x(k)}$ in the non-parametric densities, these substitutions represent a learning algorithm that can produce a variety of densities over the configuration space $X$. By maintaining sorted lists of observed variables in $H$, the runtime of each iteration of the TPE algorithm can scale linearly in $|H|$ and linearly in the number of variables (dimensions) being optimized. In a nutshell, the algorithm recognizes the irrelevant variables in the configuration space, and thus reduces iterations in searching for the optimal ones. 
Details of the TPE algorithm can be found in the reference paper.\n", "\n", "The following runs the trials with the pre-defined objective function and search space. TPE is used as the optimization method. Totally there will be 10 evaluations run for searching the best parameters." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 23, - "metadata": {}, - "outputs": [], "source": [ "with Timer() as time_hyperopt:\n", " # Trials for recording each iteration of the hyperparameter searching.\n", @@ -1018,14 +1017,19 @@ " max_evals=NUMBER_ITERATIONS\n", " )\n", " \n" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 24, - "metadata": {}, + "source": [ + "trials.best_trial" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "{'book_time': datetime.datetime(2019, 7, 17, 12, 28, 19, 108000),\n", @@ -1046,801 +1050,41 @@ " 'version': 0}" ] }, - "execution_count": 24, "metadata": {}, - "output_type": "execute_result" + "execution_count": 24 } ], - "source": [ - "trials.best_trial" - ] + "metadata": {} }, { "cell_type": "code", "execution_count": 25, - "metadata": {}, + "source": [ + "parameters = ['rank', 'reg']\n", + "cols = len(parameters)\n", + "f, axes = plt.subplots(nrows=1, ncols=cols, figsize=(15,5))\n", + "cmap = plt.cm.jet\n", + "for i, val in enumerate(parameters):\n", + " xs = np.array([t['misc']['vals'][val] for t in trials.trials]).ravel()\n", + " ys = [t['result']['loss'] for t in trials.trials]\n", + " xs, ys = zip(*sorted(zip(xs, ys)))\n", + " ys = np.array(ys)\n", + " axes[i].scatter(xs, ys, s=20, linewidth=0.01, alpha=0.75, c=cmap(float(i)/len(parameters)))\n", + " axes[i].set_title(val)" + ], "outputs": [ { + "output_type": "display_data", "data": { - "application/javascript": [ - "/* Put everything inside the global mpl namespace */\n", - "window.mpl = {};\n", - "\n", - "\n", - "mpl.get_websocket_type = function() {\n", - " if (typeof(WebSocket) !== 'undefined') {\n", - " return WebSocket;\n", - " } else if (typeof(MozWebSocket) !== 'undefined') {\n", - " return MozWebSocket;\n", - " } else {\n", - " alert('Your browser does not have WebSocket support.' +\n", - " 'Please try Chrome, Safari or Firefox ≥ 6. ' +\n", - " 'Firefox 4 and 5 are also supported but you ' +\n", - " 'have to enable WebSockets in about:config.');\n", - " };\n", - "}\n", - "\n", - "mpl.figure = function(figure_id, websocket, ondownload, parent_element) {\n", - " this.id = figure_id;\n", - "\n", - " this.ws = websocket;\n", - "\n", - " this.supports_binary = (this.ws.binaryType != undefined);\n", - "\n", - " if (!this.supports_binary) {\n", - " var warnings = document.getElementById(\"mpl-warnings\");\n", - " if (warnings) {\n", - " warnings.style.display = 'block';\n", - " warnings.textContent = (\n", - " \"This browser does not support binary websocket messages. \" +\n", - " \"Performance may be slow.\");\n", - " }\n", - " }\n", - "\n", - " this.imageObj = new Image();\n", - "\n", - " this.context = undefined;\n", - " this.message = undefined;\n", - " this.canvas = undefined;\n", - " this.rubberband_canvas = undefined;\n", - " this.rubberband_context = undefined;\n", - " this.format_dropdown = undefined;\n", - "\n", - " this.image_mode = 'full';\n", - "\n", - " this.root = $('
');\n",
    "    /* ... several hundred lines of matplotlib nbagg (interactive figure) widget JavaScript omitted ... */\n"
    ],
    "text/plain": [
     ""
    ]
   },
-   "metadata": {},
-   "output_type": "display_data"
+   "metadata": {}
  },
  {
+   "output_type": "display_data",
   "data": {
    "text/html": [
     ""
    ],
    "text/plain": [
     ""
    ]
   },
-   "metadata": {},
-   "output_type": "display_data"
+   "metadata": {}
  }
 ],
-   "source": [
-    "parameters = ['rank', 'reg']\n",
-    "cols = len(parameters)\n",
-    "f, axes = plt.subplots(nrows=1, ncols=cols, figsize=(15,5))\n",
-    "cmap = plt.cm.jet\n",
-    "for i, val in enumerate(parameters):\n",
-    "    xs = np.array([t['misc']['vals'][val] for t in trials.trials]).ravel()\n",
-    "    ys = [t['result']['loss'] for t in trials.trials]\n",
-    "    xs, ys = zip(*sorted(zip(xs, ys)))\n",
-    "    ys = np.array(ys)\n",
-    "    axes[i].scatter(xs, ys, s=20, linewidth=0.01, alpha=0.75, c=cmap(float(i)/len(parameters)))\n",
-    "    axes[i].set_title(val)"
-   ]
+   "metadata": {}
  },
  {
   "cell_type": "markdown",
-   "metadata": {},
   "source": [
    "It can be seen from the above plot that\n",
    "* The actual impact of rank is in line with the intuition - the smaller the value the better the result.\n",
    "* It is interesting to see that the optimal value of reg is around 0.1 to 0.15. 
" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Get the best model." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 26, - "metadata": {}, - "outputs": [], "source": [ "als = ALS(\n", " rank=best[\"rank\"],\n", @@ -1902,20 +1131,20 @@ ")\n", " \n", "model_best_hyperopt = als.fit(train)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Tuning prameters against other metrics can be simply done by modifying the `objective` function. The following shows an objective function of how to tune \"precision@k\". Since `fmin` in `hyperopt` only supports minimization while the actual objective of the loss is to maximize \"precision@k\", `-precision` instead of `precision` is used in the returned value of the `objective` function." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 27, - "metadata": {}, - "outputs": [], "source": [ "# Customize an objective function\n", "def objective_precision(params):\n", @@ -1988,29 +1217,29 @@ " 'status': STATUS_OK,\n", " 'eval_time': time_run_start.interval\n", " }" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 4.2 Hyperparameter tuning with `hyperopt` sampling methods" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Though `hyperopt` works well in a single node machine, its features (e.g., `Trials` module) do not support Spark environment, which makes it hard to perform the tuning tasks in a distributed/parallel manner. It is useful to use `hyperopt` for sampling parameter values from the pre-defined sampling space, and then parallelize the model training onto Spark cluster with the sampled parameter combinations.\n", "\n", "The downside of this method is that the intelligent searching algorithm (i.e., TPE) of `hyperopt` cannot be used. The approach introduced here is therefore equivalent to random search." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 28, - "metadata": {}, - "outputs": [], "source": [ "with Timer() as time_sample:\n", " # Sample the parameters used for model building from the pre-defined space. \n", @@ -2018,14 +1247,19 @@ " \n", " # The following runs model building on the sampled parameter values with the pre-defined objective function.\n", " results_map = list(map(lambda x: objective(x), sample_params))\n" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 30, - "metadata": {}, + "source": [ + "results_map" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/plain": [ "[{'eval_time': 9.468051671981812, 'loss': 1.027085217204854, 'status': 'ok'},\n", @@ -2055,46 +1289,41 @@ " {'eval_time': 9.08506464958191, 'loss': 1.254533287299843, 'status': 'ok'}]" ] }, - "execution_count": 30, "metadata": {}, - "output_type": "execute_result" + "execution_count": 30 } ], - "source": [ - "results_map" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Get the best model." 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 31, - "metadata": {}, - "outputs": [], "source": [ "loss_metrics = np.array([x['loss'] for x in results_map])\n", "best_loss = np.where(loss_metrics == min(loss_metrics))" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 32, - "metadata": {}, - "outputs": [], "source": [ "best_param = sample_params[best_loss[0].item()]" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 33, - "metadata": {}, - "outputs": [], "source": [ "als = ALS(\n", " rank=best_param[\"rank\"],\n", @@ -2109,29 +1338,29 @@ ")\n", " \n", "model_best_sample = als.fit(train)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 5 Evaluation on testing data" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "The optimal parameters can then be used for building a recommender, which is then evaluated on the testing data.\n", "\n", "The following codes generate the evaluation results by using the testing dataset with the optimal model selected against the pre-defined loss. Without loss of generity, in this case, the optimal model that performs the best w.r.t regression loss (i.e., the RMSE metric) is used. One can simply use other metrics like precision@k, as illustrated in the above sections, to evaluate the optimal model on the testing dataset." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 34, - "metadata": {}, - "outputs": [], "source": [ "# Get prediction results with the optimal modesl from different approaches.\n", "prediction_spark = model_best_spark.transform(test)\n", @@ -2160,14 +1389,19 @@ " }, index=[0])\n", " \n", " test_evaluations = test_evaluations.append(result)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 35, - "metadata": {}, + "source": [ + "test_evaluations" + ], "outputs": [ { + "output_type": "execute_result", "data": { "text/html": [ "
\n", @@ -2235,62 +1469,58 @@ "0 sample 230.902271 0.287638 0.791199 0.232688 0.988922" ] }, - "execution_count": 35, "metadata": {}, - "output_type": "execute_result" + "execution_count": 35 } ], - "source": [ - "test_evaluations" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "From the results, it can be seen that, *with the same number of iterations*, Spark native construct based approach takes the least amount of time, even if there is no parallel computing. This is simply because Spark native constructs leverage the underlying Java codes for running the actual analytics with high performance efficiency. Interestingly, the run time for `hyperopt` with TPE algorithm and random search methods are almost the same. Possible reasons for this are that, the TPE algorithm searches optimal parameters intelligently but runs the tuning iterations sequentially. Also, the advantage of TPE may become obvious when there is a higher dimensionality of hyperparameters. \n", "\n", "The three approaches use the same RMSE loss. In this measure, the native Spark construct performs the best. The `hyperopt` based approach performs the second best, but the advantage is very subtle. It should be noted that these differences may be owing to many factors like characteristics of datasets, dimensionality of hyperparameter space, sampling size in the searching, etc. Note the differences in the RMSE metrics may also come from the randomness of the intermediate steps in parameter tuning process. In practice, multiple runs are required for generating statistically robust comparison results. We have tried 5 times for running the same comparison codes above. The results aligned well with each other in terms of objective metric values and elapsed time. " - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "# Conclusions" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "In summary, there are mainly three different approaches for running hyperparameter tuning for Spark based recommendation algorithm. The three different approaches are compared as follows." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "|Approach|Distributed (on Spark)|Param sampling|Advanced hyperparam searching algo|Custom evaluation metrics|Custom data split|\n", "|---------|-------------|--------------|--------------------------|--------------|------------|\n", "|AzureML Services|Parallelizing Spark sessions on multi-node cluster or single Spark session on one VM node.)|Random, Grid, Bayesian sampling for discrete and continuous variables.|Bandit policy, Median stopping policy, and truncation selection policy.|Yes|Yes|\n", "|Spark native construct|Distributed in single-node standalone Spark environment or multi-node Spark cluster.|No|No|Need to re-engineer Spark modules|Need to re-engineer Spark modules.|\n", "|`hyperopt`|No (only support parallelization on MongoDB)|Random sampling for discrete and continuous variables.|Tree Parzen Estimator|Yes|Yes|" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 36, - "metadata": {}, - "outputs": [], "source": [ "# cleanup spark instance\n", "spark.stop()" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "# References\n", "\n", @@ -2300,7 +1530,8 @@ "* `hyperopt`, url: http://hyperopt.github.io/hyperopt/.\n", "* Bergstra, J., Yamins, D., Cox, D. D. 
(2013) Making a Science of Model Search: Hyperparameter Optimization in Hundreds of Dimensions for Vision Architectures. Proc. of the 30th International Conference on Machine Learning (ICML 2013).\n", "* Kris Wright, \"Hyper parameter tuning with hyperopt\", url:https://districtdatalabs.silvrback.com/parameter-tuning-with-hyperopt" - ] + ], + "metadata": {} } ], "metadata": { @@ -2325,4 +1556,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file diff --git a/tests/unit/examples/test_notebooks_pyspark.py b/tests/unit/examples/test_notebooks_pyspark.py index d96e5c2ca9..6cb300c759 100644 --- a/tests/unit/examples/test_notebooks_pyspark.py +++ b/tests/unit/examples/test_notebooks_pyspark.py @@ -31,7 +31,7 @@ def test_data_split_runs(notebooks, output_notebook, kernel_name): @pytest.mark.skipif( sys.platform == "win32", reason="Takes 2764.50s in Windows, while in Linux 124.35s" ) -@pytest.mark.parametrize("data_size", ["100k", "mock100", "mock10"]) +@pytest.mark.parametrize("data_size", ["100k", "mock100"]) def test_als_deep_dive_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["als_deep_dive"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, @@ -50,7 +50,7 @@ def test_evaluation_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks @pytest.mark.spark -@pytest.mark.parametrize("data_size", ["100k", "mock100", "mock10"]) +@pytest.mark.parametrize("data_size", ["100k", "mock100"]) def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["evaluation_diversity"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, @@ -62,13 +62,15 @@ def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name, data @pytest.mark.skipif( sys.platform == "win32", reason="Takes 2409.69s in Windows, while in Linux 138.30s" ) -def test_spark_tuning(notebooks, output_notebook, kernel_name): +@pytest.mark.parametrize("data_size", ["100k", "mock100"]) +def test_spark_tuning(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["spark_tuning"] pm.execute_notebook( notebook_path, output_notebook, kernel_name=kernel_name, parameters=dict( + MOVIELENS_DATA_SIZE=data_size, NUMBER_CORES="*", NUMBER_ITERATIONS=3, SUBSET_RATIO=0.5, diff --git a/tests/unit/examples/test_notebooks_python.py b/tests/unit/examples/test_notebooks_python.py index 76cd854d28..3f06cd9202 100644 --- a/tests/unit/examples/test_notebooks_python.py +++ b/tests/unit/examples/test_notebooks_python.py @@ -50,9 +50,11 @@ def test_baseline_deep_dive_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks -def test_surprise_deep_dive_runs(notebooks, output_notebook, kernel_name): +@pytest.mark.parametrize("data_size", ["100k", "mock100"]) +def test_surprise_deep_dive_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["surprise_svd_deep_dive"] - pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name) + pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, + parameters=dict(MOVIELENS_DATA_SIZE=data_size)) @pytest.mark.notebooks @@ -98,9 +100,11 @@ def test_wikidata_runs(notebooks, output_notebook, kernel_name, tmp): @pytest.mark.notebooks -def test_rlrmc_quickstart_runs(notebooks, output_notebook, kernel_name): +@pytest.mark.parametrize("data_size", ["100k", "mock100"]) +def test_rlrmc_quickstart_runs(notebooks, output_notebook, kernel_name, data_size): 
notebook_path = notebooks["rlrmc_quickstart"] - pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name) + pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, + parameters=dict(rank_parameter=2, MOVIELENS_DATA_SIZE=data_size)) @pytest.mark.notebooks From 0c7adbab22efe8b6c39c8aeb34d3f6ec0f425fe5 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Thu, 23 Sep 2021 16:21:08 +0000 Subject: [PATCH 06/27] Add mock_movielens test marker --- tests/unit/examples/test_notebooks_pyspark.py | 3 +++ tests/unit/examples/test_notebooks_python.py | 2 ++ tox.ini | 1 + 3 files changed, 6 insertions(+) diff --git a/tests/unit/examples/test_notebooks_pyspark.py b/tests/unit/examples/test_notebooks_pyspark.py index 6cb300c759..48ddeb1162 100644 --- a/tests/unit/examples/test_notebooks_pyspark.py +++ b/tests/unit/examples/test_notebooks_pyspark.py @@ -28,6 +28,7 @@ def test_data_split_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks @pytest.mark.spark +@pytest.mark.mock_movielens @pytest.mark.skipif( sys.platform == "win32", reason="Takes 2764.50s in Windows, while in Linux 124.35s" ) @@ -50,6 +51,7 @@ def test_evaluation_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks @pytest.mark.spark +@pytest.mark.mock_movielens @pytest.mark.parametrize("data_size", ["100k", "mock100"]) def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["evaluation_diversity"] @@ -59,6 +61,7 @@ def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name, data @pytest.mark.notebooks @pytest.mark.spark +@pytest.mark.mock_movielens @pytest.mark.skipif( sys.platform == "win32", reason="Takes 2409.69s in Windows, while in Linux 138.30s" ) diff --git a/tests/unit/examples/test_notebooks_python.py b/tests/unit/examples/test_notebooks_python.py index 3f06cd9202..da9e65d214 100644 --- a/tests/unit/examples/test_notebooks_python.py +++ b/tests/unit/examples/test_notebooks_python.py @@ -50,6 +50,7 @@ def test_baseline_deep_dive_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks +@pytest.mark.mock_movielens @pytest.mark.parametrize("data_size", ["100k", "mock100"]) def test_surprise_deep_dive_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["surprise_svd_deep_dive"] @@ -100,6 +101,7 @@ def test_wikidata_runs(notebooks, output_notebook, kernel_name, tmp): @pytest.mark.notebooks +@pytest.mark.mock_movielens @pytest.mark.parametrize("data_size", ["100k", "mock100"]) def test_rlrmc_quickstart_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["rlrmc_quickstart"] diff --git a/tox.ini b/tox.ini index 815e06dc14..bfb0b68833 100644 --- a/tox.ini +++ b/tox.ini @@ -66,6 +66,7 @@ markers = gpu: mark a test as gpu test spark: mark a test as spark test vw: mark a test as vowpal wabbit test + mock_movielens: mark a test that uses the mock dataset instead of real dataset testpaths = tests addopts = From 83f26e800db6bfd164196684e597340c85301fea Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Thu, 23 Sep 2021 17:33:58 +0000 Subject: [PATCH 07/27] Parametrize als_deep_dive NB --- .../als_deep_dive.ipynb | 499 +++++++----------- tests/unit/examples/test_notebooks_pyspark.py | 9 +- 2 files changed, 187 insertions(+), 321 deletions(-) diff --git a/examples/02_model_collaborative_filtering/als_deep_dive.ipynb b/examples/02_model_collaborative_filtering/als_deep_dive.ipynb index ce825152fc..a8b19a4d65 100644 --- 
a/examples/02_model_collaborative_filtering/als_deep_dive.ipynb +++ b/examples/02_model_collaborative_filtering/als_deep_dive.ipynb @@ -2,31 +2,32 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "# Spark Collaborative Filtering (ALS) Deep Dive" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Spark MLlib provides a collaborative filtering algorithm that can be used for training a matrix factorization model, which predicts explicit or implicit ratings of users on items for recommendations.\n", "\n", "This notebook presents a deep dive into the Spark collaborative filtering algorithm." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## 1 Matrix factorization algorithm\n", "\n", @@ -53,11 +54,11 @@ "Owing to the term of $q_{i}^{T}p_{u}$ the loss function is non-convex. Gradient descent method can be applied but this will incur expensive computations. An Alternating Least Square (ALS) algorithm was therefore developed to overcome this issue. \n", "\n", "The basic idea of ALS is to learn one of $q$ and $p$ at a time for optimization while keeping the other as constant. This makes the objective at each iteration convex and solvable. The alternating between $q$ and $p$ stops when there is convergence to the optimal. It is worth noting that this iterative computation can be parallelised and/or distributed, which makes the algorithm desirable for use cases where the dataset is large and thus the user-item rating matrix is super sparse (as is typical in recommendation scenarios). A comprehensive discussion of ALS and its distributed computation can be found [here](http://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf)." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## 2 Spark Mllib implementation\n", "\n", @@ -66,28 +67,29 @@ "* The uniqueness of ALS implementation is that it distributes the matrix factorization model training by using \"Alternating Least Square\" method. \n", "* In the training method, there are parameters that can be selected to control the model performance.\n", "* Both explicit and implicit ratings are supported by Spark ALS model." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## 3 Spark ALS based MovieLens recommender\n", "\n", "In the following code, the MovieLens-100K dataset is used to illustrate the ALS algorithm in Spark." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "**Note**: This notebook requires a PySpark environment to run properly. Please follow the steps in [SETUP.md](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md#dependencies-setup) to install the PySpark environment." 
- ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# set the environment path to find Recommenders\n", "import sys\n", @@ -116,31 +118,24 @@ "print(\"System version: {}\".format(sys.version))\n", "print(\"Pandas version: {}\".format(pd.__version__))\n", "print(\"PySpark version: {}\".format(pyspark.__version__))" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "System version: 3.5.5 |Anaconda custom (64-bit)| (default, May 13 2018, 21:12:35) \n", - "[GCC 7.2.0]\n", - "Pandas version: 0.23.0\n", - "PySpark version: 2.3.1\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Data column names" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": null, + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], "source": [ "MOVIELENS_DATA_SIZE = \"100k\"\n", "\n", @@ -148,8 +143,15 @@ "COL_ITEM = \"MovieId\"\n", "COL_RATING = \"Rating\"\n", "COL_PREDICTION = \"prediction\"\n", - "COL_TIMESTAMP = \"Timestamp\"\n", - "\n", + "COL_TIMESTAMP = \"Timestamp\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ "schema = StructType(\n", " (\n", " StructField(COL_USER, IntegerType()),\n", @@ -158,156 +160,127 @@ " StructField(COL_TIMESTAMP, LongType()),\n", " )\n", ")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Model hyper parameters - these parameters are selected with reference to the benchmarking results [here](http://mymedialite.net/examples/datasets.html)." - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "RANK = 10\n", "MAX_ITER = 15\n", "REG_PARAM = 0.05" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Number of recommended items" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "K = 10" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Initialize a Spark session." - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "spark = start_or_get_spark(\"ALS Deep Dive\", memory=\"16g\")" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3.1 Load and prepare data" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Data is read from csv into a Spark DataFrame." 
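+    "\n",
+    "Note: when `MOVIELENS_DATA_SIZE` is set to `\"mock100\"` (the value used by the parametrized unit tests in this patch series), `movielens.load_spark_df` is expected to return a small generated dataset instead of downloading the real MovieLens data.\n"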
- ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dfs = movielens.load_spark_df(spark=spark, size=MOVIELENS_DATA_SIZE, schema=schema)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stderr", - "text": [ - "../../recommenders/dataset/movielens.py:471: UserWarning: Both schema and header are provided.\n", - " The header argument will be ignored.\n", - " warnings.warn(WARNING_HAVE_SCHEMA_AND_HEADER)\n", - "100%|██████████| 4.81k/4.81k [00:01<00:00, 2.50kKB/s]\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 25, - "source": [ - "dfs.show(5)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+------+-------+------+---------+\n", - "|UserId|MovieId|Rating|Timestamp|\n", - "+------+-------+------+---------+\n", - "| 196| 242| 3.0|881250949|\n", - "| 186| 302| 3.0|891717742|\n", - "| 22| 377| 1.0|878887116|\n", - "| 244| 51| 2.0|880606923|\n", - "| 166| 346| 1.0|886397596|\n", - "+------+-------+------+---------+\n", - "only showing top 5 rows\n", - "\n" - ] - } - ], + "execution_count": null, "metadata": { "scrolled": true - } + }, + "outputs": [], + "source": [ + "dfs.show(5)" + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Data is then randomly split by 80-20 ratio for training and testing." - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dfs_train, dfs_test = spark_random_split(dfs, ratio=0.75, seed=42)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3.2 Train a movielens model " - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "It is worth noting that Spark ALS model allows dropping cold users to favor a robust evaluation with the testing data. In case there are cold users, Spark ALS implementation allows users to drop cold users in order to make sure evaluations on the prediction results are sound." - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "als = ALS(\n", " maxIter=MAX_ITER, \n", @@ -320,38 +293,38 @@ ")\n", "\n", "model = als.fit(dfs_train)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3.3 Prediction with the model\n", "\n", "The trained model can be used to predict ratings with a given test data." - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dfs_pred = model.transform(dfs_test).drop(COL_RATING)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "With the prediction results, the model performance can be evaluated." 
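+    "\n",
+    "For reference, the first metric reported below is the root mean squared error over the $N$ rated user-item pairs of the test set: $\\text{RMSE} = \\sqrt{\\frac{1}{N}\\sum_{(u,i)}(r_{u,i} - \\hat{r}_{u,i})^{2}}$.\n"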
- ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 29, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "evaluations = SparkRatingEvaluation(\n", " dfs_test, \n", @@ -369,89 +342,45 @@ " \"Explained variance score = {}\".format(evaluations.exp_var()),\n", " sep=\"\\n\"\n", ")" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "RMSE score = 0.9697095550242029\n", - "MAE score = 0.7554838330206419\n", - "R2 score = 0.24874053010909036\n", - "Explained variance score = 0.2547961843833687\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Oftentimes ranking metrics are also of interest to data scientists. Note usually ranking metrics apply to the scenario of recommending a list of items. In our case, the recommended items should be different from those that have been rated by the users. " - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 30, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# Get the cross join of all user-item pairs and score them.\n", - "users = dfs_train.select('UserId').distinct()\n", - "items = dfs_train.select('MovieId').distinct()\n", + "users = dfs_train.select(COL_USER).distinct()\n", + "items = dfs_train.select(COL_ITEM).distinct()\n", "user_item = users.crossJoin(items)\n", "dfs_pred = model.transform(user_item)\n", "\n", "# Remove seen items.\n", "dfs_pred_exclude_train = dfs_pred.alias(\"pred\").join(\n", " dfs_train.alias(\"train\"),\n", - " (dfs_pred['UserId'] == dfs_train['UserId']) & (dfs_pred['MovieId'] == dfs_train['MovieId']),\n", + " (dfs_pred[COL_USER] == dfs_train[COL_USER]) & (dfs_pred[COL_ITEM] == dfs_train[COL_ITEM]),\n", " how='outer'\n", ")\n", "\n", "dfs_pred_final = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[\"train.Rating\"].isNull()) \\\n", - " .select('pred.' + 'UserId', 'pred.' + 'MovieId', 'pred.' + \"prediction\")\n", + " .select('pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' 
+ \"prediction\")\n", "\n", "dfs_pred_final.show()" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+------+-------+----------+\n", - "|UserId|MovieId|prediction|\n", - "+------+-------+----------+\n", - "| 1| 587| 2.9286714|\n", - "| 1| 869| 2.0478792|\n", - "| 1| 1208| 2.349619|\n", - "| 1| 1677| 3.1982298|\n", - "| 2| 80| 2.2628117|\n", - "| 2| 303| 2.9711432|\n", - "| 2| 472| 3.0840402|\n", - "| 2| 582| 4.65145|\n", - "| 2| 838| 1.8449162|\n", - "| 2| 975| 3.177288|\n", - "| 2| 1260| 3.466885|\n", - "| 2| 1325| 1.1348095|\n", - "| 2| 1381| 4.0551796|\n", - "| 2| 1530| 2.1732688|\n", - "| 3| 22| 3.0636034|\n", - "| 3| 57| 2.8428345|\n", - "| 3| 89| 3.459687|\n", - "| 3| 367| 2.3071244|\n", - "| 3| 1091| 1.9453487|\n", - "| 3| 1167| 2.0511415|\n", - "+------+-------+----------+\n", - "only showing top 20 rows\n", - "\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "evaluations = SparkRankingEvaluation(\n", " dfs_test, \n", @@ -470,23 +399,11 @@ " \"Mean average precision = {}\".format(evaluations.map_at_k()),\n", " sep=\"\\n\"\n", ")" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Precision@k = 0.04061505832449631\n", - "Recall@k = 0.013571438145917577\n", - "NDCG@k = 0.03699684800440573\n", - "Mean average precision = 0.003702411260039904\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3.4 Fine tune the model\n", "\n", @@ -499,47 +416,48 @@ "|`maxIters`|Maximum number of iterations|10|The more iterations the better the model converges to the optimal point.|\n", "\n", "It is always a good practice to start model building with default parameter values and then sweep the parameter in a range to find the optimal combination of parameters. The following parameter set is used for training ALS models for comparison study purposes." - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "param_dict = {\n", " \"rank\": [10, 15, 20],\n", " \"regParam\": [0.001, 0.1, 1.0]\n", "}" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Generate a dictionary for each parameter combination which can then be fed into model training." - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 33, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "param_grid = generate_param_grid(param_dict)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "Train models with parameters specified in the parameter grid. Evaluate the model with, for example, the RMSE metric, and then record the metrics for visualization." 
- ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 34, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "rmse_score = []\n", "\n", @@ -569,164 +487,104 @@ "\n", "rmse_score = [float('%.4f' % x) for x in rmse_score]\n", "rmse_score_array = np.reshape(rmse_score, (len(param_dict[\"rank\"]), len(param_dict[\"regParam\"]))) " - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "rmse_df = pd.DataFrame(data=rmse_score_array, index=pd.Index(param_dict[\"rank\"], name=\"rank\"), \n", " columns=pd.Index(param_dict[\"regParam\"], name=\"reg. parameter\"))" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "fig, ax = plt.subplots()\n", "sns.heatmap(rmse_df, cbar=False, annot=True, fmt=\".4g\")" - ], - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "" - ] - }, - "metadata": {}, - "execution_count": 36 - }, - { - "output_type": "display_data", - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAEKCAYAAAD6q1UVAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAIABJREFUeJzt3XeYU1X+x/H3yUwywzAFZqjSUVSKCD8BCwsMyoLiosgKLBaKAhYWbCC4soC6riIKu6KsIChFBUR2F8Te2MUyUpQiTao4OCK9OS3J+f2REBhghqySBOZ+Xs+Th+Sec5Pvmct8cnNykjHWWkRExFlcsS5ARESiT+EvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHCg+1gUUZ/e1bfTR47NU5Q82xroE+RX2D8+MdQnyK5Qd8aoJp5/O/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxoPhYF1AalL1nKJ4Wl+Pft5f9A/qc0O6+rCVJt9wO1g8+H4cnPYd3zSoAEq7qQJnuPQHInT2d/I/eA8DTqi1lut8KLheFS7L4+eUXojcgB+jQPpOxYx8lzuXipZdn8tSY54u016xZjcmTxlKhYjp79+yjZ+9BbN+ew8UXN+T58U+QkpqMz+fjiSfHM2fOfADaZrZk9Og/4/G4+eqrVfTr/wA+n49OndrzyKgh+P0Wr9fLAw+M5LPPl8Ri2KWOp1M/4s9vij18gNwXhp3QHnf+JXja3oi1Fvw+Ct6bgf/7b3HVboCn/S2hfq4KVcmf+xy+9ctw1WmIp10PMC4oyCN/3kTs3h3RHFZUGGttrGs4qd3XtjkzCzuJ+IaNsXm5JN//p5OGP4llIC8XgLjadUkZNop9d/bEJKeQ9vdJ7L+nP2BJ+/uL7L+nHxgXac9OZv89/bAH9lP2vofI//g9vCu+iu7AfqHKH2yMdQklcrlcrF29iKs79iA7O4esL97mllvvZu3aDaE+s2ZO5K23P2TGjDm0zWxJr17d6d1nEPXq1cVay8aNW6hatTKLs96hUeNMDhw4yOaNi2l/dXc2bNjMqJGD+e67bF6eOouyZZM4fPhnAC66qD4zX3uBRhe1idXwT2n/8MxYlxA2V80LoSCPhM53njT8cSdAYT4AplINEm8cRO6EIUX7JJYlaeBYfh43ELwFlBnwNHmzx2J3/UB8s3a4zjmXgvkTozCa06PsiFdNOP007XMaeFevxB48WHyHYPADmMQyHHlWc1/SgsKvl2IPHcQeOkTh10txX3Iprirn4Pvhe+yB/QAULl9GQsszNyzONi2aN2XTpq1s2bKNwsJCXn99Htd16lCkT/369fj4408B+GThZ1zXqT0AGzZsZuPGLQDk5Ozgp527qVgxg4yM8uTn57Nhw2YAPvzwv3S5oSNAKPgByiYlcaaecJ2N/NvWYXMPFd8hGPwAxpMAJ/nZxzdogW/jCvAWBDZYi0koE9gnIQl7aO9prflMoWmfKPFc3oqkXv0w5cpzcFTgDMWVUQH/zp9Cffy7duLKqEDhsi+Jq14TV6Uq+HftxHP5bzDx7liVXuqcU60K32f/ELqdvT2HFs2bFumzcuUautzQkfHPTaFz52tITU0hPb08e/YcDYLmzZrg8bjZtGkr1lrcbjeX/F9jln21ki5drqV6jXNCfa+//moe/8tDVKqYwXXX94r8ICUk7oJmeK7qjimbSt7MMSe0xze8nMKsd0K38xdMJrHHEKy3EPJzyZ0yMprlRo3O/KOk4ItF7LuzJwcfe5gyt94W3HqSV2fWYg8d4vDz40geNpLUp8bj3/Ej+HxRrbc0M+bEn/vxZ+MPDn2M1q0vY8ni92jd6jKys3Pwer2h9ipVKjF16rP07Xt/aN+bb7mbZ54exRefLeDQocN4vUeP2bx579Loojb8/sbbeWTUcdMOElG+9UvJnTCEvNnj8GR2LdJmksvhqlQD36aVoW3uS68hb+YYcv82EO/y/+Bpf3O0S46KiJz5G2PSgIeAzkDF4OafgHnAk9bafcXs1x/oD/BMo3r0qlk1EuXFlHf1SuKqVMOkpuHfvRP3RU1Cba4KFSlctRyAwsWfU7j4cwASru4Efn9M6i2NtmfnUKP60bPy6tWqkpNT9A29nJwddO3WD4CyZZPocs
O1HDgQmNpLSUlm/rzpjBj5FF8uPvo+TNaXy8i8sgsAv23Xmnr16p7w2Is+/ZK6dWuRkVGe3btL53TCmcq/bR2mfCUokwzBqaK4BpfiXbcU/MEn6qQUXJVr4t++CQDv6iwSbx4aq5IjKlJn/q8De4FMa22GtTYDaBvcNqe4nay1k6y1zay1zUpT8LuqVgtdjzu3HiY+HntgP4XLFuNu2hyTnIxJTsbdtDmFyxYDYNLKBf5NTibx2uvJe29BTGovjZYsXc5559Whdu0auN1uunW7njcXvF+kT0ZG+dArhGFDBzJ12iwA3G43c+dM4ZVX3mDu3KLHpGLFDAA8Hg9DBg9g0qQZAJx7bu1Qn6ZNGuHxuBX8UWLKVw5dd1WpDXHxoeAHiG90Bd7VXxzdIfcwJjEJk14FgLi6jfDv2h6tcqMqUnP+ta21o4/dYK39ERhtjLmtmH3OWskPjsB9URNMahrlps0h99WXA//JgPx35uNp2ZqEKzuAz4vNL+Dg6EcAsIcOkjtrOmnjAisJcmdOwx4KnF2WvWMQcXXODW33/5Adg5GVTj6fj3vuHc7bb71GnMvF1GmzWbPmW0aNHMzSZStYsOAD2rS5gscfewiLZdGiLAYOehiArl070arVpaRnlKdnz24A3N73PlasWM3g+++i47XtcLlcTJw4nU8WfgZAlxs6csstN1JY6CUvN4+bbr4rZmMvbRK6DMBVqz4mKYUy946ncOEbod8977KPiK/fnPjGrbB+H3gLyJ87PrSvSauASU3Hv3Xt0Tu0fvLfnExi13ux1g95h8mfPynaw4qKiCz1NMa8D3wITLPW7ghuqwz0Bn5rrW13qvs4m5Z6SlFn+lJPKdnZtNRTThTrpZ7dgQzgP8aYPcaYPcBCIB3oWtKOIiISeRGZ9rHW7gWGBi9FGGP6AC9H4nFFRCQ8sVjq+UgMHlNERI4RqaWeK4trAioX0yYiIlESqdU+lYEOBJZ2HssAn0foMUVEJEyRCv8FQLK1dvnxDcaYhRF6TBERCVOk3vC9vYS2myLxmCIiEj59t4+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXGg+FgXUJzUaS/HugT5pc5pFesKROQUdOYvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOFBY4W+MSTjJtvTTX46IiERDuGf+/zTGuI/cMMZUBT6ITEkiIhJp4Yb/v4E5xpg4Y0xt4D3goUgVJSIikRUfTidr7YvGGA+BJ4HawB3W2s8jWZiIiEROieFvjLn/2JtADWA5cJkx5jJr7dhIFne2GP7Xsfz3s8Wkly/Hv1954YT2xV+tZNCwR6hWtQoA7dpcwV233Ux+fgG9BgyhoLAQn9fHb9v+hj/2vRWArKVf88zzU/D7LUlJiTz+8APUrH5OVMdVmnVon8nYsY8S53Lx0sszeWrM80Xaa9asxuRJY6lQMZ29e/bRs/cgtm/P4eKLG/L8+CdISU3G5/PxxJPjmTNnPgBtM1syevSf8XjcfPXVKvr1fwCfz0e5cmlMfvEZ6tatRX5ePn37P8Dq1etjMexSx9OpH/HnN8UePkDuC8NOaI87/xI8bW/EWgt+HwXvzcD//be4ajfA0/6WUD9Xharkz30O3/pluOo0xNOuBxgXFOSRP28idu+OaA4rKoy1tvhGY0aWtLO19pHTXlFQ4a7NxRd2hlm6fBVJZcrwp8eeLjb8p86cy4QxRX9c1lpyc/NISipDoddLz7sGM+yeO7i4UX2u/UNfnn1yBOfWrsmsfy5g1Zr1PD78gWgN6Vcpc06rWJdQIpfLxdrVi7i6Yw+ys3PI+uJtbrn1btau3RDqM2vmRN56+0NmzJhD28yW9OrVnd59BlGvXl2stWzcuIWqVSuzOOsdGjXO5MCBg2zeuJj2V3dnw4bNjBo5mO++y+blqbMY/cRwDh0+zGN/GccFF5zL+L//lfZXd4/hT6Bk+4dnxrqEsLlqXggFeSR0vvOk4Y87AQrzATCVapB44yByJwwp2iexLEkDx/LzuIHgLaDMgKfJmz0Wu+sH4pu1w3XOuRTMnxiF0ZweZUe8asLpV+KZfyTDvTRp1uQituf872cGxhiSksoA4PV68Xq9GBM4bgY4fPhnAA4eOkzFChmnrV6na9G8KZs2bWXLlm0AvP76PK7r1KFI+NevX48HBo8C4JOFnzH3jSkAbNiwOdQnJ2cHP+3cTcWKGbjd8eTn54faP/zwvwx98I+8PHUW9eufz+inxgOwfv0matWqTqVKFfjpp13RGG6p5t+2DpNWofgOweAHMJ4EOMnJbnyDFvg2rgBvQWCDtZiEMljAJCRhD+09zVWfGcKa8zfGnA8MJjDfH9rHWntlZMoqfVZ8s5Yuve6mUoUMBg/oy3l1awHg8/nodtsgtm3/gR5dfkfjhhcC8Miwe7lr8AgSEzyULZvEa5PGxbL8UuWcalX4PvuH0O3s7Tm0aN60SJ+VK9fQ5YaOjH9uCp07X0Nqagrp6eXZs+doEDRv1gSPx82mTVux1uJ2u7nk/xqz7KuVdOlyLdVrBKbpVq5aww2dO/LZ50to3qwJtWpVp3q1qgr/KIm7oBmeq7pjyqaSN3PMCe3xDS+nMOud0O38BZNJ7DEE6y2E/Fxyp5Q4AXLWCne1zxzga2A4MOSYi4ShwQXn8sHcafxz2gRu+n0nBj30aKgtLi6OudOe56N/zWDVmm/ZsHkrANNn/4t/PP0oH/37FTp3bM9Tz74Yo+pLnyOvro51/PTng0Mfo3Xry1iy+D1at7qM7OwcvF5vqL1KlUpMnfosffveH9r35lvu5pmnR/HFZws4dOgwXq8PgNFPPUe58mksXfI+AwbcxtfLv8Hr80VwhHIs3/ql5E4YQt7scXgyuxZpM8nlcFWqgW/TytA296XXkDdzDLl/G4h3+X/wtL852iVHRbjh77XW/sNau9hau+zIpbjOxpirj7meZoyZYoxZaYx5zRhTuYT9+htjlhpjlk6ePvN/GMaZLbls2dD0TusrWuD1etm7b3+RPqkpyTT/v8Z8mrWUPXv3sX7j5tCrgGuuas3yb9ZEve7Sant2DjWOefO8erWq5Bw3bZeTs4Ou3frRvEUH/jxiNAAHDhwEICUlmfnzpjNi5FN8ufir0D5ZXy4j88ouXN7ydyxalMXGjVsAOHjwEH373U+z5u3p3WcQFStkhKacJHr829ZhyleCMsmhbXENLsW7bin4g0/GSSm4KtfEv30TAN7VWcTVOD8W5UZcuOH/pjHmbmNMVWNM+
pFLCf3/esz1Z4AcoBOwBCj2nRNr7SRrbTNrbbO+PXuEWdqZb9fuPaGzw1Vr1uO3lnJpqezZu48DBw8BkJefT9aSr6lTqwapKSkcOvwzW7dlA/D5kq+pW6tmzOovbZYsXc5559Whdu0auN1uunW7njcXvF+kT0ZG+dArhGFDBzJ12iwA3G43c+dM4ZVX3mDu3AVF9qlYMfC+jMfjYcjgAUyaNAOAtLRU3O7AZyRvv+0mFn36JQeDx10iy5Q/eq7pqlIb4uIh9+jPPr7RFXhXf3F0h9zDmMQkTHpgZV5c3Ub4d22PVrlRFdacP9Ar+O+xUz0WqBvGvs2stU2C18cZY3qV2PssNGTkkyz5eiX79h3gqs63cPftt4amCLrfcC3vf/Ips//1FnHxcSR6PIx5ZBjGGHbu3svDf3kan9+P9Vs6XNmKzJaXAjBq6CDue/hxjMuQmpLMYw/dF8shlio+n4977h3O22+9RpzLxdRps1mz5ltGjRzM0mUrWLDgA9q0uYLHH3sIi2XRoiwGDnoYgK5dO9Gq1aWkZ5SnZ89uANze9z5WrFjN4PvvouO17XC5XEycOJ1PFn4GQP0L6/HyS3/H5/exdu239Os/OGZjL20SugzAVas+JimFMveOp3DhG4GAB7zLPiK+fnPiG7fC+n3gLSB/7vjQviatAiY1Hf/WtUfv0PrJf3MyiV3vxVo/5B0mf/6kaA8rKkpc6vmL79SYbGAsgUUrA4BzbfCBjDErrbWNT3UfZ9NSTynqTF/qKSU7m5Z6yolOy1LPYxljGgENgMQj26y104vp/iKQErw+DagA7DTGVCHwITEREYmhcJd6jgQyCYT/28A1wKfAScO/uM8HWGt/NMZ88osqFRGR0ybcN3xvBK4CfrTW9gEuBk74mucw6YNjIiIxFu60T5611m+M8RpjUoGfKOHNXmPMyuKagGKXeoqISHScMvxNYL3bSmNMOQJz+cuAQ8DiEnarDHQAjv9ctAH0baAiIjF2yvC31lpjTBNr7T7gBWPMu0Cqtba4s3uABUCytfaEN3eNMQt/cbUiInJahDvtk2WMaW6tXWKt3Xqqztba20touync4kREJDLCDf+2wB3GmO+AwwSmb2w46/VFROTME274XxPRKkREJKrC/TOO30W6EBERiZ5w1/mLiEgpovAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQeKj3UBxfL7Yl2BiEippTN/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBwoPtYFlAbDn/gb//18Cenl0/j39AkntC/+eiWDHvoL1apWBqBd6yu4q08P8vML6DVwKAUFhfh8fn6b2ZI/3n4zAA8/Po6lK74huWwSAI//6T4urFc3eoMq5Tq0z2Ts2EeJc7l46eWZPDXm+SLtNWtWY/KksVSomM7ePfvo2XsQ27fncPHFDXl+/BOkpCbj8/l44snxzJkzH4Ar2/6GJ58cjsvl4vChw9zW9z42bdrKvff057bbeuD1etm1cw99+9/Ptm3bYzHsUsfTqR/x5zfFHj5A7gvDTmiPO/8SPG1vxFoLfh8F783A//23uGo3wNP+llA/V4Wq5M99Dt/6ZbjqNMTTrgcYFxTkkT9vInbvjmgOKyqMtTbWNZxU4U8bzszCTmLp8m9IKpPInx4fW2z4T535LyY8NbLIdmstubl5JCWVodDrpefdDzLsnv5c3PBCHn58HG2uaE77tr+J1jBOmzLVM2NdQolcLhdrVy/i6o49yM7OIeuLt7nl1rtZu3ZDqM+smRN56+0PmTFjDm0zW9KrV3d69xlEvXp1sdayceMWqlatzOKsd2jUOJP9+w+wZvUiuvy+D+vWbeTOO3rRvHkTbu97H5ltruDLxV+Rm5vHHf170qbN5dx0810x/AmUbP/wzFiXEDZXzQuhII+EzneeNPxxJ0BhPgCmUg0SbxxE7oQhRfskliVp4Fh+HjcQvAWUGfA0ebPHYnf9QHyzdrjOOZeC+ROjMJrTo+yIV004/TTtcxo0a9KItNSU/3k/YwxJSWUA8Hq9eL0+DGEdN/kVWjRvyqZNW9myZRuFhYW8/vo8ruvUoUif+vXr8fHHnwLwycLPuK5TewA2bNjMxo1bAMjJ2cFPO3dTsWIGEHgyT00J/D9IS0shJydwtrjwP5+Tm5sHwJeLl1G9WtXID9Ih/NvWYXMPFd8hGPwAxpMAJznZjW/QAt/GFeAtCGywFpMQ+L00CUnYQ3tPa81nCk37RMmK1evo0vuPVKqQweABt3FenVoA+Hw+uvW9l23bc+hxw7U0bnhBaJ9nX5zBP6bO4rJLLua+O3vj8bhjVX6pck61Knyf/UPodvb2HFo0b1qkz8qVa+hyQ0fGPzeFzp2vITU1hfT08uzZczQImjdrgsfjZtOmrQDcccdg3pw/g9zcPA4cPEjL33Q64bH79O7Bu+99EpmByUnFXdAMz1XdMWVTyZs55oT2+IaXU5j1Tuh2/oLJJPYYgvUWQn4uuVNGnrBPaRCRM39jTJox5kljzDpjzO7gZW1wW7lIPOaZrMH55/HBnJf459TnuOn3v2PQn/4SaouLi2Puy+P5aO5UVq39lg2btwJw7x29ePPVF5j94jj2HzzIlFffiFH1pY8xJ766On7688Ghj9G69WUsWfwerVtdRnZ2Dl6vN9RepUolpk59lr597w/te889/eh03a3UrtuMadNm8/SYoqFx001daHbJxTz9zD8iMCopjm/9UnInDCFv9jg8mV2LtJnkcrgq1cC3aWVom/vSa8ibOYbcvw3Eu/w/eNrfHO2SoyJS0z6vA3uBTGtthrU2A2gb3DanuJ2MMf2NMUuNMUsnT58VodKiL7lsUmh6p/XlzfF6fezdt79In9SUZJo3vYhPv/wKgIoV0jHG4PG46dyxHavWfhv1ukur7dk51Kh+Tuh29WpVQ1M0R+Tk7KBrt340b9GBP48YDcCBAwcBSElJZv686YwY+RRfLg4crwoV0ml8UQMWL/kagNfnzOfyy5uF7u+qK1vx0LBBdO7Sm4KCgoiOT07Ov20dpnwlKJMc2hbX4FK865aC3xfYkJSCq3JN/Ns3AeBdnUVcjfNjUW7ERSr8a1trR1trfzyywVr7o7V2NFCzuJ2stZOstc2stc369vxDhEqLvl2794bODletWY/fbymXlsqevfs5cDAwX5mXn0/W0uXUqVkdgJ279gCBM9KPF2VRr26t2BRfCi1ZupzzzqtD7do1cLvddOt2PW8ueL9In4yM
8qFXCMOGDmTqtMDJiNvtZu6cKbzyyhvMnbsg1H/v3v2kpaVSL7giq91VrVm3LvAGcpMmDZnw/JPc0KUPO3fujsYQJciUrxy67qpSG+Li4Zj3COIbXYF39RdHd8g9jElMwqRXASCubiP8u0rnyqxIzfl/Z4x5EJhmrd0BYIypDPQGvo/QY8bMkFFPseTrVezbf4CruvTi7ttuDk0RdO/ckfcXfsrsf79DXJyLxIQExox6EGMMO3fv4eG/jsPn82Otnw5tW5HZsgUAQx97mr379mOt5YLz6jJy8IBYDrFU8fl83HPvcN5+6zXiXC6mTpvNmjXfMmrkYJYuW8GCBR/Qps0VPP7YQ1gsixZlMXDQwwB07dqJVq0uJT2jPD17dgPg9r73sWLFau64awivz56E32/Zt3cfffs/AMDoJ/5McnJZZs0MrBj5/vvt3NClT2wGX8okdBmAq1Z9TFIKZe4dT+HCNwIBD3iXfUR8/ebEN26F9fvAW0D+3PGhfU1aBUxqOv6ta4/eofWT/+ZkErvei7V+yDtM/vxJ0R5WVERkqacxpjwwDLgeqAxYYAcwHxhtrd1zqvs4m5Z6SlFn+lJPKdnZtNRTThTuUs+InPlba/caY14GPgCyrLWh11nGmKuBdyPxuCIiEp5IrfYZBMwD/gh8Y4y5/pjmv0biMUVEJHyRmvPvB1xirT1kjKkNvGGMqW2t/TvoU0wiIrEWqfCPOzLVY63daozJJPAEUAuFv4hIzEVqqeePxpgmR24Enwh+B1QALorQY4qISJgiFf49gR+P3WCt9VprewKtI/SYIiISpkit9skuoe2zSDymiIiET9/qKSLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxIIW/iIgDKfxFRBxI4S8i4kAKfxERB1L4i4g4kMJfRMSBFP4iIg6k8BcRcSCFv4iIAyn8RUQcSOEvIuJACn8REQdS+IuIOJDCX0TEgRT+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQMZaG+saHMkY099aOynWdcgvo+N39tKxC9CZf+z0j3UB8qvo+J29dOxQ+IuIOJLCX0TEgRT+seP4OceznI7f2UvHDr3hKyLiSDrzFxFxIIX/aWCMudoYs94Ys9EYM+wk7QnGmNnB9i+NMbWPaXsouH29MabDMdtfMsb8ZIz5JjqjkOOFcVxbG2O+MsZ4jTE3xqJGOblT/f6YgGeDx3alMeb/ol1jrCn8fyVjTBzwPHAN0ADoYYxpcFy324G91trzgHHA6OC+DYA/AA2Bq4EJwfsDmBrcJjEQ5nHdBvQGXotudRKGqZT8+3MNUC946Q/8Iwo1nVEU/r9eC2CjtXaztbYAmAVcf1yf64FpwetvAFcZY0xw+yxrbb61dguwMXh/WGv/C+yJxgDkpE55XK21W621KwF/LAqU4oXx+3M9MN0GZAHljDFVo1PdmUHh/+tVA74/5nZ2cNtJ+1hrvcB+ICPMfSU2dGxKN8cfX4X/r2dOsu34JVTF9QlnX4kNHZvSzfHHV+H/62UDNY65XR34obg+xph4II3AS9Jw9pXY0LEp3Rx/fBX+v94SoJ4xpo4xxkPgDdz5x/WZD/QKXr8R+NgGPmAxH/hDcDVQHQJvPi2OUt1SsnCOq5xhr+LzAAADzklEQVS95gM9g6t+LgP2W2tzYl1UNMXHuoCznbXWa4z5I/AeEAe8ZK1dbYx5FFhqrZ0PTAFmGGM2Ejjj/0Nw39XGmNeBNYAXGGCt9QEYY2YCmUAFY0w2MNJaOyXKw3OscI6rMaY58C+gPNDJGPOItbZhDMuWoJP9/gBuAGvtC8DbQEcCiyx+BvrEptLY0Sd8RUQcSNM+IiIOpPAXEXEghb+IiAMp/EVEHEjhLyLiQAp/kTOQMaa3MeacWNchpZfCX84awQ/knDH/Z4Of1o6U3sD/FP4RrkdKGa3zlzNa8G8fvAN8AlwOdAYuAB4BEoBNQB9r7SFjTEdgLLAL+Aqoa6393Snu+13gS6Ap8C3Q01r7szFmBNAJKAN8DtxhrbXGmIXB2y0JfEr0W2A44AF2Azdba3cYY0YBdYCqwPnA/cBlBL5KeDvQyVpbaIy5JFhzcrDu3sH7nhrslxscd4Pj+1lrc46vx1r7zP/y8xUHs9bqossZewFqE/jK5MuCtysA/wXKBm8PBUYAiQS+pbFOcPtMYEEY922BlsHbLwGDg9fTj+k3g0BYAywEJhzTVp6jJ1F9gWeC10cBnxL4VOnFBD5Fek2w7V8EnsTcBIK7YnB7dwKfJD7yOM2C10/Vb0JJ49RFl5Nd9DJRzgbf2cB3rkPg7LkB8FngTyLgAb4ALgQ228DfRYBA+PcP476/t9Z+Frz+CjAIeBpoa4x5EEgC0oHVwJvBfrOP2b86MDv4XfAeYMsxbe/YwNn9KgJfEfFucPsqAk88FwCNgA+CY4kDTvb9MqfqN/sk+4iUSOEvZ4PDx1w3wAfW2h7HdjDGNP2F9338vKc1xiQCEwiceX8fnMJJLKae8cBYG/iun0wCZ/xH5ANYa/3GmEJr7ZHH8hP43TPAamvt5aeo8VT9DhezXaRYZ8ybZyJhygJaGmPOAzDGJBljzgfWAXWP+fvI3cO8v5rGmCOh2oPAVM2RoN9ljEkm8E2sxUkjMDcPR7+5NVzrgYpHHt8Y4zbGHPliuINAShj9RH4Rhb+cVay1Owm8KTrTGLOSwJPBhdbaXOBu4F1jzKfADgJ/MQ1jTDNjzORi7nIt0Ct4X+nAP6y1+4AXCUzP/JvA1zsXZxQwxxiziMAbsf/LWAoIPLGMNsasAJYDVwSbpwIvGGOWE5jmKa6fyC+i1T5Sahhjkm1g1Y8h8MfXN1hrx5XQvzaBN4UbRalEkTOGzvylNOkXPFNeTWA6ZmKM6xE5Y+nMX0TEgXTmLyLiQAp/EREHUviLiDiQwl9ExIEU/iIiDqTwFxFxoP8HkqdScClQ374AAAAASUVORK5CYII=", - "text/plain": [ - "
" - ] - }, - "metadata": {} - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The calculated RMSE scores can be visualized to comparatively study how model performance is affected by different parameters." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "It can be seen from this visualization that RMSE first decreases and then increases as rank increases, due to overfitting. When the rank equals 20 and the regularization parameter equals 0.1, the model achieves the lowest RMSE score." - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "### 3.5 Top K recommendation" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### 3.5.1 Top k for all users (items)" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 37, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dfs_rec = model.recommendForAllUsers(10)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 38, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dfs_rec.show(10)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+------+--------------------+\n", - "|UserId| recommendations|\n", - "+------+--------------------+\n", - "| 471|[[814, 3.7504902]...|\n", - "| 463|[[814, 3.1264882]...|\n", - "| 833|[[814, 3.3154674]...|\n", - "| 496|[[814, 3.0553887]...|\n", - "| 148|[[814, 4.030121],...|\n", - "| 540|[[814, 3.866104],...|\n", - "| 392|[[814, 4.1199512]...|\n", - "| 243|[[814, 3.7487845]...|\n", - "| 623|[[814, 3.9018161]...|\n", - "| 737|[[814, 3.85075], ...|\n", - "+------+--------------------+\n", - "only showing top 10 rows\n", - "\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### 3.5.2 Top k for a selected set of users (items)" - ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 39, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "users = dfs_train.select(als.getUserCol()).distinct().limit(3)\n", "\n", "dfs_rec_subset = model.recommendForUserSubset(users, 10)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 40, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "dfs_rec_subset.show(10)" - ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+------+--------------------+\n", - "|UserId| recommendations|\n", - "+------+--------------------+\n", - "| 471|[[814, 3.7504902]...|\n", - "| 463|[[814, 3.1264882]...|\n", - "| 148|[[814, 4.030121],...|\n", - "+------+--------------------+\n", - "\n" - ] - } - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "#### 3.5.3 Run-time considerations for top-k recommendations\n", "\n", @@ -735,28 +593,28 @@ "* Inner products of user-item pairs are calculated individually instead of leveraging matrix block multiplication features which are available in certain contemporary computing acceleration libraries (e.g., BLAS).\n", "\n", "More details about possible optimizations of the top k recommendations in Spark can be found [here](https://engineeringblog.yelp.com/2018/05/scaling-collaborative-filtering-with-pyspark.html)." 
- ], - "metadata": {} + ] }, { "cell_type": "code", - "execution_count": 41, + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# cleanup spark instance\n", "spark.stop()" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## References" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "1. Yehuda Koren, Robert Bell, and Chris Volinsky, \"Matrix Factorization Techniques for Recommender Systems\n", "\", ACM Computer, Vol. 42, Issue 8, pp 30-37, Aug., 2009.\n", @@ -766,14 +624,18 @@ "4. Seaborn. url: https://seaborn.pydata.org/\n", "5. Scaling collaborative filtering with PySpark. url: https://engineeringblog.yelp.com/2018/05/scaling-collaborative-filtering-with-pyspark.html\n", "6. Matrix Completion via Alternating Least Square (ALS). url: http://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf" - ], - "metadata": {} + ] } ], "metadata": { + "celltoolbar": "Tags", + "interpreter": { + "hash": "7ec2189bea0434770dca7423a25e631e1cca9c4e2b4ff137a82f4dff32ac9607" + }, "kernelspec": { - "name": "python3", - "display_name": "Python 3.6.9 64-bit ('.env': venv)" + "display_name": "Python 3", + "language": "python", + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -786,11 +648,8 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.9" - }, - "interpreter": { - "hash": "7ec2189bea0434770dca7423a25e631e1cca9c4e2b4ff137a82f4dff32ac9607" } }, "nbformat": 4, "nbformat_minor": 2 -} \ No newline at end of file +} diff --git a/tests/unit/examples/test_notebooks_pyspark.py b/tests/unit/examples/test_notebooks_pyspark.py index 48ddeb1162..0e3fb41e09 100644 --- a/tests/unit/examples/test_notebooks_pyspark.py +++ b/tests/unit/examples/test_notebooks_pyspark.py @@ -8,6 +8,8 @@ except ImportError: pass # disable error while collecting tests for non-notebook environments +from recommenders.utils.constants import DEFAULT_RATING_COL, DEFAULT_USER_COL, DEFAULT_ITEM_COL + @pytest.mark.notebooks @pytest.mark.spark @@ -36,7 +38,12 @@ def test_data_split_runs(notebooks, output_notebook, kernel_name): def test_als_deep_dive_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["als_deep_dive"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, - parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=data_size)) + parameters=dict( + MOVIELENS_DATA_SIZE=data_size, + COL_USER=DEFAULT_USER_COL, + COL_ITEM=DEFAULT_ITEM_COL, + COL_RATING=DEFAULT_RATING_COL, + )) @pytest.mark.notebooks From 5e679bef5ee5a919aca0f8b8e312576f40bca13c Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Thu, 23 Sep 2021 20:34:21 +0000 Subject: [PATCH 08/27] Mock movielens schema v2 --- .../als_movielens_diversity_metrics.ipynb | 104 ++++-------------- recommenders/datasets/mock/movielens.py | 81 ++++++++------ recommenders/datasets/movielens.py | 40 +++---- recommenders/utils/constants.py | 9 ++ setup.py | 2 +- tests/unit/examples/test_notebooks_python.py | 2 +- .../datasets/mock/test_movielens.py | 88 +++++++-------- .../recommenders/datasets/test_movielens.py | 29 ++++- 8 files changed, 170 insertions(+), 185 deletions(-) diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index aaba0a35d1..38de757530 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -142,7 +142,7 @@ }, { 
"cell_type": "code", - "execution_count": 1, + "execution_count": null, "source": [ "# set the environment path to find Recommenders\n", "%load_ext autoreload\n", @@ -174,17 +174,7 @@ "print(\"System version: {}\".format(sys.version))\n", "print(\"Spark version: {}\".format(pyspark.__version__))\n" ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "System version: 3.6.9 (default, Jan 26 2021, 15:33:00) \n", - "[GCC 8.4.0]\n", - "Spark version: 2.4.8\n" - ] - } - ], + "outputs": [], "metadata": {} }, { @@ -197,7 +187,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "source": [ "# top k items to recommend\n", "TOP_K = 10\n", @@ -206,11 +196,11 @@ "MOVIELENS_DATA_SIZE = 'mock10'\n", "\n", "# user, item column names\n", - "COL_USER=\"userId\"\n", - "COL_ITEM=\"itemID\"\n", - "COL_RATING=\"rating\"\n", - "COL_TITLE=\"title\"\n", - "COL_GENRE=\"genres\"" + "COL_USER=\"UserId\"\n", + "COL_ITEM=\"ItemId\"\n", + "COL_RATING=\"Rating\"\n", + "COL_TITLE=\"Title\"\n", + "COL_GENRE=\"Genre\"" ], "outputs": [], "metadata": { @@ -230,7 +220,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "source": [ "# the following settings work well for debugging locally on VM - change when running on a cluster\n", "# set up a giant single executor with many threads and specify memory cap\n", @@ -251,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "source": [ "# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n", "schema = StructType(\n", @@ -263,32 +253,10 @@ " )\n", ")\n", "\n", - "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=COL_TITLE, genres_col=\"genres\")\n", + "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=COL_TITLE, genres_col=COL_GENRE)\n", "data.show()" ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "+------+------+------+---------+-----+--------+\n", - "|userID|itemID|rating|timestamp|title| genres|\n", - "+------+------+------+---------+-----+--------+\n", - "| 8| 3| 4.0|2022-2-22| foo|genreA|0|\n", - "| 8| 9| 5.0|2022-2-22| foo|genreA|0|\n", - "| 5| 1| 5.0|2022-2-22| foo|genreA|0|\n", - "| 9| 1| 1.0|2022-2-22| foo|genreA|0|\n", - "| 7| 5| 5.0|2022-2-22| foo|genreA|0|\n", - "| 3| 6| 5.0|2022-2-22| foo|genreA|0|\n", - "| 2| 6| 2.0|2022-2-22| foo|genreA|0|\n", - "| 5| 7| 4.0|2022-2-22| foo|genreA|0|\n", - "| 6| 9| 2.0|2022-2-22| foo|genreA|0|\n", - "| 5| 6| 3.0|2022-2-22| foo|genreA|0|\n", - "+------+------+------+---------+-----+--------+\n", - "\n" - ] - } - ], + "outputs": [], "metadata": {} }, { @@ -300,22 +268,13 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "source": [ "train_df, test_df = spark_random_split(data.select(COL_USER, COL_ITEM, COL_RATING), ratio=0.75, seed=123)\n", "print (\"N train_df\", train_df.cache().count())\n", "print (\"N test_df\", test_df.cache().count())" ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "N train_df 6\n", - "N test_df 4\n" - ] - } - ], + "outputs": [], "metadata": {} }, { @@ -334,7 +293,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "source": [ "users = train_df.select(COL_USER).distinct()\n", "items = train_df.select(COL_ITEM).distinct()\n", @@ -355,7 +314,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "source": [ "header 
= {\n", " \"userCol\": COL_USER,\n", @@ -380,22 +339,14 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "source": [ "with Timer() as train_time:\n", " model = als.fit(train_df)\n", "\n", "print(\"Took {} seconds for training.\".format(train_time.interval))" ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Took 2.296935658028815 seconds for training.\n" - ] - } - ], + "outputs": [], "metadata": {} }, { @@ -409,7 +360,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "source": [ "# Score all user-item pairs\n", "dfs_pred = model.transform(user_item)\n", @@ -431,16 +382,7 @@ " \n", "print(top_k_reco.count())" ], - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "30\n", - "30\n" - ] - } - ], + "outputs": [], "metadata": {} }, { @@ -688,9 +630,9 @@ "source": [ "# Get movie features \"title\" and \"genres\"\n", "movies = (\n", - " data.groupBy(COL_ITEM, COL_TITLE, \"genres\").count()\n", + " data.groupBy(COL_ITEM, COL_TITLE, COL_GENRE).count()\n", " .na.drop() # remove rows with null values\n", - " .withColumn(\"genres\", F.split(F.col(\"genres\"), \"\\|\")) # convert to array of genres\n", + " .withColumn(COL_GENRE, F.split(F.col(COL_GENRE), \"\\|\")) # convert to array of genres\n", " .withColumn(COL_TITLE, F.regexp_replace(F.col(COL_TITLE), \"[\\(),:^0-9]\", \"\")) # remove year from title\n", " .drop(\"count\") # remove unused columns\n", ")" @@ -724,7 +666,7 @@ "hashed_data = text_hasher.transform(clean_data)\n", "\n", "# step 2: fit a CountVectorizerModel from column \"genres\".\n", - "count_vectorizer = CountVectorizer(inputCol=\"genres\", outputCol=\"genres_features\")\n", + "count_vectorizer = CountVectorizer(inputCol=COL_GENRE, outputCol=\"genres_features\")\n", "count_vectorizer_model = count_vectorizer.fit(hashed_data)\n", "vectorized_data = count_vectorizer_model.transform(hashed_data)\n", "\n", diff --git a/recommenders/datasets/mock/movielens.py b/recommenders/datasets/mock/movielens.py index 4344de7e42..44c5acc221 100644 --- a/recommenders/datasets/mock/movielens.py +++ b/recommenders/datasets/mock/movielens.py @@ -14,22 +14,26 @@ DEFAULT_ITEM_COL, DEFAULT_RATING_COL, DEFAULT_TIMESTAMP_COL, + DEFAULT_TITLE_COL, + DEFAULT_GENRE_COL, + DEFAULT_HEADER ) import random from typing import Optional -from pandera.typing import DateTime, Series -from pandera import Field, Check -from pandera.schemas import DataFrameSchema +import pandas +import pyspark.sql +from pandera.typing import Series +from pandera import Field from pyspark.sql import SparkSession -from pyspark.sql.types import StructField, StructType, LongType, IntegerType, StringType, FloatType +from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType -class MockMovielens100kSchema(pa.SchemaModel): +class MockMovielensSchema(pa.SchemaModel): """ Mock dataset schema to generate fake data for testing purpose. 
- This schema is configured to mimic the Movielens 100k dataset + This schema is configured to mimic the Movielens dataset http://files.grouplens.org/datasets/movielens/ml-100k/ """ @@ -38,71 +42,78 @@ class MockMovielens100kSchema(pa.SchemaModel): # And 1682 total items itemID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) # Rating is on the scale from 1 to 5 - rating: Series[int] = Field(in_range={"min_value": 1, "max_value": 5}) + rating: Series[float] = Field(in_range={"min_value": 1, "max_value": 5}) timestamp: Series[str] = Field(eq="2022-2-22") title: Series[str] = Field(eq="foo") - genres: Series[str] = Field(eq="genreA|0") + genre: Series[str] = Field(eq="genreA|0") @classmethod def get_df( cls, size: int = 3, seed: int = 100, - # title_col: Optional[str] = None, genres_col: Optional[str] = None - ): + keep_first_n_cols: Optional[int] = None, + keep_title_col: bool = False, keep_genre_col: bool = False, + ) -> pandas.DataFrame: """Return fake movielens dataset as a Pandas Dataframe with specified rows. Args: size (int): number of rows to generate seed (int, optional): seeding the pseudo-number generation. Defaults to 100. - title_col (str, optional): if not None, append a title column. Defaults to None. - genres_col (str, optional): if not None, append a genre column. Defaults to None. + keep_first_n_cols (int, optional): keep the first n default movielens columns. + keep_title_col (bool): remove the title column if False. Defaults to True. + keep_genre_col (bool): remove the genre column if False. Defaults to True. Returns: pandas.DataFrame: a mock dataset """ + schema = cls.to_schema() + if keep_first_n_cols is not None: + if keep_first_n_cols < 1 or keep_first_n_cols > len(DEFAULT_HEADER): + raise ValueError(f"Invalid value for 'keep_first_n_cols': {keep_first_n_cols}. Valid range: [1-{len(DEFAULT_HEADER)}]") + schema = schema.remove_columns(DEFAULT_HEADER[keep_first_n_cols:]) + if not keep_title_col: + schema = schema.remove_columns([DEFAULT_TITLE_COL]) + if not keep_genre_col: + schema = schema.remove_columns([DEFAULT_GENRE_COL]) + random.seed(seed) - return cls.example(size=size) + return schema.example(size=size) @classmethod def get_spark_df( cls, spark: SparkSession, size: int = 3, seed: int = 100, - # title_col: Optional[str] = None, genres_col: Optional[str] = None, - # schema: Optional[StructType] = None - ): + keep_title_col: bool = False, keep_genre_col: bool = False, + ) -> pyspark.sql.DataFrame: """Return fake movielens dataset as a Spark Dataframe with specified rows Args: spark (SparkSession): spark session to load the dataframe into size (int): number of rows to generate seed (int, optional): seeding the pseudo-number generation. Defaults to 100. - title_col (str, optional): if not None, append a title column. Defaults to None. - genres_col (str, optional): if not None, append a genre column. Defaults to None. - schema (pyspark.sql.types.StructType, optional): dataset schema. Defaults to None. + keep_title_col (bool): remove the title column if False. Defaults to False. + keep_genre_col (bool): remove the genre column if False. Defaults to False. 
Returns: pyspark.sql.DataFrame: a mock dataset """ - pandas_df = cls.get_df(size=size, seed=seed) + pandas_df = cls.get_df(size=size, seed=seed, keep_title_col=True, keep_genre_col=True) + # serialize the pandas.df to avoid the expensive java <-> python communication pandas_df.to_csv('test.csv', header=False, index=False) - default_schema = StructType([ + + deserialization_schema = StructType([ StructField(DEFAULT_USER_COL, IntegerType()), StructField(DEFAULT_ITEM_COL, IntegerType()), StructField(DEFAULT_RATING_COL, FloatType()), StructField(DEFAULT_TIMESTAMP_COL, StringType()), - StructField("title", StringType()), - StructField("genres", StringType()), + StructField(DEFAULT_TITLE_COL, StringType()), + StructField(DEFAULT_GENRE_COL, StringType()), ]) - return spark.read.csv('test.csv', schema=default_schema) - - # @classmethod - # def _get_item_df(cls, size, title_col: Optional[str] = None, genres_col: Optional[str] = None): - # schema = DataFrameSchema() # create an empty schema - # if title_col is not None: - # # adds a title column with random alphabets - # schema = schema.add_columns({title_col: pa.Column(str, Check.str_matches(r'^[a-z]+$'))}) - # if genres_col is not None: - # # adds a genre column with '|' separated string - # schema = schema.add_columns({genres_col: pa.Column(str, Check.str_matches(r'^[a-z]+\|[0-9]$'))}) - # schema.example() \ No newline at end of file + spark_df = spark.read.csv('test.csv', schema=deserialization_schema) + + if not keep_title_col: + spark_df = spark_df.drop(DEFAULT_TITLE_COL) + if not keep_genre_col: + spark_df = spark_df.drop(DEFAULT_GENRE_COL) + return spark_df diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py index 863578902b..c47865afcb 100644 --- a/recommenders/datasets/movielens.py +++ b/recommenders/datasets/movielens.py @@ -7,14 +7,12 @@ import warnings import pandas as pd from zipfile import ZipFile -from recommenders.datasets.mock.movielens import MockMovielens100kSchema +from recommenders.datasets.mock.movielens import MockMovielensSchema from recommenders.datasets.download_utils import maybe_download, download_path from recommenders.utils.notebook_utils import is_databricks from recommenders.utils.constants import ( - DEFAULT_USER_COL, + DEFAULT_HEADER, DEFAULT_ITEM_COL, - DEFAULT_RATING_COL, - DEFAULT_TIMESTAMP_COL, ) try: @@ -130,12 +128,6 @@ def item_has_header(self): "Western", ) -DEFAULT_HEADER = ( - DEFAULT_USER_COL, - DEFAULT_ITEM_COL, - DEFAULT_RATING_COL, - DEFAULT_TIMESTAMP_COL, -) # Warning and error messages WARNING_MOVIE_LENS_HEADER = """MovieLens rating dataset has four columns @@ -197,10 +189,6 @@ def load_pandas_df( if size not in DATA_FORMAT and size not in MOCK_DATA_FORMAT: raise ValueError(ERROR_MOVIE_LENS_SIZE) - if size in MOCK_DATA_FORMAT: - # generate fake data using the dictionary as a kwarg to the generation function - return MockMovielens100kSchema.get_df(**MOCK_DATA_FORMAT[size]) - if header is None: header = DEFAULT_HEADER elif len(header) < 2: @@ -209,6 +197,15 @@ def load_pandas_df( warnings.warn(WARNING_MOVIE_LENS_HEADER) header = header[:4] + if size in MOCK_DATA_FORMAT: + # generate fake data + return MockMovielensSchema.get_df( + keep_first_n_cols=len(header), + keep_title_col=(title_col is not None), + keep_genre_col=(genres_col is not None), + **MOCK_DATA_FORMAT[size] # supply the rest of the kwarg with the dictionary + ) + movie_col = header[1] with download_path(local_cache_path) as path: @@ -368,9 +365,9 @@ def load_spark_df( schema* (pyspark.StructType): Dataset 
schema. local_cache_path* (str): Path (directory or a zip file) to cache the downloaded zip file. If None, all the intermediate files will be stored in a temporary directory and removed after use. - dbutils* (Databricks.dbutils): Databricks utility object - title_col* (str): Title column name. If None, the column will not be loaded. - genres_col* (str): Genres column name. Genres are '|' separated string. + dbutils (Databricks.dbutils): Databricks utility object + title_col (str): Title column name. If None, the column will not be loaded. + genres_col (str): Genres column name. Genres are '|' separated string. If None, the column will not be loaded. year_col* (str): Movie release year column name. If None, the column will not be loaded. @@ -413,8 +410,13 @@ def load_spark_df( raise ValueError(ERROR_MOVIE_LENS_SIZE) if size in MOCK_DATA_FORMAT: - # generate fake data using the dictionary as a kwarg to the generation function - return MockMovielens100kSchema.get_spark_df(spark, **MOCK_DATA_FORMAT[size]) + # generate fake data + return MockMovielensSchema.get_spark_df( + spark, + keep_title_col=(title_col is not None), + keep_genre_col=(genres_col is not None), + **MOCK_DATA_FORMAT[size] # supply the rest of the kwarg with the dictionary + ) schema = _get_schema(header, schema) if len(schema) < 2: diff --git a/recommenders/utils/constants.py b/recommenders/utils/constants.py index 0e7ed34a9e..e24a58d725 100644 --- a/recommenders/utils/constants.py +++ b/recommenders/utils/constants.py @@ -6,6 +6,8 @@ DEFAULT_ITEM_COL = "itemID" DEFAULT_RATING_COL = "rating" DEFAULT_LABEL_COL = "label" +DEFAULT_TITLE_COL = "title" +DEFAULT_GENRE_COL = "genre" DEFAULT_RELEVANCE_COL = "relevance" DEFAULT_TIMESTAMP_COL = "timestamp" DEFAULT_PREDICTION_COL = "prediction" @@ -13,6 +15,13 @@ DEFAULT_ITEM_FEATURES_COL = "features" DEFAULT_ITEM_SIM_MEASURE = "item_cooccurrence_count" +DEFAULT_HEADER = ( + DEFAULT_USER_COL, + DEFAULT_ITEM_COL, + DEFAULT_RATING_COL, + DEFAULT_TIMESTAMP_COL, +) + COL_DICT = { "col_user": DEFAULT_USER_COL, "col_item": DEFAULT_ITEM_COL, diff --git a/setup.py b/setup.py index 7aef19fe52..3bcdb23f05 100644 --- a/setup.py +++ b/setup.py @@ -75,9 +75,9 @@ ], "dev": [ "black>=18.6b4,<21", + "pandera[strategies]>=0.6.5", # For generating fake datasets "pytest>=3.6.4", "pytest-cov>=2.12.1", - "pytest-lazy-fixture>=0.6.3", # Allow using fixtures in pytest.mark.parametrize ], } # for the brave of heart diff --git a/tests/unit/examples/test_notebooks_python.py b/tests/unit/examples/test_notebooks_python.py index da9e65d214..7b2de60ef5 100644 --- a/tests/unit/examples/test_notebooks_python.py +++ b/tests/unit/examples/test_notebooks_python.py @@ -102,7 +102,7 @@ def test_wikidata_runs(notebooks, output_notebook, kernel_name, tmp): @pytest.mark.notebooks @pytest.mark.mock_movielens -@pytest.mark.parametrize("data_size", ["100k", "mock100"]) +@pytest.mark.parametrize("data_size", ["10m", "mock100"]) def test_rlrmc_quickstart_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["rlrmc_quickstart"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, diff --git a/tests/unit/recommenders/datasets/mock/test_movielens.py b/tests/unit/recommenders/datasets/mock/test_movielens.py index 446f5cd75e..ae5ea765de 100644 --- a/tests/unit/recommenders/datasets/mock/test_movielens.py +++ b/tests/unit/recommenders/datasets/mock/test_movielens.py @@ -1,72 +1,66 @@ -from recommenders.datasets.mock.movielens import MockMovielens100kSchema +from 
recommenders.datasets.mock.movielens import MockMovielensSchema from recommenders.datasets.movielens import DEFAULT_HEADER from recommenders.utils.constants import ( - DEFAULT_USER_COL, - DEFAULT_ITEM_COL, - DEFAULT_RATING_COL, - DEFAULT_TIMESTAMP_COL, + DEFAULT_GENRE_COL, + DEFAULT_TITLE_COL, ) import pytest import pandas import pyspark.sql from pyspark.sql import SparkSession -from pyspark.sql.types import IntegerType, FloatType, LongType, StructField, StructType - - -@pytest.fixture(scope="module") -def default_schema(): - return StructType([ - StructField(DEFAULT_USER_COL, IntegerType()), - StructField(DEFAULT_ITEM_COL, IntegerType()), - StructField(DEFAULT_RATING_COL, FloatType()), - StructField(DEFAULT_TIMESTAMP_COL, LongType()), - ]) - - -@pytest.fixture(scope="module") -def custom_schema(): - return StructType([ - StructField("userID", IntegerType()), - StructField("itemID", IntegerType()), - StructField("rating", FloatType()), - ]) @pytest.mark.parametrize("size", [10, 100]) def test_mock_movielens_schema__has_default_col_names(size): - df = MockMovielens100kSchema.example(size=size) + df = MockMovielensSchema.example(size=size) for col_name in DEFAULT_HEADER: assert col_name in df.columns +@pytest.mark.parametrize("keep_first_n_cols", [1, 2, 3, 4]) +def test_mock_movielens_schema__get_df_remove_default_col__return_success(keep_first_n_cols): + df = MockMovielensSchema.get_df(size=3, keep_first_n_cols=keep_first_n_cols) + assert len(df) > 0 + assert len(df.columns) == keep_first_n_cols + + +@pytest.mark.parametrize("keep_first_n_cols", [-1, 0, 100]) +def test_mock_movielens_schema__get_df_invalid_param__return_failure(keep_first_n_cols): + with pytest.raises(ValueError, match=r"Invalid value.*"): + MockMovielensSchema.get_df(size=3, keep_first_n_cols=keep_first_n_cols) + + +@pytest.mark.parametrize("keep_genre_col", [True, False]) +@pytest.mark.parametrize("keep_title_col", [True, False]) +@pytest.mark.parametrize("keep_first_n_cols", [None, 2]) @pytest.mark.parametrize("seed", [-1]) # seed for pseudo-random # generation @pytest.mark.parametrize("size", [0, 3, 10]) -def test_mock_movielens_schema__get_df__return_success(size, seed): - df = MockMovielens100kSchema.get_df(size, seed=seed) +def test_mock_movielens_schema__get_df__return_success(size, seed, keep_first_n_cols, keep_title_col, keep_genre_col): + df = MockMovielensSchema.get_df( + size=size, seed=seed, + keep_first_n_cols=keep_first_n_cols, + keep_title_col=keep_title_col, keep_genre_col=keep_genre_col + ) assert type(df) == pandas.DataFrame assert len(df) == size - -@pytest.mark.parametrize("seed", [0, 101]) # seed for pseudo-random # generation -@pytest.mark.parametrize("size", [3, 10]) -def test_mock_movielens_schema__get_spark_df__return_success(spark: SparkSession, size, seed): - df = MockMovielens100kSchema.get_spark_df(spark, size, seed=seed) - assert type(df) == pyspark.sql.DataFrame - assert df.count() == size + if keep_title_col: + assert len(df[DEFAULT_TITLE_COL]) == size + if keep_genre_col: + assert len(df[DEFAULT_GENRE_COL]) == size -@pytest.mark.parametrize("schema", [ - None, - pytest.lazy_fixture('default_schema'), - pytest.lazy_fixture('custom_schema') -]) -def test_mock_movielens_schema__get_spark_df__with_custom_schema_return_success(spark: SparkSession, schema): - df = MockMovielens100kSchema.get_spark_df(spark, schema=schema) +@pytest.mark.parametrize("keep_genre_col", [True, False]) +@pytest.mark.parametrize("keep_title_col", [True, False]) +@pytest.mark.parametrize("seed", [101]) # seed for 
pseudo-random # generation +@pytest.mark.parametrize("size", [0, 3, 10]) +def test_mock_movielens_schema__get_spark_df__return_success(spark: SparkSession, size, seed, keep_title_col, keep_genre_col): + df = MockMovielensSchema.get_spark_df(spark, size=size, seed=seed, keep_title_col=keep_title_col, keep_genre_col=keep_genre_col) assert type(df) == pyspark.sql.DataFrame - assert df.count() >= 0 - + assert df.count() == size -def test_mock_movielens_schema__get_spark_df__fail_on_empty_rows(spark: SparkSession): - with pytest.raises(ValueError, match="can not infer schema from empty dataset.*"): - MockMovielens100kSchema.get_spark_df(spark, 0) + if keep_title_col: + assert df.schema[DEFAULT_TITLE_COL] + if keep_genre_col: + assert df.schema[DEFAULT_GENRE_COL] diff --git a/tests/unit/recommenders/datasets/test_movielens.py b/tests/unit/recommenders/datasets/test_movielens.py index f05cc24882..d53f5d594c 100644 --- a/tests/unit/recommenders/datasets/test_movielens.py +++ b/tests/unit/recommenders/datasets/test_movielens.py @@ -1,7 +1,10 @@ from recommenders.datasets.movielens import DATA_FORMAT, MOCK_DATA_FORMAT from recommenders.datasets.movielens import load_pandas_df, load_spark_df +from recommenders.utils.constants import DEFAULT_GENRE_COL, DEFAULT_TITLE_COL import pyspark.sql +import pandas +from pandas.core.series import Series from pyspark.sql import SparkSession @@ -15,7 +18,31 @@ def test_mock_movielens_data__no_name_collision(): assert not collision -def test_mock_movielens_data_generation_succeed(spark: SparkSession): +def test_load_spark_df_mock_100__with_default_param__succeed(spark: SparkSession): df = load_spark_df(spark, "mock100") assert type(df) == pyspark.sql.DataFrame assert df.count() == 100 + + +def test_load_pandas_df_mock_100__with_default_param__succeed(): + df = load_pandas_df("mock100") + assert type(df) == pandas.DataFrame + assert len(df) == 100 + + +def test_load_spark_df_mock_100__with_custom_param__succeed(spark: SparkSession): + df = load_spark_df(spark, "mock100", title_col=DEFAULT_TITLE_COL, genres_col=DEFAULT_GENRE_COL) + assert df.schema[DEFAULT_TITLE_COL] + assert df.schema[DEFAULT_GENRE_COL] + assert df.count() == 100 + assert '|' in df.take(1)[0][DEFAULT_GENRE_COL] + assert df.take(1)[0][DEFAULT_TITLE_COL] == 'foo' + + +def test_load_pandas_df_mock_100__with_custom_param__succeed(spark: SparkSession): + df = load_pandas_df("mock100", title_col=DEFAULT_TITLE_COL, genres_col=DEFAULT_GENRE_COL) + assert type(df[DEFAULT_TITLE_COL]) == Series + assert type(df[DEFAULT_GENRE_COL]) == Series + assert len(df) == 100 + assert '|' in df.loc[0, DEFAULT_GENRE_COL] + assert df.loc[0, DEFAULT_TITLE_COL] == 'foo' From eb939b8f0b09431327d0eee31ebf259f471809fb Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 24 Sep 2021 00:28:54 +0000 Subject: [PATCH 09/27] Don't use 100k dataset --- tests/unit/examples/test_notebooks_pyspark.py | 6 +++--- tests/unit/examples/test_notebooks_python.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/unit/examples/test_notebooks_pyspark.py b/tests/unit/examples/test_notebooks_pyspark.py index 0e3fb41e09..f25dbe388c 100644 --- a/tests/unit/examples/test_notebooks_pyspark.py +++ b/tests/unit/examples/test_notebooks_pyspark.py @@ -34,7 +34,7 @@ def test_data_split_runs(notebooks, output_notebook, kernel_name): @pytest.mark.skipif( sys.platform == "win32", reason="Takes 2764.50s in Windows, while in Linux 124.35s" ) -@pytest.mark.parametrize("data_size", ["100k", "mock100"]) 
+@pytest.mark.parametrize("data_size", ["mock100"]) def test_als_deep_dive_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["als_deep_dive"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, @@ -59,7 +59,7 @@ def test_evaluation_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks @pytest.mark.spark @pytest.mark.mock_movielens -@pytest.mark.parametrize("data_size", ["100k", "mock100"]) +@pytest.mark.parametrize("data_size", ["mock100"]) def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["evaluation_diversity"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, @@ -72,7 +72,7 @@ def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name, data @pytest.mark.skipif( sys.platform == "win32", reason="Takes 2409.69s in Windows, while in Linux 138.30s" ) -@pytest.mark.parametrize("data_size", ["100k", "mock100"]) +@pytest.mark.parametrize("data_size", ["mock100"]) def test_spark_tuning(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["spark_tuning"] pm.execute_notebook( diff --git a/tests/unit/examples/test_notebooks_python.py b/tests/unit/examples/test_notebooks_python.py index 7b2de60ef5..0d809ee51e 100644 --- a/tests/unit/examples/test_notebooks_python.py +++ b/tests/unit/examples/test_notebooks_python.py @@ -51,7 +51,7 @@ def test_baseline_deep_dive_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks @pytest.mark.mock_movielens -@pytest.mark.parametrize("data_size", ["100k", "mock100"]) +@pytest.mark.parametrize("data_size", ["mock100"]) def test_surprise_deep_dive_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["surprise_svd_deep_dive"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, @@ -102,7 +102,7 @@ def test_wikidata_runs(notebooks, output_notebook, kernel_name, tmp): @pytest.mark.notebooks @pytest.mark.mock_movielens -@pytest.mark.parametrize("data_size", ["10m", "mock100"]) +@pytest.mark.parametrize("data_size", ["mock100"]) def test_rlrmc_quickstart_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["rlrmc_quickstart"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, From 581c1ed00ec3c07517d35afc7ac4df81066a2886 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 24 Sep 2021 13:27:41 +0000 Subject: [PATCH 10/27] Re-wire local import to minimize module-wide dependency --- recommenders/datasets/mock/movielens.py | 2 +- recommenders/datasets/movielens.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/recommenders/datasets/mock/movielens.py b/recommenders/datasets/mock/movielens.py index 44c5acc221..c8ddf6a80c 100644 --- a/recommenders/datasets/mock/movielens.py +++ b/recommenders/datasets/mock/movielens.py @@ -7,7 +7,7 @@ try: import pandera as pa except ImportError as e: - raise ImportError("Pandera not installed. Try `pip install recommender['dev']`") from e + raise ImportError("pandera is not installed. 
Try `pip install recommenders['dev']`") from e from recommenders.utils.constants import ( DEFAULT_USER_COL, diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py index c47865afcb..60f33a5e92 100644 --- a/recommenders/datasets/movielens.py +++ b/recommenders/datasets/movielens.py @@ -7,7 +7,6 @@ import warnings import pandas as pd from zipfile import ZipFile -from recommenders.datasets.mock.movielens import MockMovielensSchema from recommenders.datasets.download_utils import maybe_download, download_path from recommenders.utils.notebook_utils import is_databricks from recommenders.utils.constants import ( @@ -198,6 +197,8 @@ def load_pandas_df( header = header[:4] if size in MOCK_DATA_FORMAT: + # function-wide import to isolate extra dependencies from the mock schema will use + from recommenders.datasets.mock.movielens import MockMovielensSchema # generate fake data return MockMovielensSchema.get_df( keep_first_n_cols=len(header), @@ -410,6 +411,8 @@ def load_spark_df( raise ValueError(ERROR_MOVIE_LENS_SIZE) if size in MOCK_DATA_FORMAT: + # function-wide import to isolate extra dependencies from the mock schema will use + from recommenders.datasets.mock.movielens import MockMovielensSchema # generate fake data return MockMovielensSchema.get_spark_df( spark, From 5862f257a6f172544099aba0f418a9f8da7f201d Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 24 Sep 2021 14:46:10 +0000 Subject: [PATCH 11/27] Runnable in non-spark env --- recommenders/datasets/mock/movielens.py | 19 ++++++++++++------- .../datasets/mock/test_movielens.py | 6 ++---- .../recommenders/datasets/test_movielens.py | 12 ++++++------ 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/recommenders/datasets/mock/movielens.py b/recommenders/datasets/mock/movielens.py index c8ddf6a80c..e71da13f41 100644 --- a/recommenders/datasets/mock/movielens.py +++ b/recommenders/datasets/mock/movielens.py @@ -9,6 +9,11 @@ except ImportError as e: raise ImportError("pandera is not installed. Try `pip install recommenders['dev']`") from e +try: + from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType +except ImportError: + pass # so the environment without spark doesn't break + from recommenders.utils.constants import ( DEFAULT_USER_COL, DEFAULT_ITEM_COL, @@ -23,11 +28,8 @@ from typing import Optional import pandas -import pyspark.sql from pandera.typing import Series from pandera import Field -from pyspark.sql import SparkSession -from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType class MockMovielensSchema(pa.SchemaModel): @@ -36,10 +38,12 @@ class MockMovielensSchema(pa.SchemaModel): This schema is configured to mimic the Movielens dataset http://files.grouplens.org/datasets/movielens/ml-100k/ + + Dataset schema and generation is configured using pandera. + Please see https://pandera.readthedocs.io/en/latest/schema_models.html + for more information. 
""" - # The 100k dataset has 943 total users userID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) - # And 1682 total items itemID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) # Rating is on the scale from 1 to 5 rating: Series[float] = Field(in_range={"min_value": 1, "max_value": 5}) @@ -77,15 +81,16 @@ def get_df( schema = schema.remove_columns([DEFAULT_GENRE_COL]) random.seed(seed) + # For more information on data synthesis, see https://pandera.readthedocs.io/en/latest/data_synthesis_strategies.html return schema.example(size=size) @classmethod def get_spark_df( cls, - spark: SparkSession, + spark, size: int = 3, seed: int = 100, keep_title_col: bool = False, keep_genre_col: bool = False, - ) -> pyspark.sql.DataFrame: + ): """Return fake movielens dataset as a Spark Dataframe with specified rows Args: diff --git a/tests/unit/recommenders/datasets/mock/test_movielens.py b/tests/unit/recommenders/datasets/mock/test_movielens.py index ae5ea765de..e8a6e5f8be 100644 --- a/tests/unit/recommenders/datasets/mock/test_movielens.py +++ b/tests/unit/recommenders/datasets/mock/test_movielens.py @@ -7,8 +7,6 @@ import pytest import pandas -import pyspark.sql -from pyspark.sql import SparkSession @pytest.mark.parametrize("size", [10, 100]) @@ -51,13 +49,13 @@ def test_mock_movielens_schema__get_df__return_success(size, seed, keep_first_n_ assert len(df[DEFAULT_GENRE_COL]) == size +@pytest.mark.spark @pytest.mark.parametrize("keep_genre_col", [True, False]) @pytest.mark.parametrize("keep_title_col", [True, False]) @pytest.mark.parametrize("seed", [101]) # seed for pseudo-random # generation @pytest.mark.parametrize("size", [0, 3, 10]) -def test_mock_movielens_schema__get_spark_df__return_success(spark: SparkSession, size, seed, keep_title_col, keep_genre_col): +def test_mock_movielens_schema__get_spark_df__return_success(spark, size, seed, keep_title_col, keep_genre_col): df = MockMovielensSchema.get_spark_df(spark, size=size, seed=seed, keep_title_col=keep_title_col, keep_genre_col=keep_genre_col) - assert type(df) == pyspark.sql.DataFrame assert df.count() == size if keep_title_col: diff --git a/tests/unit/recommenders/datasets/test_movielens.py b/tests/unit/recommenders/datasets/test_movielens.py index d53f5d594c..ddba43a580 100644 --- a/tests/unit/recommenders/datasets/test_movielens.py +++ b/tests/unit/recommenders/datasets/test_movielens.py @@ -2,10 +2,9 @@ from recommenders.datasets.movielens import load_pandas_df, load_spark_df from recommenders.utils.constants import DEFAULT_GENRE_COL, DEFAULT_TITLE_COL -import pyspark.sql import pandas +import pytest from pandas.core.series import Series -from pyspark.sql import SparkSession def test_mock_movielens_data__no_name_collision(): @@ -18,9 +17,9 @@ def test_mock_movielens_data__no_name_collision(): assert not collision -def test_load_spark_df_mock_100__with_default_param__succeed(spark: SparkSession): +@pytest.mark.spark +def test_load_spark_df_mock_100__with_default_param__succeed(spark): df = load_spark_df(spark, "mock100") - assert type(df) == pyspark.sql.DataFrame assert df.count() == 100 @@ -30,7 +29,8 @@ def test_load_pandas_df_mock_100__with_default_param__succeed(): assert len(df) == 100 -def test_load_spark_df_mock_100__with_custom_param__succeed(spark: SparkSession): +@pytest.mark.spark +def test_load_spark_df_mock_100__with_custom_param__succeed(spark): df = load_spark_df(spark, "mock100", title_col=DEFAULT_TITLE_COL, genres_col=DEFAULT_GENRE_COL) assert df.schema[DEFAULT_TITLE_COL] assert 
df.schema[DEFAULT_GENRE_COL] @@ -39,7 +39,7 @@ def test_load_spark_df_mock_100__with_custom_param__succeed(spark: SparkSession) assert df.take(1)[0][DEFAULT_TITLE_COL] == 'foo' -def test_load_pandas_df_mock_100__with_custom_param__succeed(spark: SparkSession): +def test_load_pandas_df_mock_100__with_custom_param__succeed(): df = load_pandas_df("mock100", title_col=DEFAULT_TITLE_COL, genres_col=DEFAULT_GENRE_COL) assert type(df[DEFAULT_TITLE_COL]) == Series assert type(df[DEFAULT_GENRE_COL]) == Series From eca5abf8161c5d2c8b4355a06f2b0d241fed6ac5 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 24 Sep 2021 14:51:40 +0000 Subject: [PATCH 12/27] Rename test marker to fake_movielens --- tests/unit/examples/test_notebooks_pyspark.py | 6 +++--- tests/unit/examples/test_notebooks_python.py | 4 ++-- tox.ini | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/unit/examples/test_notebooks_pyspark.py b/tests/unit/examples/test_notebooks_pyspark.py index f25dbe388c..46691f3885 100644 --- a/tests/unit/examples/test_notebooks_pyspark.py +++ b/tests/unit/examples/test_notebooks_pyspark.py @@ -30,7 +30,7 @@ def test_data_split_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks @pytest.mark.spark -@pytest.mark.mock_movielens +@pytest.mark.fake_movielens @pytest.mark.skipif( sys.platform == "win32", reason="Takes 2764.50s in Windows, while in Linux 124.35s" ) @@ -58,7 +58,7 @@ def test_evaluation_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks @pytest.mark.spark -@pytest.mark.mock_movielens +@pytest.mark.fake_movielens @pytest.mark.parametrize("data_size", ["mock100"]) def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["evaluation_diversity"] @@ -68,7 +68,7 @@ def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name, data @pytest.mark.notebooks @pytest.mark.spark -@pytest.mark.mock_movielens +@pytest.mark.fake_movielens @pytest.mark.skipif( sys.platform == "win32", reason="Takes 2409.69s in Windows, while in Linux 138.30s" ) diff --git a/tests/unit/examples/test_notebooks_python.py b/tests/unit/examples/test_notebooks_python.py index 0d809ee51e..021d80fdc3 100644 --- a/tests/unit/examples/test_notebooks_python.py +++ b/tests/unit/examples/test_notebooks_python.py @@ -50,7 +50,7 @@ def test_baseline_deep_dive_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks -@pytest.mark.mock_movielens +@pytest.mark.fake_movielens @pytest.mark.parametrize("data_size", ["mock100"]) def test_surprise_deep_dive_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["surprise_svd_deep_dive"] @@ -101,7 +101,7 @@ def test_wikidata_runs(notebooks, output_notebook, kernel_name, tmp): @pytest.mark.notebooks -@pytest.mark.mock_movielens +@pytest.mark.fake_movielens @pytest.mark.parametrize("data_size", ["mock100"]) def test_rlrmc_quickstart_runs(notebooks, output_notebook, kernel_name, data_size): notebook_path = notebooks["rlrmc_quickstart"] diff --git a/tox.ini b/tox.ini index bfb0b68833..7ede574a2e 100644 --- a/tox.ini +++ b/tox.ini @@ -66,7 +66,7 @@ markers = gpu: mark a test as gpu test spark: mark a test as spark test vw: mark a test as vowpal wabbit test - mock_movielens: mark a test that uses the mock dataset instead of real dataset + fake_movielens: mark a test that uses the fake dataset instead testpaths = tests addopts = From 760078489d0939cf47faf538e0946e9ff7970dd1 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 24 Sep 
2021 16:52:09 +0000 Subject: [PATCH 13/27] Re-render diversity_metric NB outputs --- .../als_movielens_diversity_metrics.ipynb | 287 +++++++++++++++--- 1 file changed, 242 insertions(+), 45 deletions(-) diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index 38de757530..69db3dc8dd 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -142,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "source": [ "# set the environment path to find Recommenders\n", "%load_ext autoreload\n", @@ -174,7 +174,17 @@ "print(\"System version: {}\".format(sys.version))\n", "print(\"Spark version: {}\".format(pyspark.__version__))\n" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "System version: 3.6.9 (default, Jan 26 2021, 15:33:00) \n", + "[GCC 8.4.0]\n", + "Spark version: 2.4.8\n" + ] + } + ], "metadata": {} }, { @@ -187,13 +197,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "source": [ "# top k items to recommend\n", "TOP_K = 10\n", "\n", "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = 'mock10'\n", + "MOVIELENS_DATA_SIZE = '100k'\n", "\n", "# user, item column names\n", "COL_USER=\"UserId\"\n", @@ -220,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "source": [ "# the following settings work well for debugging locally on VM - change when running on a cluster\n", "# set up a giant single executor with many threads and specify memory cap\n", @@ -241,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "source": [ "# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n", "schema = StructType(\n", @@ -256,7 +266,47 @@ "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema, title_col=COL_TITLE, genres_col=COL_GENRE)\n", "data.show()" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 4.81k/4.81k [00:00<00:00, 15.6kKB/s]\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+------+------+------+---------+--------------------+------+\n", + "|ItemId|UserId|Rating|Timestamp| Title| Genre|\n", + "+------+------+------+---------+--------------------+------+\n", + "| 26| 138| 5.0|879024232|Brothers McMullen...|Comedy|\n", + "| 26| 224| 3.0|888104153|Brothers McMullen...|Comedy|\n", + "| 26| 18| 4.0|880129731|Brothers McMullen...|Comedy|\n", + "| 26| 222| 3.0|878183043|Brothers McMullen...|Comedy|\n", + "| 26| 43| 5.0|883954901|Brothers McMullen...|Comedy|\n", + "| 26| 201| 4.0|884111927|Brothers McMullen...|Comedy|\n", + "| 26| 299| 4.0|878192601|Brothers McMullen...|Comedy|\n", + "| 26| 95| 3.0|880571951|Brothers McMullen...|Comedy|\n", + "| 26| 89| 3.0|879459909|Brothers McMullen...|Comedy|\n", + "| 26| 361| 3.0|879440941|Brothers McMullen...|Comedy|\n", + "| 26| 194| 3.0|879522240|Brothers McMullen...|Comedy|\n", + "| 26| 391| 5.0|877399745|Brothers McMullen...|Comedy|\n", + "| 26| 345| 3.0|884993555|Brothers McMullen...|Comedy|\n", + "| 26| 303| 4.0|879468307|Brothers McMullen...|Comedy|\n", + "| 26| 401| 3.0|891033395|Brothers McMullen...|Comedy|\n", + "| 26| 429| 3.0|882386333|Brothers McMullen...|Comedy|\n", + "| 26| 293| 3.0|888907015|Brothers 
McMullen...|Comedy|\n", + "| 26| 270| 5.0|876954995|Brothers McMullen...|Comedy|\n", + "| 26| 442| 3.0|883388576|Brothers McMullen...|Comedy|\n", + "| 26| 342| 2.0|875320037|Brothers McMullen...|Comedy|\n", + "+------+------+------+---------+--------------------+------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], "metadata": {} }, { @@ -268,13 +318,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "source": [ "train_df, test_df = spark_random_split(data.select(COL_USER, COL_ITEM, COL_RATING), ratio=0.75, seed=123)\n", "print (\"N train_df\", train_df.cache().count())\n", "print (\"N test_df\", test_df.cache().count())" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "N train_df 75066\n", + "N test_df 24934\n" + ] + } + ], "metadata": {} }, { @@ -293,7 +352,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "source": [ "users = train_df.select(COL_USER).distinct()\n", "items = train_df.select(COL_ITEM).distinct()\n", @@ -314,7 +373,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "source": [ "header = {\n", " \"userCol\": COL_USER,\n", @@ -339,14 +398,22 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "source": [ "with Timer() as train_time:\n", " model = als.fit(train_df)\n", "\n", "print(\"Took {} seconds for training.\".format(train_time.interval))" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Took 4.189040212018881 seconds for training.\n" + ] + } + ], "metadata": {} }, { @@ -360,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "source": [ "# Score all user-item pairs\n", "dfs_pred = model.transform(user_item)\n", @@ -382,7 +449,16 @@ " \n", "print(top_k_reco.count())" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "1464853\n", + "9430\n" + ] + } + ], "metadata": {} }, { @@ -396,7 +472,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "source": [ "# random recommender\n", "window = Window.partitionBy(COL_USER).orderBy(F.rand())\n", @@ -430,7 +506,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "source": [ "def get_ranking_results(ranking_eval):\n", " metrics = {\n", @@ -457,7 +533,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "source": [ "def generate_summary(data, algo, k, ranking_metrics, diversity_metrics):\n", " summary = {\"Data\": data, \"Algo\": algo, \"K\": k}\n", @@ -485,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "source": [ "als_ranking_eval = SparkRankingEvaluation(\n", " test_df, \n", @@ -505,7 +581,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "source": [ "als_diversity_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -521,7 +597,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "source": [ "als_results = generate_summary(MOVIELENS_DATA_SIZE, \"als\", TOP_K, als_ranking_metrics, als_diversity_metrics)" ], @@ -537,7 +613,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 16, "source": [ "random_ranking_eval = SparkRankingEvaluation(\n", " test_df,\n", @@ -556,7 +632,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 17, "source": [ 
"random_diversity_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -572,7 +648,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "source": [ "random_results = generate_summary(MOVIELENS_DATA_SIZE, \"random\", TOP_K, random_ranking_metrics, random_diversity_metrics)" ], @@ -588,7 +664,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 19, "source": [ "cols = [\"Data\", \"Algo\", \"K\", \"Precision@k\", \"Recall@k\", \"NDCG@k\", \"Mean average precision\",\"catalog_coverage\", \"distributional_coverage\",\"novelty\", \"diversity\", \"serendipity\" ]\n", "df_results = pd.DataFrame(columns=cols)\n", @@ -601,11 +677,100 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 20, "source": [ "df_results" ], - "outputs": [], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DataAlgoKPrecision@kRecall@kNDCG@kMean average precisioncatalog_coveragedistributional_coveragenoveltydiversityserendipity
1100kals100.0472960.0160150.0430970.0045790.3857937.96725711.6597760.8922770.878733
2100krandom100.0165430.0055660.0163730.0014410.99448910.54185012.1364390.9226130.892511
\n", + "
" + ], + "text/plain": [ + " Data Algo K Precision@k Recall@k NDCG@k Mean average precision \\\n", + "1 100k als 10 0.047296 0.016015 0.043097 0.004579 \n", + "2 100k random 10 0.016543 0.005566 0.016373 0.001441 \n", + "\n", + " catalog_coverage distributional_coverage novelty diversity \\\n", + "1 0.385793 7.967257 11.659776 0.892277 \n", + "2 0.994489 10.541850 12.136439 0.922613 \n", + "\n", + " serendipity \n", + "1 0.878733 \n", + "2 0.892511 " + ] + }, + "metadata": {}, + "execution_count": 20 + } + ], "metadata": {} }, { @@ -626,7 +791,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 21, "source": [ "# Get movie features \"title\" and \"genres\"\n", "movies = (\n", @@ -642,7 +807,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 22, "source": [ "# tokenize \"title\" column\n", "title_tokenizer = Tokenizer(inputCol=COL_TITLE, outputCol=\"title_words\")\n", @@ -657,12 +822,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 23, "source": [ "# convert text input into feature vectors\n", "\n", "# step 1: perform HashingTF on column \"text\"\n", - "text_hasher = HashingTF(inputCol=\"text\", outputCol=\"text_features\", numFeatures=3)\n", + "text_hasher = HashingTF(inputCol=\"text\", outputCol=\"text_features\", numFeatures=1024)\n", "hashed_data = text_hasher.transform(clean_data)\n", "\n", "# step 2: fit a CountVectorizerModel from column \"genres\".\n", @@ -679,7 +844,30 @@ "\n", "feature_data.show(10, False)" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+------+---------------------------------------------+\n", + "|ItemId|features |\n", + "+------+---------------------------------------------+\n", + "|167 |(1043,[128,544,1025],[1.0,1.0,1.0]) |\n", + "|1343 |(1043,[38,300,1024],[1.0,1.0,1.0]) |\n", + "|1607 |(1043,[592,821,1024],[1.0,1.0,1.0]) |\n", + "|966 |(1043,[389,502,1028],[1.0,1.0,1.0]) |\n", + "|9 |(1043,[11,342,1014,1024],[1.0,1.0,1.0,1.0]) |\n", + "|1230 |(1043,[597,740,902,1025],[1.0,1.0,1.0,1.0]) |\n", + "|1118 |(1043,[702,1025],[1.0,1.0]) |\n", + "|673 |(1043,[169,690,1027,1040],[1.0,1.0,1.0,1.0]) |\n", + "|879 |(1043,[909,1026,1027,1034],[1.0,1.0,1.0,1.0])|\n", + "|66 |(1043,[256,1025,1028],[1.0,1.0,1.0]) |\n", + "+------+---------------------------------------------+\n", + "only showing top 10 rows\n", + "\n" + ] + } + ], "metadata": {} }, { @@ -691,16 +879,7 @@ }, { "cell_type": "code", - "execution_count": null, - "source": [ - "feature_data.count()" - ], - "outputs": [], - "metadata": {} - }, - { - "cell_type": "code", - "execution_count": null, + "execution_count": 24, "source": [ "als_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -716,12 +895,21 @@ "print(als_diversity)\n", "print(als_serendipity)" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.8738984131037538\n", + "0.8873467159479473\n" + ] + } + ], "metadata": {} }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "source": [ "random_eval = SparkDiversityEvaluation(\n", " train_df = train_df, \n", @@ -737,7 +925,16 @@ "print(random_diversity)\n", "print(random_serendipity)" ], - "outputs": [], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "0.8982144953920664\n", + "0.8941807579293202\n" + ] + } + ], "metadata": {} }, { From 04f1371705f6839d23fbf477528be14139bf4804 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 24 Sep 2021 
16:56:41 +0000 Subject: [PATCH 14/27] Re-render als_deep_dive NB outputs --- .../als_deep_dive.ipynb | 484 ++++++++++++------ 1 file changed, 318 insertions(+), 166 deletions(-) diff --git a/examples/02_model_collaborative_filtering/als_deep_dive.ipynb b/examples/02_model_collaborative_filtering/als_deep_dive.ipynb index a8b19a4d65..0d90bb65d4 100644 --- a/examples/02_model_collaborative_filtering/als_deep_dive.ipynb +++ b/examples/02_model_collaborative_filtering/als_deep_dive.ipynb @@ -2,32 +2,31 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "# Spark Collaborative Filtering (ALS) Deep Dive" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Spark MLlib provides a collaborative filtering algorithm that can be used for training a matrix factorization model, which predicts explicit or implicit ratings of users on items for recommendations.\n", "\n", "This notebook presents a deep dive into the Spark collaborative filtering algorithm." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 1 Matrix factorization algorithm\n", "\n", @@ -54,11 +53,11 @@ "Owing to the term of $q_{i}^{T}p_{u}$ the loss function is non-convex. Gradient descent method can be applied but this will incur expensive computations. An Alternating Least Square (ALS) algorithm was therefore developed to overcome this issue. \n", "\n", "The basic idea of ALS is to learn one of $q$ and $p$ at a time for optimization while keeping the other as constant. This makes the objective at each iteration convex and solvable. The alternating between $q$ and $p$ stops when there is convergence to the optimal. It is worth noting that this iterative computation can be parallelised and/or distributed, which makes the algorithm desirable for use cases where the dataset is large and thus the user-item rating matrix is super sparse (as is typical in recommendation scenarios). A comprehensive discussion of ALS and its distributed computation can be found [here](http://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf)." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 2 Spark Mllib implementation\n", "\n", @@ -67,29 +66,28 @@ "* The uniqueness of ALS implementation is that it distributes the matrix factorization model training by using \"Alternating Least Square\" method. \n", "* In the training method, there are parameters that can be selected to control the model performance.\n", "* Both explicit and implicit ratings are supported by Spark ALS model." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## 3 Spark ALS based MovieLens recommender\n", "\n", "In the following code, the MovieLens-100K dataset is used to illustrate the ALS algorithm in Spark." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "**Note**: This notebook requires a PySpark environment to run properly. Please follow the steps in [SETUP.md](https://github.com/Microsoft/Recommenders/blob/master/SETUP.md#dependencies-setup) to install the PySpark environment." 
- ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 1, "source": [ "# set the environment path to find Recommenders\n", "import sys\n", @@ -118,24 +116,31 @@ "print(\"System version: {}\".format(sys.version))\n", "print(\"Pandas version: {}\".format(pd.__version__))\n", "print(\"PySpark version: {}\".format(pyspark.__version__))" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "System version: 3.6.9 (default, Jan 26 2021, 15:33:00) \n", + "[GCC 8.4.0]\n", + "Pandas version: 1.1.5\n", + "PySpark version: 2.4.8\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Data column names" - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], + "execution_count": 2, "source": [ "MOVIELENS_DATA_SIZE = \"100k\"\n", "\n", @@ -144,13 +149,17 @@ "COL_RATING = \"Rating\"\n", "COL_PREDICTION = \"prediction\"\n", "COL_TIMESTAMP = \"Timestamp\"" - ] + ], + "outputs": [], + "metadata": { + "tags": [ + "parameters" + ] + } }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 3, "source": [ "schema = StructType(\n", " (\n", @@ -160,127 +169,153 @@ " StructField(COL_TIMESTAMP, LongType()),\n", " )\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Model hyper parameters - these parameters are selected with reference to the benchmarking results [here](http://mymedialite.net/examples/datasets.html)." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 4, "source": [ "RANK = 10\n", "MAX_ITER = 15\n", "REG_PARAM = 0.05" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Number of recommended items" - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 5, "source": [ "K = 10" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Initialize a Spark session." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 6, "source": [ "spark = start_or_get_spark(\"ALS Deep Dive\", memory=\"16g\")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.1 Load and prepare data" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Data is read from csv into a Spark DataFrame." 
- ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 7, "source": [ "dfs = movielens.load_spark_df(spark=spark, size=MOVIELENS_DATA_SIZE, schema=schema)" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "text": [ + "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.5kKB/s]\n" + ] + } + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": { - "scrolled": true - }, - "outputs": [], + "execution_count": 8, "source": [ "dfs.show(5)" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+------+-------+------+---------+\n", + "|UserId|MovieId|Rating|Timestamp|\n", + "+------+-------+------+---------+\n", + "| 196| 242| 3.0|881250949|\n", + "| 186| 302| 3.0|891717742|\n", + "| 22| 377| 1.0|878887116|\n", + "| 244| 51| 2.0|880606923|\n", + "| 166| 346| 1.0|886397596|\n", + "+------+-------+------+---------+\n", + "only showing top 5 rows\n", + "\n" + ] + } + ], + "metadata": { + "scrolled": true + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Data is then randomly split by 80-20 ratio for training and testing." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 9, "source": [ "dfs_train, dfs_test = spark_random_split(dfs, ratio=0.75, seed=42)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.2 Train a movielens model " - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "It is worth noting that Spark ALS model allows dropping cold users to favor a robust evaluation with the testing data. In case there are cold users, Spark ALS implementation allows users to drop cold users in order to make sure evaluations on the prediction results are sound." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 10, "source": [ "als = ALS(\n", " maxIter=MAX_ITER, \n", @@ -293,38 +328,38 @@ ")\n", "\n", "model = als.fit(dfs_train)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.3 Prediction with the model\n", "\n", "The trained model can be used to predict ratings with a given test data." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 11, "source": [ "dfs_pred = model.transform(dfs_test).drop(COL_RATING)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "With the prediction results, the model performance can be evaluated." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 12, "source": [ "evaluations = SparkRatingEvaluation(\n", " dfs_test, \n", @@ -342,20 +377,31 @@ " \"Explained variance score = {}\".format(evaluations.exp_var()),\n", " sep=\"\\n\"\n", ")" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "RMSE score = 0.9726930349322086\n", + "MAE score = 0.7565710909806911\n", + "R2 score = 0.24411065820407096\n", + "Explained variance score = 0.249700271662727\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Oftentimes ranking metrics are also of interest to data scientists. 
Note usually ranking metrics apply to the scenario of recommending a list of items. In our case, the recommended items should be different from those that have been rated by the users. " - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 13, "source": [ "# Get the cross join of all user-item pairs and score them.\n", "users = dfs_train.select(COL_USER).distinct()\n", @@ -374,13 +420,46 @@ " .select('pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' + \"prediction\")\n", "\n", "dfs_pred_final.show()" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+------+-------+----------+\n", + "|UserId|MovieId|prediction|\n", + "+------+-------+----------+\n", + "| 1| 587| 3.2763875|\n", + "| 1| 869| 1.996331|\n", + "| 1| 1208| 3.0924819|\n", + "| 1| 1677| 3.0549564|\n", + "| 2| 80| 2.2266486|\n", + "| 2| 303| 3.5071766|\n", + "| 2| 472| 2.4076686|\n", + "| 2| 582| 4.137449|\n", + "| 2| 838| 1.6214753|\n", + "| 2| 975| 2.7880914|\n", + "| 2| 1260| 3.155648|\n", + "| 2| 1325| 1.2494813|\n", + "| 2| 1381| 3.712147|\n", + "| 2| 1530| 2.04168|\n", + "| 3| 22| 2.5458775|\n", + "| 3| 57| 1.7472819|\n", + "| 3| 89| 3.85607|\n", + "| 3| 367| 3.2235723|\n", + "| 3| 1091| 1.5452085|\n", + "| 3| 1167| 3.5050836|\n", + "+------+-------+----------+\n", + "only showing top 20 rows\n", + "\n" + ] + } + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 14, "source": [ "evaluations = SparkRankingEvaluation(\n", " dfs_test, \n", @@ -399,11 +478,23 @@ " \"Mean average precision = {}\".format(evaluations.map_at_k()),\n", " sep=\"\\n\"\n", ")" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Precision@k = 0.03170731707317073\n", + "Recall@k = 0.012679519170565132\n", + "NDCG@k = 0.02914424248125332\n", + "Mean average precision = 0.0033674440032626088\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.4 Fine tune the model\n", "\n", @@ -416,48 +507,47 @@ "|`maxIters`|Maximum number of iterations|10|The more iterations the better the model converges to the optimal point.|\n", "\n", "It is always a good practice to start model building with default parameter values and then sweep the parameter in a range to find the optimal combination of parameters. The following parameter set is used for training ALS models for comparison study purposes." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 15, "source": [ "param_dict = {\n", " \"rank\": [10, 15, 20],\n", " \"regParam\": [0.001, 0.1, 1.0]\n", "}" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Generate a dictionary for each parameter combination which can then be fed into model training." - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 16, "source": [ "param_grid = generate_param_grid(param_dict)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Train models with parameters specified in the parameter grid. Evaluate the model with, for example, the RMSE metric, and then record the metrics for visualization." 
- ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 17, "source": [ "rmse_score = []\n", "\n", @@ -487,104 +577,166 @@ "\n", "rmse_score = [float('%.4f' % x) for x in rmse_score]\n", "rmse_score_array = np.reshape(rmse_score, (len(param_dict[\"rank\"]), len(param_dict[\"regParam\"]))) " - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 18, "source": [ "rmse_df = pd.DataFrame(data=rmse_score_array, index=pd.Index(param_dict[\"rank\"], name=\"rank\"), \n", " columns=pd.Index(param_dict[\"regParam\"], name=\"reg. parameter\"))" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 19, "source": [ "fig, ax = plt.subplots()\n", "sns.heatmap(rmse_df, cbar=False, annot=True, fmt=\".4g\")" - ] + ], + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "metadata": {}, + "execution_count": 19 + }, + { + "output_type": "display_data", + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXwAAAEGCAYAAABmXi5tAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAitElEQVR4nO3deXxU1d3H8c9vMlkhLAnIjmjBRxDRyiKIIFoFRFGkAloQcK2i4AIIWhXQtooIVn1wwaosPoJg2cSFpYWKCBIEAVksyKKBgLILJCSZnOePGQKBJKTKZEju9/16zYuZe86993dzX/nOzblnBnPOISIipZ8v0gWIiEjxUOCLiHiEAl9ExCMU+CIiHqHAFxHxCH+kCyjI3i5tNH2ohKo8Y0OkS5BfYf/gVpEuQX6FMk9PsoLadIUvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEf4I11AaZBw36NEN26B27+PA/1vP6k9uklL4m65A5yDQIDDY/+XwPrVRNWpS8LdD2PxCbicHDKmvkvWF/MBKPv0y1h8AgC+chXI3rieQyOeKNbjKs3atW3DqFFPE+Xz8fY7E3l+xOg87bVr1+DvY0ZRqXISe/fso2fvfmzblsZFF13A6FeeJbFcWQKBAM8+9wpTpswE4Mo2LRk+/EliYqJZvnw1d9/Tn0AgAMAVrVswcuQwoqP97N61h6uuvrnYj7k0iun0R/znXYI7dID00QNPao86vzExV3XFOQc5ATI/GU/O99/iO6cBMe175vbzVarOkSkvE1i/DN+5DYlp2x3MIDODI9New+3ZWZyHFTbmnIt0Dfna26XNmVlYPvz1G+Ey0inzwOP5Bj5x8ZCRDkBU7XMp88hQDjzUE1+1muAcOTu2YRWTKTd8DAce6oU7fDDP6mX6DyMrZRGZn80phqP59SrP2BDpEgrl8/lYt2Yh7TvcSmpqGksWf0yP2/qwbt2xuidNfIOPPp7HhAlTuLJNS3r16kbv2/tRr965OOfYuHEz1apVYemST2jYqA0HDvzMpo1Ladu+Gxs2bGLokAFs3ZrKO2MnUb58ORZ+NoPrru/ODz9sp3LlZH76aXcEfwKF2z+4VaRLKDLf2edDZgaxne/PN/CJiYXMIwBYldrEdX2Q9Ff65+0TX4aEB1/i8Mg+kJVJfL8XyXhvBG7XdvxNr8FXsy6Z014rhqM5Pco8PckKatOQzmmQvW4V7uDPBXcIhT0AcXHBK30gJy2VnB3bAHB7d5Ozfy9WrnzedeMT8De8hMyUz0932Z7VrOlv+e67LWze/D1ZWVlMnjyDGzq2y9Onfv16zJ+/CID5CxZxQ8e2AGzYsImNGzcDkJa2kx9/2k3lyskkJ1ckMzOTDRs2ATBv3md0vqkDALfechPTp3/CDz9sBzijw76kydm6Hpd+qOAOobAHsJjYfLv4GzQnsOFryMoMLXFYXPCva4tLwP289zRVG3kK/GIS3exyyv1tPGUfe45Drw0/qT2q7vmYP5qcndvzLI9pejnZ3yyH9MPFVWqpV71GVX5IPfZzTt2WRvXqVfP0WbVqLTd1uhaATp2upVy5RJKSKubp07TJxcTERPPdd1vYtWsPfr+fxpc0AqBz5+uoWas6APXqnUuFCuX559wpfLnkE3r00HBOcYqq35T4viOJ6z6II9NfP6ndf2ELsld/kfv6yIwxxPUYRHz/0fgvakXWwhnFWW5YKfCLSdbSzznwUE8OPf8E8d3uzNNmFZIo0/dxDr06PPfq/6iYy39H5uf/LM5SBXh00DO0bt2clKWzad2qOampabnj8QBVq57F2LEvc9ddj3B0WLR7jz6MfGEoixfN4uDBQwQCOQD4/VE0vqQRHW/sSYfr/sCfHnuIevXOjchxeVFgXQrpr/QnY+ILxFzVNU+bla2Ar0ptAhtX5i6LbtGBjHeHkz7yfrJXLCCm/W3FXXLYhCXwzay8mT1nZuvNbI+Z7TazdaFlFQpZ7x4zW2Zmy8Zu2l5QtxIte90qfFWqYYmhoZv4BMo+9hzpE98isGFtnr6WWJ6ouueTtXxJBCotvbZv20GtmtVzX9esUY3t23fk6ZOWtpMuXe+mabN2PPlU8C+y/fsPAJCYWJaZM8bz5FPD+XLp8tx1lnz5FW2u6kyLltezcOGS3OGdbdvSmDN3AYcPp7N7914Wfr6ERo
0ahPsw5QQ5W9djFc+ChMTcZVENW5C9LgVyQm/mCYn4qp5NTupGALK/WUxUrfMiUW5YhOsKfzKwF2jjnEtyziUDV4aWTS5oJefcGOdcE+dck97nVi+oW4njq1oj93nUOfWw6Gjcz/vB76fswGfI/Pccspb8+6T1optfQdZXi48bW5TTIWXZ19Stew516tQiOjqarl1v5MNZeW+IJydXxCx472vwoL6MHTcJgOjoaP4x5S3effcDpk79KM86lSsnAxATE8PAAfczZswEAGZ+OJuWlzUjKiqK+Pg4mjX7LevXn9k3tksLS6qS+9xXrQ74o+Hwsftt/gsvI3v1omMrZBzCYuOx5GoARP2mETk/bSuucsMuXNMy6zjn8gxUO+d2AMPN7I4w7TNiyjz4JP4LLsYSy1P+9SmkT34HooI/2sy5M4m+tDWxV7TFBQKQeYSDLz4NQEyLK/HXvwhLLE/Mle0BODz6OQJbglcXMS2vImP6e5E5qFIsEAjw4ENP8PFH7xHl8zF23PusXfsfhg4ZwLKvVjJr1lyuuOIy/vLMYzgcCxcuoW+/PwHQpUtHWrW6lKTkivTsGRweuPOuh1m5cg0DHrmPDtddjc/n4403xjN/QTBI1q/fyOw581mxfB45OTm8/fZE1qz5NmLHX5rE3twX3zkNsIRE4vuPJmv+B+CLAiB72Tz8DS7Ff3Gr4O9ediZHJr+Uu65VqIyVTyZny7pjG8zJ4cjMN4m75eHgUF36oXzH/UuqsEzLNLM5wDxgnHNuZ2hZFaA3cI1z7upTbaMkTcuUvM70aZlSuJI0LVNOFolpmd2AZODfoTH8PcACIAnoEqZ9iohIIcIypOOc2wsMCj3yMLPbgXfCsV8RESlYJKZlDovAPkVEPC8sV/hmtqqgJqBKAW0iIhJG4ZqlUwVoR3Aa5vEM+OLk7iIiEm7hCvxZQFnn3NcnNpjZgjDtU0REChGum7Z3FtL2h3DsU0RECqfv0hER8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh7hj3QBBSk7+s1IlyC/1Iw2ka5ARPKhK3wREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEI4oU+GYWm8+ypNNfjoiIhEtRr/Cnmln00RdmVg2YG56SREQkHIoa+NOByWYWZWZ1gNnAY+EqSkRETj9/UTo55940sxiCwV8H+KNz7osw1iUiIqdZoYFvZo8c/xKoDXwNNDez5s65UWGsTURETqNTXeEnnvB6agHLPe2JZ//GZ1+kkFSxPNPHv3pS+9IVq+j32J+pUa0KAFe3voz7br81tz0QCNDt7oc5q1Iyrz4/BIBBT49gzfqN+P1RNKx/HkMGPkC0v0h/kEkRtGvbhlGjnibK5+Ptdyby/IjRedpr167B38eMolLlJPbu2UfP3v3Yti2N2rVr8MGUt/D5fERH+xk9+h3GvDkBgEt+eyFvvfUi8XFxfPLpv3j4kacAGP7sE1x3/TVkZmayadNW7rzrEfbvP1Dsx1waxXT6I/7zLsEdOkD66IEntUed35iYq7rinIOcAJmfjCfn+2/xndOAmPY9c/v5KlXnyJSXCaxfhu/chsS07Q5mkJnBkWmv4fbsLM7DChtzzkW6hnxl/bjhzCwsH8u+/oaE+Dge/8uoAgN/7MRpuWF+onGTprHm240cPHQ4t89ni1No1bwJAI8OG0Hjixpyy00dwncQp1F8zTaRLqFQPp+PdWsW0r7DraSmprFk8cf0uK0P69ZtyO0zaeIbfPTxPCZMmMKVbVrSq1c3et/ej+joaMyMzMxMypRJYOWKf9HqihtJS9vJ4kWzeOjhp/hy6XJmzZzA/45+m09nz+eaq1vzr/mLCAQCPPvXxwF47PG/RurwT2n/4FaRLqHIfGefD5kZxHa+P9/AJyYWMo8AYFVqE9f1QdJf6Z+3T3wZEh58icMj+0BWJvH9XiTjvRG4XdvxN70GX826ZE57rRiO5vQo8/QkK6itqNMyzzOzMWY2x8z+dfRx+kos2Zpc3JDy5X7ZHz07ftzFZ4tT+P31bfMsb92iKWaGmXFh/fPY+dOu01GqAM2a/pbvvtvC5s3fk5WVxeTJM7ihY7s8ferXr8f8+YsAmL9gETd0DJ6frKwsMjMzAYiNjcXnC/4KVa16FonlEvly6XIAJvzfB9xwQ3sA5s77jEAgAMCSL5dTo0a18B+kR+RsXY9LP1Rwh1DYA1jMSbPLAfA3aE5gw9eQlRla4rC4hOA6cQm4n/eepmojr6izdKYAK4AngIHHPaSIVq5ZT+feD3DvgCFs3Lw1d/nwl8fwSJ87MF/+b8pZ2dl8OHs+l196SXGVWupVr1GVH1K3575O3ZZG9epV8/RZtWotN3W6FoBOna6lXLlEkpIqAlCzZnWWfzWXLZtSGPHCaNLSdlKjelW2pablrr8tNY0aJ2wT4Pbet/Dp7PnhOCwpQFT9psT3HUlc90Ecmf76Se3+C1uQvfrYHJQjM8YQ12MQ8f1H47+oFVkLZxRnuWFV1MDPds695pxb6pz76ujjdBdjZveY2TIzW/b38ZNO9+YjpsF5dZk75W2mjv1f/vD76+n3+J8BWLBoKUkVK3DB/9QtcN0/j3yVxhdfQOOLGhZXuQI8OugZWrduTsrS2bRu1ZzU1LTcq/TU1O1c0vga/qd+S3re1oWzzqpUpG0+Nrgf2dnZvPfe1FN3ltMmsC6F9Ff6kzHxBWKu6pqnzcpWwFelNoGNK3OXRbfoQMa7w0kfeT/ZKxYQ0/624i45bIoa+B+aWR8zq2ZmSUcfBXU2s/bHPS9vZm+Z2Soze8/MqhS0nnNujHOuiXOuyV09b/kvDuPMVrZMAgkJ8UBwqCY7O8DefftZsXotCxZ9SdsudzBw6PMsXb6KQU+/kLveq++8x959B3j0gbsiVXqptH3bDmrVrJ77umaNamzfviNPn7S0nXTpejdNm7XjyaeGA5x0ozUtbSffrPmWyy+/lG3bd1Cj5rGhmho1q7HtuG32vK0r13W4mtt6PhCOQ5IiyNm6Hqt4FiQcG36NatiC7HUpkBN8MychEV/Vs8lJ3QhA9jeLiap1XiTKDYuiBn4vgkM4XwBfhR7LCul//B2pkUAa0BFIAd7478ss2Xbt3svRm+Or135LTo6jQvlyPHxvb/45dRxzprzNiKGP0uySRgx/agAAH3w4m0VLl/P80IG548RyeqQs+5q6dc+hTp1aREdH07XrjXw4a06ePsnJFTELDrMNHtSXseOCf3HWqFGNuLg4ACpUKE/Lls34z
3++Y8eOH/n5wM9c2iw49HZb95v58MPZQHBG0IAB99Gpc2/S0zOK6zAFsKRj15e+anXAHw2Hf85d5r/wMrJXLzq2QsYhLDYeSw6+eUf9phE5P20rrnLDrqgfvDrnV+yjiXPu4tDzF82s16/Y1hlp4NDnSVmxmn37D/C7zr3oc0d3srOzAejWqQNzFnzO+9M/ISrKR1xsLCOGPpobJgV5ZuRoqlU5i+73Bt8ATpzKKb9cIBDgwYee4OOP3iPK52PsuPdZu/Y/DB0ygGVfrWTWrLlcccVl/OWZx3A4Fi5cQt9+fwKg/vl1ef75p3AuOGtv1KjX+eab9QA80Pfx3GmZn86ezyefBuc1vPS3PxMbG8unnwTfNL78cjn3PzA4MgdfysTe3BffOQ2whETi+48ma/4H4IsCIHvZPPwNLsV/cStcIADZmRyZ/FLuulahMlY+mZwt645tMCeHIzPfJO6Wh4MXaemH8h33L6mKPC3TzBoCDYC4o8ucc+ML6JsKjCL4Ya37gd+40I7MbJVzrtGp9leSpmVKXmf6tEwpXEmaliknK2xaZpGu8M1sCNCGYOB/DFwLfA7kG/jAmxz7cNY4oBLwk5lVJfhJXRERKWZF/ejmzcBFwArn3O2hG6/vFtTZOTesgOU7zExz0kREIqCodwMznHM5QLaZlQN+BGr9wn3m+2YgIiLhdcorfAveXVxlZhUIDtV8BRwEFheyzqqCmoACp2WKiEj4nDLwnXPOzJo55/YBr5vZp0A551xBoQ7BUG8HnPiZZCM4tVNERIpZUcfwl5tZU+dcinNuSxH6zwLKOue+PrHBzBYUvTwRETldihr4lwLdzWwrcIjglboraHqlc+7OgjbknPvDf12liIj8akUN/Han7iIiImeyon7Sduupe4mIyJlMX9IiIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8Qh/pAsokC8q0hWIiJQqusIXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCP8kS6gNHjir6P4bNFSkipWYPq7r5/UvnT5KvoNHkaNalUBuPqKy7jvju657YFAgG539uOsypV4dcQwAJxzvDxmHHPmf47P56PbTdfRo8uNxXNAHtCubRtGjXqaKJ+Pt9+ZyPMjRudpr127Bn8fM4pKlZPYu2cfPXv3Y9u2NC666AJGv/IsieXKEggEePa5V5gyZSYAV7ZpyfDhTxITE83y5au5+57+BAIB+j9yL7fe2hkAvz+K+ufXo2r1Ruzdu6+4D7vUien0R/znXYI7dID00QNPao86vzExV3XFOQc5ATI/GU/O99/iO6cBMe175vbzVarOkSkvE1i/DN+5DYlp2x3MIDODI9New+3ZWZyHFTbmnIt0DfnK2rXpzCwsH8u+Xk1CfDyPP/NCgYE/duI/csP8ROMmTWXN+g0cPHQ4t8+0j+awdPkq/vKnR/D5fOzeu4/kihXCeRinTXz1VpEuoVA+n491axbSvsOtpKamsWTxx/S4rQ/r1m3I7TNp4ht89PE8JkyYwpVtWtKrVzd6396PevXOxTnHxo2bqVatCkuXfELDRm04cOBnNm1cStv23diwYRNDhwxg69ZU3hk7Kc++r7/uGh7sdzfXtOta3IddZPsHn9nn73i+s8+HzAxiO9+fb+ATEwuZRwCwKrWJ6/og6a/0z9snvgwJD77E4ZF9ICuT+H4vkvHeCNyu7fibXoOvZl0yp71WDEdzepR5epIV1KYhndOgycUXUr5c4i9ad8ePP/HZF0v5fcd2eZa/P+0j7rv9D/h8wVNUUsK+JGjW9Ld8990WNm/+nqysLCZPnsENJ/z869evx/z5iwCYv2ARN3RsC8CGDZvYuHEzAGlpO/nxp91UrpxMcnJFMjMz2bBhEwDz5n1G55s6nLTvbt1uZNL708N4dN6Ss3U9Lv1QwR1CYQ9gMbH5dvE3aE5gw9eQlRla4rC4hOA6cQm4n/eepmojT4FfTFZ+s47Ovfpwb/8n2bhpa+7y4S+9wSN97sQs76n4YVsan/zz33S9ox/39n+SrT9sK+6SS63qNaryQ+r23Nep29KoXr1qnj6rVq3lpk7XAtCp07WUK5dIUlLFPH2aNrmYmJhovvtuC7t27cHv99P4kkYAdO58HTVrVc/TPz4+jnZt2zB12sfhOCwpQFT9psT3HUlc90EcmX7yX+D+C1uQvfqL3NdHZowhrscg4vuPxn9RK7IWzijOcsMqLIFvZuXN7DkzW29me8xst5mtCy2rEI59nska/M9vmPuPcUwd9yp/+H1H+j32NAALFn1JUsUKXHB+vZPWyczKIjYmhslvv8zvO7bnyb++WNxle9qjg56hdevmpCydTetWzUlNTSMQCOS2V616FmPHvsxddz3C0WHR7j36MPKFoSxeNIuDBw8RCOTk2eb117fli8XLNHZfzALrUkh/pT8ZE18g5qq8Q2lWtgK+KrUJbFyZuyy6RQcy3h1O+sj7yV6xgJj2txV3yWETriv8ycBeoI1zLsk5lwxcGVo2uaCVzOweM1tmZsv+Pn5imEorfmXLlCEhIR6A1pc1Izs7m7379rNi1VoWfL6Etr/vxcAhz7H0q5UMGvY8AFUrV+LqK1oCwZu8//luc8TqL222b9tBrZrHrr5r1qjG9u078vRJS9tJl65307RZO558ajgA+/cfACAxsSwzZ4znyaeG8+XS5bnrLPnyK9pc1ZkWLa9n4cIlucM7R3XreoOGcyIoZ+t6rOJZkHBs+DWqYQuy16VATujNPCERX9WzyUndCED2N4uJqnVeJMoNi3AFfh3n3HDnXO5vkXNuh3NuOHB2QSs558Y455o455rc1fPWMJVW/Hbt3pN7Fbh67bfkOEeF8uV4+L7b+ef0d5nzj3GMGDaYZo0vYviQRwG4qnULli4PXnWkrFjN2bVqRKz+0iZl2dfUrXsOderUIjo6mq5db+TDWXPy9ElOrohZ8N7X4EF9GTsuePM1Ojqaf0x5i3ff/YCpUz/Ks07lyskAxMTEMHDA/YwZMyG3rVy5RFq3as7MmbPDeWhyAkuqkvvcV60O+KPh8M+5y/wXXkb26kXHVsg4hMXGY8nVAIj6TSNyfio9w6nhmpa51cweBcY553YCmFkVoDfwQ5j2GTEDhzxHyopV7Nt3gN916kGfO28jOzsbgG43Xcec+Z/z/rSPiPJHERcTw4hhg3PDpCB39ujKoGHPM+H96STExzFs8EPFcCTeEAgEePChJ/j4o/eI8vkYO+591q79
D0OHDGDZVyuZNWsuV1xxGX955jEcjoULl9C3358A6NKlI61aXUpSckV69gwOD9x518OsXLmGAY/cR4frrsbn8/HGG+OZv+BYkHS68VrmzvuMw4fTI3LMpVXszX3xndMAS0gkvv9osuZ/AL4oALKXzcPf4FL8F7fCBQKQncmRyS/lrmsVKmPlk8nZsu7YBnNyODLzTeJueTh4kZZ+KN9x/5IqLNMyzawiMBi4EagCOGAnMBMY7pzbc6ptlKRpmZLXmT4tUwpXkqZlyskKm5YZlit859xeM3sHmAsscc4dPNpmZu2BT8OxXxERKVi4Zun0A2YADwDfmNnxHxH9azj2KSIihQvXGP7dQGPn3EEzqwN8YGZ1nHMvAYUPXouISFiEK/B9R4dxnHNbzKwNwdA/GwW+iEhEhGta5k4zu/joi1D4Xw9UAi4M0z5FRKQQ4Qr8nkCeT7I457Kdcz2B1mHap4iIFCJcs3RSC2lbVFCbiIiEj748TUTEIxT4IiIeocAXEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CIiHqHAFxHxCAW+iIhHKPBFRDxCgS8i4hEKfBERj1Dgi4h4hAJfRMQjFPgiIh6hwBcR8QgFvoiIRyjwRUQ8QoEvIuIRCnwREY9Q4IuIeIQCX0TEI8w5F+kaPMnM7nHOjYl0HfLL6PyVXF4+d7rCj5x7Il2A/Co6fyWXZ8+dAl9ExCMU+CIiHqHAjxxPjiGWIjp/JZdnz51u2oqIeISu8EVEPEKBLyLiEQr808DM2pvZt2a20cwG59Mea2bvh9q/NLM6x7U9Flr+rZm1O27522b2o5l9U0yHIScownltbWbLzSzbzG6ORI2Sv1P9/ljQy6Fzu8rMLinuGiNBgf8rmVkUMBq4FmgA3GpmDU7odiew1zlXF3gRGB5atwFwC3AB0B54NbQ9gLGhZRIBRTyv3wO9gfeKtzopgrEU/vtzLVAv9LgHeK0Yaoo4Bf6v1wzY6Jzb5JzLBCYBN57Q50ZgXOj5B8DvzMxCyyc554445zYDG0Pbwzn3GbCnOA5A8nXK8+qc2+KcWwXkRKJAKVgRfn9uBMa7oCVABTOrVjzVRY4C/9erAfxw3OvU0LJ8+zjnsoH9QHIR15XI0Lkp3Tx5fhX4IiIeocD/9bYBtY57XTO0LN8+ZuYHygO7i7iuRIbOTenmyfOrwP/1UoB6ZnaOmcUQvAk784Q+M4Feoec3A/9ywU+8zQRuCc3iOYfgDaSlxVS3FK4o51VKrplAz9BsnebAfudcWqSLCjd/pAso6Zxz2Wb2ADAbiALeds6tMbOngWXOuZnAW8AEM9tI8EbSLaF115jZZGAtkA3c75wLAJjZRKANUMnMUoEhzrm3ivnwPKso59XMmgLTgIpARzMb5py7IIJlS0h+vz9ANIBz7nXgY6ADwYkSh4HbI1Np8dJXK4iIeISGdEREPEKBLyLiEQp8ERGPUOCLiHiEAl9ExCMU+CJnIDN7PNI1SOmjaZlSYoS+cM6cc2fEl5WZWdTRz02EYdsHnXNlz5R6pHTQFb6c0cysTug76ccD3wC1zGygmaWEvsd82HF9nwz1/dzMJprZgFNsu7eZzTCzBWa2wcyGHNc23cy+MrM1ZnbPccsPmtlIM1sJtDCzp0K1fGNmY0JvSoS2+aKZLTOzdWbW1Mymhvbz5+O218PMlprZ12b2hplFmdlzQHxo2f8V1C+/ek7LD11KL+ecHnqcsQ+gDsGvH24eet2W4H9CbQQvWGYBrYGmwNdAHJAIbAAGnGLbvYE0gt9cGk/wDaVJqC0p9O/R5cmh1w7oetw2ko57PgHoGHq+ABgeev4gsB2oBsQS/GbGZKA+8CEQHer3KtAz9PzgcdstrF+eevTQo7CHvlpBSoKtLvid5RAM/LbAitDrsgS/gygRmOGcywAyzOzDIm57rnNuN4CZTQUuB5YB/czsplCfWqF97AYCwD+OW/9KM3sUSACSgDUEwxmOfffOamCNC31Xi5ltCm3zcqAxkBL6wyAe+DGfGn9XSL8T6xEpkAJfSoJDxz034Fnn3BvHdzCzh37htk+8ieXMrA1wNdDCOXfYzBYQ/MsBIMMd+76jOIJX202ccz+Y2dDj+gEcCf2bc9zzo6/9oWMZ55x77BQ1FtYvtx6RU9EYvpQ0s4E7zKwsgJnVMLOzgEUEv8AsLtR2fRG3d42ZJZlZPNAptJ3yBP9LysNmdj7QvIB1j4b7rtA+/9v/1/afwM2h+gnVcXaoLcvMoovQT6TIdIUvJYpzbo6Z1QcWh4Y3DgI9nHMpZjYTWAXsJDiMsh/AzO4Nrft6PptcSnBIpCbwrnNumZmtBu41s3XAt8CSfNbDObfPzN4kOMa/g+BXKv83x7LWzJ4A5piZD8gC7ge2ErxPscrMljvnuhfST6TINC1TSg0zK+ucO2hmCcBnwD3OueWF9O9NcDjmgeKqUSSSdIUvpckYM2tAcKhlXGFhL+JFusIXEfEI3bQVEfEIBb6IiEco8EVEPEKBLyLiEQp8ERGP+H8bcalIQKGLvQAAAABJRU5ErkJggg==", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + } + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "The calculated RMSE scores can be visualized to comparatively study how model performance is affected by different parameters." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "It can be seen from this visualization that RMSE first decreases and then increases as rank increases, due to overfitting. When the rank equals 20 and the regularization parameter equals 0.1, the model achieves the lowest RMSE score." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3.5 Top K recommendation" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### 3.5.1 Top k for all users (items)" - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 20, "source": [ "dfs_rec = model.recommendForAllUsers(10)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 21, "source": [ "dfs_rec.show(10)" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+------+--------------------+\n", + "|UserId| recommendations|\n", + "+------+--------------------+\n", + "| 471|[[814, 3.7504895]...|\n", + "| 463|[[814, 3.1264873]...|\n", + "| 833|[[814, 3.3154662]...|\n", + "| 496|[[814, 3.055388],...|\n", + "| 148|[[814, 4.03012], ...|\n", + "| 540|[[814, 3.8661027]...|\n", + "| 392|[[814, 4.119951],...|\n", + "| 243|[[814, 3.748784],...|\n", + "| 623|[[814, 3.9018161]...|\n", + "| 737|[[814, 3.8507497]...|\n", + "+------+--------------------+\n", + "only showing top 10 rows\n", + "\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### 3.5.2 Top k for a selected set of users (items)" - ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 22, "source": [ "users = dfs_train.select(als.getUserCol()).distinct().limit(3)\n", "\n", "dfs_rec_subset = model.recommendForUserSubset(users, 10)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 23, "source": [ "dfs_rec_subset.show(10)" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "+------+--------------------+\n", + "|UserId| recommendations|\n", + "+------+--------------------+\n", + "| 471|[[814, 3.7504895]...|\n", + "| 463|[[814, 3.1264873]...|\n", + "| 148|[[814, 4.03012], ...|\n", + "+------+--------------------+\n", + "\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "#### 3.5.3 Run-time considerations for top-k recommendations\n", "\n", @@ -593,28 +745,28 @@ "* Inner products of user-item pairs are calculated individually instead of leveraging matrix block multiplication features which are available in certain contemporary computing acceleration libraries (e.g., BLAS).\n", "\n", "More details about possible optimizations of the top k recommendations in Spark can be found [here](https://engineeringblog.yelp.com/2018/05/scaling-collaborative-filtering-with-pyspark.html)." 
- ] + ], + "metadata": {} }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], + "execution_count": 24, "source": [ "# cleanup spark instance\n", "spark.stop()" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "## References" - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "1. Yehuda Koren, Robert Bell, and Chris Volinsky, \"Matrix Factorization Techniques for Recommender Systems\n", "\", ACM Computer, Vol. 42, Issue 8, pp 30-37, Aug., 2009.\n", @@ -624,7 +776,8 @@ "4. Seaborn. url: https://seaborn.pydata.org/\n", "5. Scaling collaborative filtering with PySpark. url: https://engineeringblog.yelp.com/2018/05/scaling-collaborative-filtering-with-pyspark.html\n", "6. Matrix Completion via Alternating Least Square (ALS). url: http://stanford.edu/~rezab/classes/cme323/S15/notes/lec14.pdf" - ] + ], + "metadata": {} } ], "metadata": { @@ -633,9 +786,8 @@ "hash": "7ec2189bea0434770dca7423a25e631e1cca9c4e2b4ff137a82f4dff32ac9607" }, "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" + "name": "python3", + "display_name": "Python 3.6.9 64-bit ('.env': venv)" }, "language_info": { "codemirror_mode": { @@ -652,4 +804,4 @@ }, "nbformat": 4, "nbformat_minor": 2 -} +} \ No newline at end of file From 79eff3bf5c0b53c6414ade74e12ffb1bf543b6af Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 24 Sep 2021 18:02:54 +0000 Subject: [PATCH 15/27] Specify tmp path for data serialization --- recommenders/datasets/mock/movielens.py | 45 +++++++++++++------ recommenders/datasets/movielens.py | 19 ++++---- .../datasets/mock/test_movielens.py | 19 ++++++++ 3 files changed, 60 insertions(+), 23 deletions(-) diff --git a/recommenders/datasets/mock/movielens.py b/recommenders/datasets/mock/movielens.py index e71da13f41..8e12a7c3b5 100644 --- a/recommenders/datasets/mock/movielens.py +++ b/recommenders/datasets/mock/movielens.py @@ -6,8 +6,8 @@ """ try: import pandera as pa -except ImportError as e: - raise ImportError("pandera is not installed. Try `pip install recommenders['dev']`") from e +except ImportError: + raise ImportError("pandera is not installed. Try `pip install recommenders['dev']`") try: from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType @@ -23,7 +23,9 @@ DEFAULT_GENRE_COL, DEFAULT_HEADER ) +from recommenders.datasets.download_utils import download_path +import os import random from typing import Optional @@ -43,9 +45,10 @@ class MockMovielensSchema(pa.SchemaModel): Please see https://pandera.readthedocs.io/en/latest/schema_models.html for more information. 
""" + # Some notebooks will do a cross join with userID and itemID, + # a sparse range for these IDs can slow down the notebook tests userID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) itemID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) - # Rating is on the scale from 1 to 5 rating: Series[float] = Field(in_range={"min_value": 1, "max_value": 5}) timestamp: Series[str] = Field(eq="2022-2-22") title: Series[str] = Field(eq="foo") @@ -90,24 +93,45 @@ def get_spark_df( spark, size: int = 3, seed: int = 100, keep_title_col: bool = False, keep_genre_col: bool = False, + tmp_path: Optional[str] = None, ): """Return fake movielens dataset as a Spark Dataframe with specified rows Args: spark (SparkSession): spark session to load the dataframe into size (int): number of rows to generate - seed (int, optional): seeding the pseudo-number generation. Defaults to 100. + seed (int): seeding the pseudo-number generation. Defaults to 100. keep_title_col (bool): remove the title column if False. Defaults to False. keep_genre_col (bool): remove the genre column if False. Defaults to False. + tmp_path (str, optional): path to store files for serialization purpose + when transferring data from python to java. + If None, a temporal path is used instead Returns: pyspark.sql.DataFrame: a mock dataset """ pandas_df = cls.get_df(size=size, seed=seed, keep_title_col=True, keep_genre_col=True) - # serialize the pandas.df to avoid the expensive java <-> python communication - pandas_df.to_csv('test.csv', header=False, index=False) - deserialization_schema = StructType([ + # generate temp folder + with download_path(tmp_path) as tmp_folder: + filepath = os.path.join(tmp_folder, f"mock_movielens_{size}.csv") + # serialize the pandas.df as a csv to avoid the expensive java <-> python communication + pandas_df.to_csv(filepath, header=False, index=False) + print(f"Saving file {filepath}.") + spark_df = spark.read.csv(filepath, schema=cls._get_spark_deserialization_schema()) + # Cache and force trigger action since data-file might be removed. + spark_df.cache() + spark_df.count() + + if not keep_title_col: + spark_df = spark_df.drop(DEFAULT_TITLE_COL) + if not keep_genre_col: + spark_df = spark_df.drop(DEFAULT_GENRE_COL) + return spark_df + + @classmethod + def _get_spark_deserialization_schema(cls): + return StructType([ StructField(DEFAULT_USER_COL, IntegerType()), StructField(DEFAULT_ITEM_COL, IntegerType()), StructField(DEFAULT_RATING_COL, FloatType()), @@ -115,10 +139,3 @@ def get_spark_df( StructField(DEFAULT_TITLE_COL, StringType()), StructField(DEFAULT_GENRE_COL, StringType()), ]) - spark_df = spark.read.csv('test.csv', schema=deserialization_schema) - - if not keep_title_col: - spark_df = spark_df.drop(DEFAULT_TITLE_COL) - if not keep_genre_col: - spark_df = spark_df.drop(DEFAULT_GENRE_COL) - return spark_df diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py index 60f33a5e92..fb9ba4aede 100644 --- a/recommenders/datasets/movielens.py +++ b/recommenders/datasets/movielens.py @@ -101,7 +101,6 @@ def item_has_header(self): # Fake data for testing only MOCK_DATA_FORMAT = { "mock100": {"size": 100, "seed": 0}, - "mock10": {"size": 10, "seed": 6} } # 100K data genres index to string mapper. For 1m, 10m, and 20m, the genres labels are already in the dataset. @@ -153,15 +152,16 @@ def load_pandas_df( Args: size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m", "mock100"). - header* (list or tuple or None): Rating dataset header. 
'DEFAULT_HEADER' is set for all mock data sizes ("mock*"). + header* (list or tuple or None): Rating dataset header. + If size is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored and data is rendered using the 'DEFAULT_HEADER' instead. local_cache_path* (str): Path (directory or a zip file) to cache the downloaded zip file. If None, all the intermediate files will be stored in a temporary directory and removed after use. - title_col* (str): Movie title column name. If None, the column will not be loaded. - genres_col* (str): Genres column name. Genres are '|' separated string. + title_col (str): Movie title column name. If None, the column will not be loaded. + genres_col (str): Genres column name. Genres are '|' separated string. If None, the column will not be loaded. year_col* (str): Movie release year column name. If None, the column will not be loaded. - All (*) arguments are not applicable when mock dataset is specified (size = "mock*") + All (*) arguments are not applicable when mock dataset is specified (size = "mock*") Returns: pandas.DataFrame: Movie rating dataset. @@ -361,10 +361,11 @@ def load_spark_df( Args: spark (pyspark.SparkSession): Spark session. size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m", "mock100"). - header* (list or tuple): Rating dataset header. 'DEFAULT_HEADER' is set for all mock data sizes ("mock*"). + header* (list or tuple): Rating dataset header. If schema is provided, this argument is ignored. - schema* (pyspark.StructType): Dataset schema. - local_cache_path* (str): Path (directory or a zip file) to cache the downloaded zip file. + schema* (pyspark.StructType): Dataset schema. + If size is set to any of 'MOCK_DATA_FORMAT', data is rendered in the 'MockMovielensSchema' instead. + local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file. If None, all the intermediate files will be stored in a temporary directory and removed after use. dbutils (Databricks.dbutils): Databricks utility object title_col (str): Title column name. If None, the column will not be loaded. @@ -372,7 +373,7 @@ def load_spark_df( If None, the column will not be loaded. year_col* (str): Movie release year column name. If None, the column will not be loaded. - All (*) arguments are not applicable when mock dataset is specified (size = "mock*") + All (*) arguments are not applicable if size is set to any of 'MOCK_DATA_FORMAT' Returns: pyspark.sql.DataFrame: Movie rating dataset. 
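A minimal usage sketch of the mock size these docstrings describe, assuming an active
SparkSession bound to `spark` (illustrative only, not part of the patch):

    from recommenders.datasets import movielens

    # Pandas: 100 fake rows generated by MockMovielensSchema;
    # the starred (*) arguments above are ignored for mock sizes.
    df = movielens.load_pandas_df(size="mock100")
    assert len(df) == 100

    # Spark: the fake frame is serialized to a temporary CSV (see tmp_path)
    # and read back, avoiding the slow row-by-row python <-> java conversion.
    spark_df = movielens.load_spark_df(spark, size="mock100")
    assert spark_df.count() == 100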
diff --git a/tests/unit/recommenders/datasets/mock/test_movielens.py b/tests/unit/recommenders/datasets/mock/test_movielens.py index e8a6e5f8be..0db9e26b59 100644 --- a/tests/unit/recommenders/datasets/mock/test_movielens.py +++ b/tests/unit/recommenders/datasets/mock/test_movielens.py @@ -1,3 +1,5 @@ +import os + from recommenders.datasets.mock.movielens import MockMovielensSchema from recommenders.datasets.movielens import DEFAULT_HEADER from recommenders.utils.constants import ( @@ -7,6 +9,7 @@ import pytest import pandas +from pytest_mock import MockerFixture @pytest.mark.parametrize("size", [10, 100]) @@ -62,3 +65,19 @@ def test_mock_movielens_schema__get_spark_df__return_success(spark, size, seed, assert df.schema[DEFAULT_TITLE_COL] if keep_genre_col: assert df.schema[DEFAULT_GENRE_COL] + + +def test_mock_movielens_schema__get_spark_df__store_tmp_file(spark, tmp_path): + data_size = 3 + MockMovielensSchema.get_spark_df(spark, size=data_size, tmp_path=tmp_path) + assert os.path.exists(os.path.join(tmp_path, f"mock_movielens_{data_size}.csv")) + + +def test_mock_movielens_schema__get_spark_df__data_serialization_default_param(spark, mocker: MockerFixture): + data_size = 3 + to_csv_spy = mocker.spy(pandas.DataFrame, "to_csv") + + df = MockMovielensSchema.get_spark_df(spark, size=data_size) + # assertions + to_csv_spy.assert_called_once() + assert df.count() == data_size From 477391f207e1bcd0fe41c1f7d4525f9e0d4762bc Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 24 Sep 2021 18:51:26 +0000 Subject: [PATCH 16/27] Add pytest-mock as 'dev' dependency --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 3bcdb23f05..c0d7d67377 100644 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ "pandera[strategies]>=0.6.5", # For generating fake datasets "pytest>=3.6.4", "pytest-cov>=2.12.1", + "pytest-mock>=3.6.1", # for access to mock fixtures in pytest ], } # for the brave of heart From 97c5be0ea19d41c268d393b92ab3bc64f4ce9663 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 24 Sep 2021 19:00:25 +0000 Subject: [PATCH 17/27] Add spark test markers to new tests --- tests/unit/recommenders/datasets/mock/test_movielens.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tests/unit/recommenders/datasets/mock/test_movielens.py b/tests/unit/recommenders/datasets/mock/test_movielens.py index 0db9e26b59..bff8e05f62 100644 --- a/tests/unit/recommenders/datasets/mock/test_movielens.py +++ b/tests/unit/recommenders/datasets/mock/test_movielens.py @@ -67,12 +67,15 @@ def test_mock_movielens_schema__get_spark_df__return_success(spark, size, seed, assert df.schema[DEFAULT_GENRE_COL] +@pytest.mark.spark def test_mock_movielens_schema__get_spark_df__store_tmp_file(spark, tmp_path): data_size = 3 MockMovielensSchema.get_spark_df(spark, size=data_size, tmp_path=tmp_path) assert os.path.exists(os.path.join(tmp_path, f"mock_movielens_{data_size}.csv")) + +@pytest.mark.spark def test_mock_movielens_schema__get_spark_df__data_serialization_default_param(spark, mocker: MockerFixture): data_size = 3 to_csv_spy = mocker.spy(pandas.DataFrame, "to_csv") From cb2d140a15d00d7bccccdafe1f85f4e9a0cbec8e Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Mon, 27 Sep 2021 16:01:16 +0000 Subject: [PATCH 18/27] Small code cleanup --- recommenders/datasets/mock/movielens.py | 1 - recommenders/datasets/movielens.py | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/recommenders/datasets/mock/movielens.py b/recommenders/datasets/mock/movielens.py index 8e12a7c3b5..d7e1264607 
100644 --- a/recommenders/datasets/mock/movielens.py +++ b/recommenders/datasets/mock/movielens.py @@ -117,7 +117,6 @@ def get_spark_df( filepath = os.path.join(tmp_folder, f"mock_movielens_{size}.csv") # serialize the pandas.df as a csv to avoid the expensive java <-> python communication pandas_df.to_csv(filepath, header=False, index=False) - print(f"Saving file {filepath}.") spark_df = spark.read.csv(filepath, schema=cls._get_spark_deserialization_schema()) # Cache and force trigger action since data-file might be removed. spark_df.cache() diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py index fb9ba4aede..00a80f9d1a 100644 --- a/recommenders/datasets/movielens.py +++ b/recommenders/datasets/movielens.py @@ -20,9 +20,7 @@ StructField, IntegerType, FloatType, - DoubleType, - LongType, - StringType, + LongType ) from pyspark.sql.functions import concat_ws, col except ImportError: From 65a5327145d8ac792f42e9f8e3bbc86ecb8d3d28 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Mon, 27 Sep 2021 16:11:32 +0000 Subject: [PATCH 19/27] Install 'dev' dependencies in ADO build --- tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml | 2 +- tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml | 2 +- tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml | 2 +- tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml | 2 +- tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml | 2 +- tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml | 2 +- tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml | 2 +- tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml | 2 +- tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml | 2 +- 9 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml index 2c5a698243..15b237650a 100644 --- a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml +++ b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_cpu.yml @@ -33,6 +33,6 @@ extends: timeout: 180 conda_env: "nightly_linux_cpu" conda_opts: "python=3.6" - pip_opts: "[examples]" + pip_opts: "[examples,dev]" pytest_markers: "not spark and not gpu" pytest_params: "-x" diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml index b1182c34c9..c43e8ec981 100644 --- a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml +++ b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_gpu.yml @@ -32,6 +32,6 @@ extends: timeout: 240 conda_env: "nightly_linux_gpu" conda_opts: "python=3.6 cudatoolkit=10.0 \"cudnn>=7.6\"" - pip_opts: "[gpu,examples] -f https://download.pytorch.org/whl/cu100/torch_stable.html" + pip_opts: "[gpu,examples,dev] -f https://download.pytorch.org/whl/cu100/torch_stable.html" pytest_markers: "not spark and gpu" pytest_params: "-x" diff --git a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml index 6fd4e526ea..f542f059ff 100644 --- a/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml +++ b/tests/ci/azure_pipeline_test/dsvm_nightly_linux_pyspark.yml @@ -33,6 +33,6 @@ extends: timeout: 180 conda_env: "nightly_linux_spark" conda_opts: "python=3.6" - pip_opts: "[spark,examples]" + pip_opts: "[spark,examples,dev]" pytest_markers: "spark and not gpu" pytest_params: "-x" diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml index 
b75cc0c3f5..93eaeacc84 100644 --- a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml +++ b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_cpu.yml @@ -60,5 +60,5 @@ extends: task_name: "Test - Unit Notebook Linux CPU" conda_env: "unit_notebook_linux_cpu" conda_opts: "python=3.6" - pip_opts: "[examples]" + pip_opts: "[examples,dev]" pytest_markers: "notebooks and not spark and not gpu" diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml index 9cb44639e0..6d7594a143 100644 --- a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml +++ b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_gpu.yml @@ -60,5 +60,5 @@ extends: task_name: "Test - Unit Notebook Linux GPU" conda_env: "unit_notebook_linux_gpu" conda_opts: "python=3.6 cudatoolkit=10.0 \"cudnn>=7.6\"" - pip_opts: "[gpu,examples] -f https://download.pytorch.org/whl/cu100/torch_stable.html" + pip_opts: "[gpu,examples,dev] -f https://download.pytorch.org/whl/cu100/torch_stable.html" pytest_markers: "notebooks and not spark and gpu" diff --git a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml index 535f6936a7..31d699588d 100644 --- a/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml +++ b/tests/ci/azure_pipeline_test/dsvm_notebook_linux_pyspark.yml @@ -60,5 +60,5 @@ extends: task_name: "Test - Unit Notebook Linux Spark" conda_env: "unit_notebook_linux_spark" conda_opts: "python=3.6" - pip_opts: "[spark,examples]" + pip_opts: "[spark,examples,dev]" pytest_markers: "notebooks and spark and not gpu" diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml b/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml index be3b95c587..26ed5bdf2f 100644 --- a/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml +++ b/tests/ci/azure_pipeline_test/dsvm_unit_linux_cpu.yml @@ -60,5 +60,5 @@ extends: task_name: "Test - Unit Linux CPU" conda_env: "unit_linux_cpu" conda_opts: "python=3.6" - pip_opts: "" + pip_opts: "[dev]" pytest_markers: "not notebooks and not spark and not gpu" diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml b/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml index b9a76211d9..9aa46047e6 100644 --- a/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml +++ b/tests/ci/azure_pipeline_test/dsvm_unit_linux_gpu.yml @@ -60,5 +60,5 @@ extends: task_name: "Test - Unit Linux GPU" conda_env: "unit_linux_gpu" conda_opts: "python=3.6 cudatoolkit=10.0 \"cudnn>=7.6\"" - pip_opts: "[gpu] -f https://download.pytorch.org/whl/cu100/torch_stable.html" + pip_opts: "[gpu,dev] -f https://download.pytorch.org/whl/cu100/torch_stable.html" pytest_markers: "not notebooks and not spark and gpu" diff --git a/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml b/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml index f99b151cad..1f3006a05e 100644 --- a/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml +++ b/tests/ci/azure_pipeline_test/dsvm_unit_linux_pyspark.yml @@ -60,5 +60,5 @@ extends: task_name: "Test - Unit Linux Spark" conda_env: "unit_linux_spark" conda_opts: "python=3.6" - pip_opts: "[spark]" + pip_opts: "[spark,dev]" pytest_markers: "not notebooks and spark and not gpu" From c2a44589fd61db9f7ea231d56f8b73a86c8e3942 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Mon, 27 Sep 2021 19:09:53 +0000 Subject: [PATCH 20/27] Undone default partition changes --- recommenders/evaluation/spark_evaluation.py | 6 +----- 1 file changed, 1 insertion(+), 
5 deletions(-) diff --git a/recommenders/evaluation/spark_evaluation.py b/recommenders/evaluation/spark_evaluation.py index 5110d72e82..e5112965b2 100644 --- a/recommenders/evaluation/spark_evaluation.py +++ b/recommenders/evaluation/spark_evaluation.py @@ -1,10 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. - -import numpy as np -from pyspark.sql.types import LongType - try: from pyspark.mllib.evaluation import RegressionMetrics, RankingMetrics from pyspark.sql import Window, DataFrame @@ -618,7 +614,7 @@ def _get_pairwise_items(self, df): .select(self.col_user, "i1", "i2") ) - def _get_cosine_similarity(self, n_partitions=10): + def _get_cosine_similarity(self, n_partitions=200): if self.item_sim_measure == "item_cooccurrence_count": # calculate item-item similarity based on item co-occurrence count From 2306b2b94040696f1ad73f6766c97fdbdab15011 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Wed, 29 Sep 2021 21:11:32 +0000 Subject: [PATCH 21/27] Fix bug after merge --- examples/03_evaluate/als_movielens_diversity_metrics.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index 1f4a4ed081..356224adef 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -153,7 +153,7 @@ "import pyspark\n", "from pyspark.ml.recommendation import ALS\n", "import pyspark.sql.functions as F\n", - "from pyspark.sql.types import FloatType, IntegerType, LongType, StructType, StructField\n", + "from pyspark.sql.types import FloatType, IntegerType, StringType, StructType, StructField\n", "from pyspark.ml.feature import Tokenizer, StopWordsRemover\n", "from pyspark.ml.feature import HashingTF, CountVectorizer, VectorAssembler\n", "\n", From b0bcd75e4149cb323a10e3d6b4c0c3ba111dca73 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Tue, 5 Oct 2021 19:58:24 +0000 Subject: [PATCH 22/27] Undo datatype changes --- .../als_movielens_diversity_metrics.ipynb | 60 +++++++++---------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb index 356224adef..289d5b93fd 100644 --- a/examples/03_evaluate/als_movielens_diversity_metrics.ipynb +++ b/examples/03_evaluate/als_movielens_diversity_metrics.ipynb @@ -153,7 +153,7 @@ "import pyspark\n", "from pyspark.ml.recommendation import ALS\n", "import pyspark.sql.functions as F\n", - "from pyspark.sql.types import FloatType, IntegerType, StringType, StructType, StructField\n", + "from pyspark.sql.types import FloatType, IntegerType, LongType, StructType, StructField\n", "from pyspark.ml.feature import Tokenizer, StopWordsRemover\n", "from pyspark.ml.feature import HashingTF, CountVectorizer, VectorAssembler\n", "\n", @@ -177,8 +177,8 @@ "output_type": "stream", "name": "stdout", "text": [ - "System version: 3.6.13 |Anaconda, Inc.| (default, Jun 4 2021, 14:25:59) \n", - "[GCC 7.5.0]\n", + "System version: 3.6.9 (default, Jan 26 2021, 15:33:00) \n", + "[GCC 8.4.0]\n", "Spark version: 2.4.8\n" ] } @@ -205,7 +205,7 @@ "\n", "# user, item column names\n", "COL_USER=\"UserId\"\n", - "COL_ITEM=\"ItemId\"\n", + "COL_ITEM=\"MovieId\"\n", "COL_RATING=\"Rating\"\n", "COL_TITLE=\"Title\"\n", "COL_GENRE=\"Genre\"" @@ -257,7 +257,7 @@ " StructField(COL_USER, IntegerType()),\n", " 
StructField(COL_ITEM, IntegerType()),\n", " StructField(COL_RATING, FloatType()),\n", - " StructField(\"Timestamp\", StringType()),\n", + " StructField(\"Timestamp\", LongType()),\n", " )\n", ")\n", "\n", @@ -269,37 +269,37 @@ "output_type": "stream", "name": "stderr", "text": [ - "100%|██████████| 4.81k/4.81k [00:00<00:00, 15.6kKB/s]\n" + "100%|██████████| 4.81k/4.81k [00:00<00:00, 20.1kKB/s]\n" ] }, { "output_type": "stream", "name": "stdout", "text": [ - "+------+------+------+---------+--------------------+------+\n", - "|ItemId|UserId|Rating|Timestamp| Title| Genre|\n", - "+------+------+------+---------+--------------------+------+\n", - "| 26| 138| 5.0|879024232|Brothers McMullen...|Comedy|\n", - "| 26| 224| 3.0|888104153|Brothers McMullen...|Comedy|\n", - "| 26| 18| 4.0|880129731|Brothers McMullen...|Comedy|\n", - "| 26| 222| 3.0|878183043|Brothers McMullen...|Comedy|\n", - "| 26| 43| 5.0|883954901|Brothers McMullen...|Comedy|\n", - "| 26| 201| 4.0|884111927|Brothers McMullen...|Comedy|\n", - "| 26| 299| 4.0|878192601|Brothers McMullen...|Comedy|\n", - "| 26| 95| 3.0|880571951|Brothers McMullen...|Comedy|\n", - "| 26| 89| 3.0|879459909|Brothers McMullen...|Comedy|\n", - "| 26| 361| 3.0|879440941|Brothers McMullen...|Comedy|\n", - "| 26| 194| 3.0|879522240|Brothers McMullen...|Comedy|\n", - "| 26| 391| 5.0|877399745|Brothers McMullen...|Comedy|\n", - "| 26| 345| 3.0|884993555|Brothers McMullen...|Comedy|\n", - "| 26| 303| 4.0|879468307|Brothers McMullen...|Comedy|\n", - "| 26| 401| 3.0|891033395|Brothers McMullen...|Comedy|\n", - "| 26| 429| 3.0|882386333|Brothers McMullen...|Comedy|\n", - "| 26| 293| 3.0|888907015|Brothers McMullen...|Comedy|\n", - "| 26| 270| 5.0|876954995|Brothers McMullen...|Comedy|\n", - "| 26| 442| 3.0|883388576|Brothers McMullen...|Comedy|\n", - "| 26| 342| 2.0|875320037|Brothers McMullen...|Comedy|\n", - "+------+------+------+---------+--------------------+------+\n", + "+-------+------+------+---------+--------------------+------+\n", + "|MovieId|UserId|Rating|Timestamp| Title| Genre|\n", + "+-------+------+------+---------+--------------------+------+\n", + "| 26| 138| 5.0|879024232|Brothers McMullen...|Comedy|\n", + "| 26| 224| 3.0|888104153|Brothers McMullen...|Comedy|\n", + "| 26| 18| 4.0|880129731|Brothers McMullen...|Comedy|\n", + "| 26| 222| 3.0|878183043|Brothers McMullen...|Comedy|\n", + "| 26| 43| 5.0|883954901|Brothers McMullen...|Comedy|\n", + "| 26| 201| 4.0|884111927|Brothers McMullen...|Comedy|\n", + "| 26| 299| 4.0|878192601|Brothers McMullen...|Comedy|\n", + "| 26| 95| 3.0|880571951|Brothers McMullen...|Comedy|\n", + "| 26| 89| 3.0|879459909|Brothers McMullen...|Comedy|\n", + "| 26| 361| 3.0|879440941|Brothers McMullen...|Comedy|\n", + "| 26| 194| 3.0|879522240|Brothers McMullen...|Comedy|\n", + "| 26| 391| 5.0|877399745|Brothers McMullen...|Comedy|\n", + "| 26| 345| 3.0|884993555|Brothers McMullen...|Comedy|\n", + "| 26| 303| 4.0|879468307|Brothers McMullen...|Comedy|\n", + "| 26| 401| 3.0|891033395|Brothers McMullen...|Comedy|\n", + "| 26| 429| 3.0|882386333|Brothers McMullen...|Comedy|\n", + "| 26| 293| 3.0|888907015|Brothers McMullen...|Comedy|\n", + "| 26| 270| 5.0|876954995|Brothers McMullen...|Comedy|\n", + "| 26| 442| 3.0|883388576|Brothers McMullen...|Comedy|\n", + "| 26| 342| 2.0|875320037|Brothers McMullen...|Comedy|\n", + "+-------+------+------+---------+--------------------+------+\n", "only showing top 20 rows\n", "\n" ] From fd33efe2d6fc1f736809665c8d5657476374de32 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Tue, 5 Oct 
2021 20:20:54 +0000 Subject: [PATCH 23/27] Merge mock schema into movielens.py --- recommenders/datasets/mock/__init__.py | 0 recommenders/datasets/mock/movielens.py | 140 ---------------- recommenders/datasets/movielens.py | 151 ++++++++++++++++-- .../recommenders/datasets/mock/__init__.py | 0 .../datasets/mock/test_movielens.py | 86 ---------- .../recommenders/datasets/test_movielens.py | 83 +++++++++- 6 files changed, 214 insertions(+), 246 deletions(-) delete mode 100644 recommenders/datasets/mock/__init__.py delete mode 100644 recommenders/datasets/mock/movielens.py delete mode 100644 tests/unit/recommenders/datasets/mock/__init__.py delete mode 100644 tests/unit/recommenders/datasets/mock/test_movielens.py diff --git a/recommenders/datasets/mock/__init__.py b/recommenders/datasets/mock/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/recommenders/datasets/mock/movielens.py b/recommenders/datasets/mock/movielens.py deleted file mode 100644 index d7e1264607..0000000000 --- a/recommenders/datasets/mock/movielens.py +++ /dev/null @@ -1,140 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -""" -Mock dataset schema to generate fake data for testing use. This will mimic the Movielens Dataset -""" -try: - import pandera as pa -except ImportError: - raise ImportError("pandera is not installed. Try `pip install recommenders['dev']`") - -try: - from pyspark.sql.types import StructField, StructType, IntegerType, StringType, FloatType -except ImportError: - pass # so the environment without spark doesn't break - -from recommenders.utils.constants import ( - DEFAULT_USER_COL, - DEFAULT_ITEM_COL, - DEFAULT_RATING_COL, - DEFAULT_TIMESTAMP_COL, - DEFAULT_TITLE_COL, - DEFAULT_GENRE_COL, - DEFAULT_HEADER -) -from recommenders.datasets.download_utils import download_path - -import os -import random -from typing import Optional - -import pandas -from pandera.typing import Series -from pandera import Field - - -class MockMovielensSchema(pa.SchemaModel): - """ - Mock dataset schema to generate fake data for testing purpose. - This schema is configured to mimic the Movielens dataset - - http://files.grouplens.org/datasets/movielens/ml-100k/ - - Dataset schema and generation is configured using pandera. - Please see https://pandera.readthedocs.io/en/latest/schema_models.html - for more information. - """ - # Some notebooks will do a cross join with userID and itemID, - # a sparse range for these IDs can slow down the notebook tests - userID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) - itemID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) - rating: Series[float] = Field(in_range={"min_value": 1, "max_value": 5}) - timestamp: Series[str] = Field(eq="2022-2-22") - title: Series[str] = Field(eq="foo") - genre: Series[str] = Field(eq="genreA|0") - - @classmethod - def get_df( - cls, - size: int = 3, seed: int = 100, - keep_first_n_cols: Optional[int] = None, - keep_title_col: bool = False, keep_genre_col: bool = False, - ) -> pandas.DataFrame: - """Return fake movielens dataset as a Pandas Dataframe with specified rows. - - Args: - size (int): number of rows to generate - seed (int, optional): seeding the pseudo-number generation. Defaults to 100. - keep_first_n_cols (int, optional): keep the first n default movielens columns. - keep_title_col (bool): remove the title column if False. Defaults to True. - keep_genre_col (bool): remove the genre column if False. Defaults to True. 
- - Returns: - pandas.DataFrame: a mock dataset - """ - schema = cls.to_schema() - if keep_first_n_cols is not None: - if keep_first_n_cols < 1 or keep_first_n_cols > len(DEFAULT_HEADER): - raise ValueError(f"Invalid value for 'keep_first_n_cols': {keep_first_n_cols}. Valid range: [1-{len(DEFAULT_HEADER)}]") - schema = schema.remove_columns(DEFAULT_HEADER[keep_first_n_cols:]) - if not keep_title_col: - schema = schema.remove_columns([DEFAULT_TITLE_COL]) - if not keep_genre_col: - schema = schema.remove_columns([DEFAULT_GENRE_COL]) - - random.seed(seed) - # For more information on data synthesis, see https://pandera.readthedocs.io/en/latest/data_synthesis_strategies.html - return schema.example(size=size) - - @classmethod - def get_spark_df( - cls, - spark, - size: int = 3, seed: int = 100, - keep_title_col: bool = False, keep_genre_col: bool = False, - tmp_path: Optional[str] = None, - ): - """Return fake movielens dataset as a Spark Dataframe with specified rows - - Args: - spark (SparkSession): spark session to load the dataframe into - size (int): number of rows to generate - seed (int): seeding the pseudo-number generation. Defaults to 100. - keep_title_col (bool): remove the title column if False. Defaults to False. - keep_genre_col (bool): remove the genre column if False. Defaults to False. - tmp_path (str, optional): path to store files for serialization purpose - when transferring data from python to java. - If None, a temporal path is used instead - - Returns: - pyspark.sql.DataFrame: a mock dataset - """ - pandas_df = cls.get_df(size=size, seed=seed, keep_title_col=True, keep_genre_col=True) - - # generate temp folder - with download_path(tmp_path) as tmp_folder: - filepath = os.path.join(tmp_folder, f"mock_movielens_{size}.csv") - # serialize the pandas.df as a csv to avoid the expensive java <-> python communication - pandas_df.to_csv(filepath, header=False, index=False) - spark_df = spark.read.csv(filepath, schema=cls._get_spark_deserialization_schema()) - # Cache and force trigger action since data-file might be removed. 
- spark_df.cache() - spark_df.count() - - if not keep_title_col: - spark_df = spark_df.drop(DEFAULT_TITLE_COL) - if not keep_genre_col: - spark_df = spark_df.drop(DEFAULT_GENRE_COL) - return spark_df - - @classmethod - def _get_spark_deserialization_schema(cls): - return StructType([ - StructField(DEFAULT_USER_COL, IntegerType()), - StructField(DEFAULT_ITEM_COL, IntegerType()), - StructField(DEFAULT_RATING_COL, FloatType()), - StructField(DEFAULT_TIMESTAMP_COL, StringType()), - StructField(DEFAULT_TITLE_COL, StringType()), - StructField(DEFAULT_GENRE_COL, StringType()), - ]) diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py index 00a80f9d1a..88b67322d4 100644 --- a/recommenders/datasets/movielens.py +++ b/recommenders/datasets/movielens.py @@ -3,21 +3,29 @@ import os import re +import random import shutil import warnings import pandas as pd +from typing import Optional from zipfile import ZipFile from recommenders.datasets.download_utils import maybe_download, download_path from recommenders.utils.notebook_utils import is_databricks from recommenders.utils.constants import ( DEFAULT_HEADER, DEFAULT_ITEM_COL, + DEFAULT_USER_COL, + DEFAULT_RATING_COL, + DEFAULT_TIMESTAMP_COL, + DEFAULT_TITLE_COL, + DEFAULT_GENRE_COL, ) try: from pyspark.sql.types import ( StructType, StructField, + StringType, IntegerType, FloatType, LongType @@ -26,6 +34,13 @@ except ImportError: pass # so the environment without spark doesn't break +try: + import pandera as pa + from pandera.typing import Series + from pandera import Field +except ImportError: + pass # so the environment without recommender['dev'] doesn't break + class _DataFormat: def __init__( @@ -150,16 +165,16 @@ def load_pandas_df( Args: size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m", "mock100"). - header* (list or tuple or None): Rating dataset header. - If size is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored and data is rendered using the 'DEFAULT_HEADER' instead. - local_cache_path* (str): Path (directory or a zip file) to cache the downloaded zip file. + header (list or tuple or None): Rating dataset header. + If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored and data is rendered using the 'DEFAULT_HEADER' instead. + local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file. If None, all the intermediate files will be stored in a temporary directory and removed after use. + If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored. title_col (str): Movie title column name. If None, the column will not be loaded. genres_col (str): Genres column name. Genres are '|' separated string. If None, the column will not be loaded. - year_col* (str): Movie release year column name. If None, the column will not be loaded. - - All (*) arguments are not applicable when mock dataset is specified (size = "mock*") + year_col (str): Movie release year column name. If None, the column will not be loaded. + If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored. Returns: pandas.DataFrame: Movie rating dataset. 
@@ -195,8 +210,6 @@ def load_pandas_df( header = header[:4] if size in MOCK_DATA_FORMAT: - # function-wide import to isolate extra dependencies from the mock schema will use - from recommenders.datasets.mock.movielens import MockMovielensSchema # generate fake data return MockMovielensSchema.get_df( keep_first_n_cols=len(header), @@ -359,19 +372,19 @@ def load_spark_df( Args: spark (pyspark.SparkSession): Spark session. size (str): Size of the data to load. One of ("100k", "1m", "10m", "20m", "mock100"). - header* (list or tuple): Rating dataset header. - If schema is provided, this argument is ignored. - schema* (pyspark.StructType): Dataset schema. - If size is set to any of 'MOCK_DATA_FORMAT', data is rendered in the 'MockMovielensSchema' instead. + header (list or tuple): Rating dataset header. + If `schema` is provided or `size` is set to any of 'MOCK_DATA_FORMAT', this argument is ignored. + schema (pyspark.StructType): Dataset schema. + If `size` is set to any of 'MOCK_DATA_FORMAT', data is rendered in the 'MockMovielensSchema' instead. local_cache_path (str): Path (directory or a zip file) to cache the downloaded zip file. If None, all the intermediate files will be stored in a temporary directory and removed after use. dbutils (Databricks.dbutils): Databricks utility object + If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored. title_col (str): Title column name. If None, the column will not be loaded. genres_col (str): Genres column name. Genres are '|' separated string. If None, the column will not be loaded. - year_col* (str): Movie release year column name. If None, the column will not be loaded. - - All (*) arguments are not applicable if size is set to any of 'MOCK_DATA_FORMAT' + year_col (str): Movie release year column name. If None, the column will not be loaded. + If `size` is set to any of 'MOCK_DATA_FORMAT', this parameter is ignored. Returns: pyspark.sql.DataFrame: Movie rating dataset. @@ -410,8 +423,6 @@ def load_spark_df( raise ValueError(ERROR_MOVIE_LENS_SIZE) if size in MOCK_DATA_FORMAT: - # function-wide import to isolate extra dependencies from the mock schema will use - from recommenders.datasets.mock.movielens import MockMovielensSchema # generate fake data return MockMovielensSchema.get_spark_df( spark, @@ -560,3 +571,109 @@ def extract_movielens(size, rating_path, item_path, zip_path): shutil.copyfileobj(zf, f) with z.open(DATA_FORMAT[size].item_path) as zf, open(item_path, "wb") as f: shutil.copyfileobj(zf, f) + + +class MockMovielensSchema(pa.SchemaModel): + """ + Mock dataset schema to generate fake data for testing purpose. + This schema is configured to mimic the Movielens dataset + + http://files.grouplens.org/datasets/movielens/ml-100k/ + + Dataset schema and generation is configured using pandera. + Please see https://pandera.readthedocs.io/en/latest/schema_models.html + for more information. 
+ """ + # Some notebooks will do a cross join with userID and itemID, + # a sparse range for these IDs can slow down the notebook tests + userID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) + itemID: Series[int] = Field(in_range={"min_value": 1, "max_value": 10}) + rating: Series[float] = Field(in_range={"min_value": 1, "max_value": 5}) + timestamp: Series[int] + title: Series[str] = Field(eq="foo") + genre: Series[str] = Field(eq="genreA|0") + + @classmethod + def get_df( + cls, + size: int = 3, seed: int = 100, + keep_first_n_cols: Optional[int] = None, + keep_title_col: bool = False, keep_genre_col: bool = False, + ) -> pd.DataFrame: + """Return fake movielens dataset as a Pandas Dataframe with the specified number of rows. + + Args: + size (int): number of rows to generate + seed (int, optional): seed for the pseudo-random number generation. Defaults to 100. + keep_first_n_cols (int, optional): keep the first n default movielens columns. + keep_title_col (bool): remove the title column if False. Defaults to False. + keep_genre_col (bool): remove the genre column if False. Defaults to False. + + Returns: + pandas.DataFrame: a mock dataset + """ + schema = cls.to_schema() + if keep_first_n_cols is not None: + if keep_first_n_cols < 1 or keep_first_n_cols > len(DEFAULT_HEADER): + raise ValueError(f"Invalid value for 'keep_first_n_cols': {keep_first_n_cols}. Valid range: [1-{len(DEFAULT_HEADER)}]") + schema = schema.remove_columns(DEFAULT_HEADER[keep_first_n_cols:]) + if not keep_title_col: + schema = schema.remove_columns([DEFAULT_TITLE_COL]) + if not keep_genre_col: + schema = schema.remove_columns([DEFAULT_GENRE_COL]) + + random.seed(seed) + # For more information on data synthesis, see https://pandera.readthedocs.io/en/latest/data_synthesis_strategies.html + return schema.example(size=size) + + @classmethod + def get_spark_df( + cls, + spark, + size: int = 3, seed: int = 100, + keep_title_col: bool = False, keep_genre_col: bool = False, + tmp_path: Optional[str] = None, + ): + """Return fake movielens dataset as a Spark Dataframe with the specified number of rows. + + Args: + spark (SparkSession): spark session to load the dataframe into + size (int): number of rows to generate + seed (int): seed for the pseudo-random number generation. Defaults to 100. + keep_title_col (bool): remove the title column if False. Defaults to False. + keep_genre_col (bool): remove the genre column if False. Defaults to False. + tmp_path (str, optional): path to store files for serialization purposes + when transferring data from python to java. + If None, a temporary path is used instead + + Returns: + pyspark.sql.DataFrame: a mock dataset + """ + pandas_df = cls.get_df(size=size, seed=seed, keep_title_col=True, keep_genre_col=True) + + # generate temp folder + with download_path(tmp_path) as tmp_folder: + filepath = os.path.join(tmp_folder, f"mock_movielens_{size}.csv") + # serialize the pandas.df as a csv to avoid the expensive java <-> python communication + pandas_df.to_csv(filepath, header=False, index=False) + spark_df = spark.read.csv(filepath, schema=cls._get_spark_deserialization_schema()) + # Cache and force trigger action since data-file might be removed.
+ spark_df.cache() + spark_df.count() + + if not keep_title_col: + spark_df = spark_df.drop(DEFAULT_TITLE_COL) + if not keep_genre_col: + spark_df = spark_df.drop(DEFAULT_GENRE_COL) + return spark_df + + @classmethod + def _get_spark_deserialization_schema(cls): + return StructType([ + StructField(DEFAULT_USER_COL, IntegerType()), + StructField(DEFAULT_ITEM_COL, IntegerType()), + StructField(DEFAULT_RATING_COL, FloatType()), + StructField(DEFAULT_TIMESTAMP_COL, StringType()), + StructField(DEFAULT_TITLE_COL, StringType()), + StructField(DEFAULT_GENRE_COL, StringType()), + ]) diff --git a/tests/unit/recommenders/datasets/mock/__init__.py b/tests/unit/recommenders/datasets/mock/__init__.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/tests/unit/recommenders/datasets/mock/test_movielens.py b/tests/unit/recommenders/datasets/mock/test_movielens.py deleted file mode 100644 index bff8e05f62..0000000000 --- a/tests/unit/recommenders/datasets/mock/test_movielens.py +++ /dev/null @@ -1,86 +0,0 @@ -import os - -from recommenders.datasets.mock.movielens import MockMovielensSchema -from recommenders.datasets.movielens import DEFAULT_HEADER -from recommenders.utils.constants import ( - DEFAULT_GENRE_COL, - DEFAULT_TITLE_COL, -) - -import pytest -import pandas -from pytest_mock import MockerFixture - - -@pytest.mark.parametrize("size", [10, 100]) -def test_mock_movielens_schema__has_default_col_names(size): - df = MockMovielensSchema.example(size=size) - for col_name in DEFAULT_HEADER: - assert col_name in df.columns - - -@pytest.mark.parametrize("keep_first_n_cols", [1, 2, 3, 4]) -def test_mock_movielens_schema__get_df_remove_default_col__return_success(keep_first_n_cols): - df = MockMovielensSchema.get_df(size=3, keep_first_n_cols=keep_first_n_cols) - assert len(df) > 0 - assert len(df.columns) == keep_first_n_cols - - -@pytest.mark.parametrize("keep_first_n_cols", [-1, 0, 100]) -def test_mock_movielens_schema__get_df_invalid_param__return_failure(keep_first_n_cols): - with pytest.raises(ValueError, match=r"Invalid value.*"): - MockMovielensSchema.get_df(size=3, keep_first_n_cols=keep_first_n_cols) - - -@pytest.mark.parametrize("keep_genre_col", [True, False]) -@pytest.mark.parametrize("keep_title_col", [True, False]) -@pytest.mark.parametrize("keep_first_n_cols", [None, 2]) -@pytest.mark.parametrize("seed", [-1]) # seed for pseudo-random # generation -@pytest.mark.parametrize("size", [0, 3, 10]) -def test_mock_movielens_schema__get_df__return_success(size, seed, keep_first_n_cols, keep_title_col, keep_genre_col): - df = MockMovielensSchema.get_df( - size=size, seed=seed, - keep_first_n_cols=keep_first_n_cols, - keep_title_col=keep_title_col, keep_genre_col=keep_genre_col - ) - assert type(df) == pandas.DataFrame - assert len(df) == size - - if keep_title_col: - assert len(df[DEFAULT_TITLE_COL]) == size - if keep_genre_col: - assert len(df[DEFAULT_GENRE_COL]) == size - - -@pytest.mark.spark -@pytest.mark.parametrize("keep_genre_col", [True, False]) -@pytest.mark.parametrize("keep_title_col", [True, False]) -@pytest.mark.parametrize("seed", [101]) # seed for pseudo-random # generation -@pytest.mark.parametrize("size", [0, 3, 10]) -def test_mock_movielens_schema__get_spark_df__return_success(spark, size, seed, keep_title_col, keep_genre_col): - df = MockMovielensSchema.get_spark_df(spark, size=size, seed=seed, keep_title_col=keep_title_col, keep_genre_col=keep_genre_col) - assert df.count() == size - - if keep_title_col: - assert df.schema[DEFAULT_TITLE_COL] - if keep_genre_col: - 
assert df.schema[DEFAULT_GENRE_COL] - - -@pytest.mark.spark -def test_mock_movielens_schema__get_spark_df__store_tmp_file(spark, tmp_path): - data_size = 3 - MockMovielensSchema.get_spark_df(spark, size=data_size, tmp_path=tmp_path) - assert os.path.exists(os.path.join(tmp_path, f"mock_movielens_{data_size}.csv")) - - - -@pytest.mark.spark -def test_mock_movielens_schema__get_spark_df__data_serialization_default_param(spark, mocker: MockerFixture): - data_size = 3 - to_csv_spy = mocker.spy(pandas.DataFrame, "to_csv") - - df = MockMovielensSchema.get_spark_df(spark, size=data_size) - # assertions - to_csv_spy.assert_called_once() - assert df.count() == data_size diff --git a/tests/unit/recommenders/datasets/test_movielens.py b/tests/unit/recommenders/datasets/test_movielens.py index ddba43a580..d8f12771f9 100644 --- a/tests/unit/recommenders/datasets/test_movielens.py +++ b/tests/unit/recommenders/datasets/test_movielens.py @@ -1,10 +1,87 @@ -from recommenders.datasets.movielens import DATA_FORMAT, MOCK_DATA_FORMAT +import os +import pandas +import pytest + +from recommenders.datasets.movielens import MockMovielensSchema from recommenders.datasets.movielens import load_pandas_df, load_spark_df +from recommenders.datasets.movielens import DATA_FORMAT, MOCK_DATA_FORMAT, DEFAULT_HEADER from recommenders.utils.constants import DEFAULT_GENRE_COL, DEFAULT_TITLE_COL -import pandas -import pytest from pandas.core.series import Series +from pytest_mock import MockerFixture + + +@pytest.mark.parametrize("size", [10, 100]) +def test_mock_movielens_schema__has_default_col_names(size): + df = MockMovielensSchema.example(size=size) + for col_name in DEFAULT_HEADER: + assert col_name in df.columns + + +@pytest.mark.parametrize("keep_first_n_cols", [1, 2, 3, 4]) +def test_mock_movielens_schema__get_df_remove_default_col__return_success(keep_first_n_cols): + df = MockMovielensSchema.get_df(size=3, keep_first_n_cols=keep_first_n_cols) + assert len(df) > 0 + assert len(df.columns) == keep_first_n_cols + + +@pytest.mark.parametrize("keep_first_n_cols", [-1, 0, 100]) +def test_mock_movielens_schema__get_df_invalid_param__return_failure(keep_first_n_cols): + with pytest.raises(ValueError, match=r"Invalid value.*"): + MockMovielensSchema.get_df(size=3, keep_first_n_cols=keep_first_n_cols) + + +@pytest.mark.parametrize("keep_genre_col", [True, False]) +@pytest.mark.parametrize("keep_title_col", [True, False]) +@pytest.mark.parametrize("keep_first_n_cols", [None, 2]) +@pytest.mark.parametrize("seed", [-1]) # seed for pseudo-random number generation +@pytest.mark.parametrize("size", [0, 3, 10]) +def test_mock_movielens_schema__get_df__return_success(size, seed, keep_first_n_cols, keep_title_col, keep_genre_col): + df = MockMovielensSchema.get_df( + size=size, seed=seed, + keep_first_n_cols=keep_first_n_cols, + keep_title_col=keep_title_col, keep_genre_col=keep_genre_col + ) + assert type(df) == pandas.DataFrame + assert len(df) == size + + if keep_title_col: + assert len(df[DEFAULT_TITLE_COL]) == size + if keep_genre_col: + assert len(df[DEFAULT_GENRE_COL]) == size + + +@pytest.mark.spark +@pytest.mark.parametrize("keep_genre_col", [True, False]) +@pytest.mark.parametrize("keep_title_col", [True, False]) +@pytest.mark.parametrize("seed", [101]) # seed for pseudo-random number generation +@pytest.mark.parametrize("size", [0, 3, 10]) +def test_mock_movielens_schema__get_spark_df__return_success(spark, size, seed,
keep_title_col=keep_title_col, keep_genre_col=keep_genre_col) + assert df.count() == size + + if keep_title_col: + assert df.schema[DEFAULT_TITLE_COL] + if keep_genre_col: + assert df.schema[DEFAULT_GENRE_COL] + + +@pytest.mark.spark +def test_mock_movielens_schema__get_spark_df__store_tmp_file(spark, tmp_path): + data_size = 3 + MockMovielensSchema.get_spark_df(spark, size=data_size, tmp_path=tmp_path) + assert os.path.exists(os.path.join(tmp_path, f"mock_movielens_{data_size}.csv")) + + +@pytest.mark.spark +def test_mock_movielens_schema__get_spark_df__data_serialization_default_param(spark, mocker: MockerFixture): + data_size = 3 + to_csv_spy = mocker.spy(pandas.DataFrame, "to_csv") + + df = MockMovielensSchema.get_spark_df(spark, size=data_size) + # assertions + to_csv_spy.assert_called_once() + assert df.count() == data_size def test_mock_movielens_data__no_name_collision(): From 33c05cdeb562f982fd5263dfb833f4eb7a6c56c1 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Tue, 5 Oct 2021 21:07:41 +0000 Subject: [PATCH 24/27] Remove fake_movielens marker --- recommenders/datasets/movielens.py | 4 +-- tests/unit/examples/test_notebooks_pyspark.py | 26 +++++++++---------- tests/unit/examples/test_notebooks_python.py | 12 +++------ tox.ini | 1 - 4 files changed, 19 insertions(+), 24 deletions(-) diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py index 88b67322d4..d054bc64fb 100644 --- a/recommenders/datasets/movielens.py +++ b/recommenders/datasets/movielens.py @@ -36,8 +36,8 @@ try: import pandera as pa - from pandera.typing import Series from pandera import Field + from pandera.typing import Series except ImportError: pass # so the environment without recommender['dev'] doesn't break @@ -113,7 +113,7 @@ def item_has_header(self): # Fake data for testing only MOCK_DATA_FORMAT = { - "mock100": {"size": 100, "seed": 0}, + "mock100": {"size": 100, "seed": 6}, } # 100K data genres index to string mapper. For 1m, 10m, and 20m, the genres labels are already in the dataset. 
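A minimal usage sketch of the path the "mock100" entry above enables (an illustration, not part of the diff; it assumes the package is installed with the dev extra so that pandera's synthesis strategies are available):

from recommenders.datasets.movielens import load_pandas_df

# "mock100" skips the download path entirely and dispatches to
# MockMovielensSchema.get_df, with size=100 and seed=6 supplied
# from the MOCK_DATA_FORMAT entry above.
df = load_pandas_df(size="mock100")
assert len(df) == 100
print(df.head())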
diff --git a/tests/unit/examples/test_notebooks_pyspark.py b/tests/unit/examples/test_notebooks_pyspark.py index 46691f3885..15a5a8ad7c 100644 --- a/tests/unit/examples/test_notebooks_pyspark.py +++ b/tests/unit/examples/test_notebooks_pyspark.py @@ -30,16 +30,14 @@ def test_data_split_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks @pytest.mark.spark -@pytest.mark.fake_movielens @pytest.mark.skipif( sys.platform == "win32", reason="Takes 2764.50s in Windows, while in Linux 124.35s" ) -@pytest.mark.parametrize("data_size", ["mock100"]) -def test_als_deep_dive_runs(notebooks, output_notebook, kernel_name, data_size): +def test_als_deep_dive_runs(notebooks, output_notebook, kernel_name): notebook_path = notebooks["als_deep_dive"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, parameters=dict( - MOVIELENS_DATA_SIZE=data_size, + MOVIELENS_DATA_SIZE="mock100", COL_USER=DEFAULT_USER_COL, COL_ITEM=DEFAULT_ITEM_COL, COL_RATING=DEFAULT_RATING_COL, @@ -58,29 +56,31 @@ def test_evaluation_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks @pytest.mark.spark -@pytest.mark.fake_movielens -@pytest.mark.parametrize("data_size", ["mock100"]) -def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name, data_size): +def test_evaluation_diversity_runs(notebooks, output_notebook, kernel_name): notebook_path = notebooks["evaluation_diversity"] - pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, - parameters=dict(TOP_K=10, MOVIELENS_DATA_SIZE=data_size)) + pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, + parameters=dict( + TOP_K=10, + MOVIELENS_DATA_SIZE="mock100", + COL_USER=DEFAULT_USER_COL, + COL_ITEM=DEFAULT_ITEM_COL, + COL_RATING=DEFAULT_RATING_COL, + )) @pytest.mark.notebooks @pytest.mark.spark -@pytest.mark.fake_movielens @pytest.mark.skipif( sys.platform == "win32", reason="Takes 2409.69s in Windows, while in Linux 138.30s" ) -@pytest.mark.parametrize("data_size", ["mock100"]) -def test_spark_tuning(notebooks, output_notebook, kernel_name, data_size): +def test_spark_tuning(notebooks, output_notebook, kernel_name): notebook_path = notebooks["spark_tuning"] pm.execute_notebook( notebook_path, output_notebook, kernel_name=kernel_name, parameters=dict( - MOVIELENS_DATA_SIZE=data_size, + MOVIELENS_DATA_SIZE="mock100", NUMBER_CORES="*", NUMBER_ITERATIONS=3, SUBSET_RATIO=0.5, diff --git a/tests/unit/examples/test_notebooks_python.py b/tests/unit/examples/test_notebooks_python.py index 021d80fdc3..e9cda6810e 100644 --- a/tests/unit/examples/test_notebooks_python.py +++ b/tests/unit/examples/test_notebooks_python.py @@ -50,12 +50,10 @@ def test_baseline_deep_dive_runs(notebooks, output_notebook, kernel_name): @pytest.mark.notebooks -@pytest.mark.fake_movielens -@pytest.mark.parametrize("data_size", ["mock100"]) -def test_surprise_deep_dive_runs(notebooks, output_notebook, kernel_name, data_size): +def test_surprise_deep_dive_runs(notebooks, output_notebook, kernel_name): notebook_path = notebooks["surprise_svd_deep_dive"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, - parameters=dict(MOVIELENS_DATA_SIZE=data_size)) + parameters=dict(MOVIELENS_DATA_SIZE="mock100")) @pytest.mark.notebooks @@ -101,12 +99,10 @@ def test_wikidata_runs(notebooks, output_notebook, kernel_name, tmp): @pytest.mark.notebooks -@pytest.mark.fake_movielens -@pytest.mark.parametrize("data_size", ["mock100"]) -def test_rlrmc_quickstart_runs(notebooks, output_notebook, 
kernel_name, data_size): +def test_rlrmc_quickstart_runs(notebooks, output_notebook, kernel_name): notebook_path = notebooks["rlrmc_quickstart"] pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, - parameters=dict(rank_parameter=2, MOVIELENS_DATA_SIZE=data_size)) + parameters=dict(rank_parameter=2, MOVIELENS_DATA_SIZE="mock100")) @pytest.mark.notebooks diff --git a/tox.ini b/tox.ini index 7ede574a2e..815e06dc14 100644 --- a/tox.ini +++ b/tox.ini @@ -66,7 +66,6 @@ markers = gpu: mark a test as gpu test spark: mark a test as spark test vw: mark a test as vowpal wabbit test - fake_movielens: mark a test that uses the fake dataset instead testpaths = tests addopts = From 84d83e2e7d2dfe19440df4e558e5c469ecc96f56 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 8 Oct 2021 14:28:53 +0000 Subject: [PATCH 25/27] Add pandera as a core dependency --- recommenders/datasets/movielens.py | 11 ++++------- setup.py | 2 +- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py index d054bc64fb..cf19874f71 100644 --- a/recommenders/datasets/movielens.py +++ b/recommenders/datasets/movielens.py @@ -7,6 +7,9 @@ import shutil import warnings import pandas as pd +import pandera as pa +from pandera import Field +from pandera.typing import Series from typing import Optional from zipfile import ZipFile from recommenders.datasets.download_utils import maybe_download, download_path @@ -21,6 +24,7 @@ DEFAULT_GENRE_COL, ) + try: from pyspark.sql.types import ( StructType, @@ -34,13 +38,6 @@ except ImportError: pass # so the environment without spark doesn't break -try: - import pandera as pa - from pandera import Field - from pandera.typing import Series -except ImportError: - pass # so the environment without recommender['dev'] doesn't break - class _DataFormat: def __init__( diff --git a/setup.py b/setup.py index c0d7d67377..51a330d65d 100644 --- a/setup.py +++ b/setup.py @@ -45,6 +45,7 @@ "cornac>=1.1.2,<2", "scikit-surprise>=0.19.1,<=1.1.1", "retrying>=1.3.3", + "pandera[strategies]>=0.6.5", # For generating fake datasets ] # shared dependencies @@ -75,7 +76,6 @@ ], "dev": [ "black>=18.6b4,<21", - "pandera[strategies]>=0.6.5", # For generating fake datasets "pytest>=3.6.4", "pytest-cov>=2.12.1", "pytest-mock>=3.6.1", # for access to mock fixtures in pytest From 06ff90124762be8737bc54416e057517cb090d7e Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 8 Oct 2021 14:53:43 +0000 Subject: [PATCH 26/27] Run als quickstart NB on mock100 --- examples/00_quick_start/als_movielens.ipynb | 296 +++++++++--------- tests/unit/examples/test_notebooks_pyspark.py | 8 +- 2 files changed, 158 insertions(+), 146 deletions(-) diff --git a/examples/00_quick_start/als_movielens.ipynb b/examples/00_quick_start/als_movielens.ipynb index 059d7f0ead..1f1b1a0bf0 100644 --- a/examples/00_quick_start/als_movielens.ipynb +++ b/examples/00_quick_start/als_movielens.ipynb @@ -2,46 +2,34 @@ "cells": [ { "cell_type": "markdown", - "metadata": {}, "source": [ "Copyright (c) Microsoft Corporation. All rights reserved.\n", "\n", "Licensed under the MIT License." 
- ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "# Running ALS on MovieLens (PySpark)\n", "\n", "Matrix factorization by [ALS](https://spark.apache.org/docs/latest/api/python/_modules/pyspark/ml/recommendation.html#ALS) (Alternating Least Squares) is a well known collaborative filtering algorithm.\n", "\n", "This notebook provides an example of how to utilize and evaluate ALS PySpark ML (DataFrame-based API) implementation, meant for large-scale distributed datasets. We use a smaller dataset in this example to run ALS efficiently on multiple cores of a [Data Science Virtual Machine](https://azure.microsoft.com/en-gb/services/virtual-machines/data-science-virtual-machines/)." - ] + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "**Note**: This notebook requires a PySpark environment to run properly. Please follow the steps in [SETUP.md](../../SETUP.md) to install the PySpark environment." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", - "[GCC 7.3.0]\n", - "Spark version: 2.3.1\n" - ] - } - ], "source": [ "# set the environment path to find Recommenders\n", "import sys\n", @@ -61,74 +49,105 @@ "\n", "print(\"System version: {}\".format(sys.version))\n", "print(\"Spark version: {}\".format(pyspark.__version__))\n" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "System version: 3.6.8 |Anaconda, Inc.| (default, Dec 30 2018, 01:22:34) \n", + "[GCC 7.3.0]\n", + "Spark version: 2.3.1\n" + ] + } + ], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "Set the default parameters." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 2, - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], "source": [ "# top k items to recommend\n", "TOP_K = 10\n", "\n", "# Select MovieLens data size: 100k, 1m, 10m, or 20m\n", - "MOVIELENS_DATA_SIZE = '100k'" - ] + "MOVIELENS_DATA_SIZE = '100k'\n", + "\n", + "# Column names for the dataset\n", + "COL_USER = \"UserId\"\n", + "COL_ITEM = \"MovieId\"\n", + "COL_RATING = \"Rating\"\n", + "COL_TIMESTAMP = \"Timestamp\"" + ], + "outputs": [], + "metadata": { + "tags": [ + "parameters" + ] + } }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 0. Set up Spark context\n", "\n", "The following settings work well for debugging locally on VM - change when running on a cluster. We set up a giant single executor with many threads and specify memory cap. " - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 4, - "metadata": {}, - "outputs": [], "source": [ "# the following settings work well for debugging locally on VM - change when running on a cluster\n", "# set up a giant single executor with many threads and specify memory cap\n", "spark = start_or_get_spark(\"ALS PySpark\", memory=\"16g\")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 1. 
Download the MovieLens dataset" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 5, - "metadata": {}, + "source": [ + "# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n", + "schema = StructType(\n", + " (\n", + " StructField(COL_USER, IntegerType()),\n", + " StructField(COL_ITEM, IntegerType()),\n", + " StructField(COL_RATING, FloatType()),\n", + " StructField(COL_TIMESTAMP, LongType()),\n", + " )\n", + ")\n", + "\n", + "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema)\n", + "data.show()" + ], "outputs": [ { - "name": "stderr", "output_type": "stream", + "name": "stderr", "text": [ "100%|██████████| 4.81k/4.81k [00:00<00:00, 19.9kKB/s]\n" ] }, { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+-------+------+---------+\n", "|UserId|MovieId|Rating|Timestamp|\n", @@ -159,68 +178,53 @@ ] } ], - "source": [ - "# Note: The DataFrame-based API for ALS currently only supports integers for user and item ids.\n", - "schema = StructType(\n", - " (\n", - " StructField(\"UserId\", IntegerType()),\n", - " StructField(\"MovieId\", IntegerType()),\n", - " StructField(\"Rating\", FloatType()),\n", - " StructField(\"Timestamp\", LongType()),\n", - " )\n", - ")\n", - "\n", - "data = movielens.load_spark_df(spark, size=MOVIELENS_DATA_SIZE, schema=schema)\n", - "data.show()" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 2. Split the data using the Spark random splitter provided in utilities" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 6, - "metadata": {}, + "source": [ + "train, test = spark_random_split(data, ratio=0.75, seed=123)\n", + "print (\"N train\", train.cache().count())\n", + "print (\"N test\", test.cache().count())" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "N train 75193\n", "N test 24807\n" ] } ], - "source": [ - "train, test = spark_random_split(data, ratio=0.75, seed=123)\n", - "print (\"N train\", train.cache().count())\n", - "print (\"N test\", test.cache().count())" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 3. Train the ALS model on the training data, and get the top-k recommendations for our testing data\n", "\n", "To predict movie ratings, we use the rating data in the training set as users' explicit feedback. The hyperparameters used in building the model are referenced from [here](http://mymedialite.net/examples/datasets.html). We do not constrain the latent factors (`nonnegative = False`) in order to allow for both positive and negative preferences towards movies.\n", "Timing will vary depending on the machine being used to train." 
- ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 7, - "metadata": {}, - "outputs": [], "source": [ "header = {\n", - " \"userCol\": \"UserId\",\n", - " \"itemCol\": \"MovieId\",\n", - " \"ratingCol\": \"Rating\",\n", + " \"userCol\": COL_USER,\n", + " \"itemCol\": COL_ITEM,\n", + " \"ratingCol\": COL_RATING,\n", "}\n", "\n", "\n", @@ -234,84 +238,88 @@ " seed=42,\n", " **header\n", ")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 8, - "metadata": {}, + "source": [ + "with Timer() as train_time:\n", + " model = als.fit(train)\n", + "\n", + "print(\"Took {} seconds for training.\".format(train_time.interval))" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Took 3.2410509269684553 seconds for training.\n" ] } ], - "source": [ - "with Timer() as train_time:\n", - " model = als.fit(train)\n", - "\n", - "print(\"Took {} seconds for training.\".format(train_time.interval))" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "In the movie recommendation use case, recommending movies that have been rated by the users do not make sense. Therefore, the rated movies are removed from the recommended items.\n", "\n", "In order to achieve this, we recommend all movies to all users, and then remove the user-movie pairs that exist in the training dataset." - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Took 10.559875106438994 seconds for prediction.\n" - ] - } - ], "source": [ "with Timer() as test_time:\n", "\n", " # Get the cross join of all user-item pairs and score them.\n", - " users = train.select('UserId').distinct()\n", - " items = train.select('MovieId').distinct()\n", + " users = train.select(COL_USER).distinct()\n", + " items = train.select(COL_ITEM).distinct()\n", " user_item = users.crossJoin(items)\n", " dfs_pred = model.transform(user_item)\n", "\n", " # Remove seen items.\n", " dfs_pred_exclude_train = dfs_pred.alias(\"pred\").join(\n", " train.alias(\"train\"),\n", - " (dfs_pred['UserId'] == train['UserId']) & (dfs_pred['MovieId'] == train['MovieId']),\n", + " (dfs_pred[COL_USER] == train[COL_USER]) & (dfs_pred[COL_ITEM] == train[COL_ITEM]),\n", " how='outer'\n", " )\n", "\n", - " top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[\"train.Rating\"].isNull()) \\\n", - " .select('pred.' + 'UserId', 'pred.' + 'MovieId', 'pred.' + \"prediction\")\n", + " top_all = dfs_pred_exclude_train.filter(dfs_pred_exclude_train[f\"train.{COL_RATING}\"].isNull()) \\\n", + " .select('pred.' + COL_USER, 'pred.' + COL_ITEM, 'pred.' 
+ \"prediction\")\n", "\n", " # In Spark, transformations are lazy evaluation\n", " # Use an action to force execute and measure the test time \n", " top_all.cache().count()\n", "\n", "print(\"Took {} seconds for prediction.\".format(test_time.interval))" - ] + ], + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Took 10.559875106438994 seconds for prediction.\n" + ] + } + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 10, - "metadata": {}, + "source": [ + "top_all.show()" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+-------+----------+\n", "|UserId|MovieId|prediction|\n", @@ -342,36 +350,41 @@ ] } ], - "source": [ - "top_all.show()" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 4. Evaluate how well ALS performs" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 11, - "metadata": {}, - "outputs": [], "source": [ - "rank_eval = SparkRankingEvaluation(test, top_all, k = TOP_K, col_user=\"UserId\", col_item=\"MovieId\", \n", - " col_rating=\"Rating\", col_prediction=\"prediction\", \n", + "rank_eval = SparkRankingEvaluation(test, top_all, k = TOP_K, col_user=COL_USER, col_item=COL_ITEM, \n", + " col_rating=COL_RATING, col_prediction=\"prediction\", \n", " relevancy_method=\"top_k\")" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 12, - "metadata": {}, + "source": [ + "print(\"Model:\\tALS\",\n", + " \"Top K:\\t%d\" % rank_eval.k,\n", + " \"MAP:\\t%f\" % rank_eval.map_at_k(),\n", + " \"NDCG:\\t%f\" % rank_eval.ndcg_at_k(),\n", + " \"Precision@K:\\t%f\" % rank_eval.precision_at_k(),\n", + " \"Recall@K:\\t%f\" % rank_eval.recall_at_k(), sep='\\n')" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Model:\tALS\n", "Top K:\t10\n", @@ -382,30 +395,27 @@ ] } ], - "source": [ - "print(\"Model:\\tALS\",\n", - " \"Top K:\\t%d\" % rank_eval.k,\n", - " \"MAP:\\t%f\" % rank_eval.map_at_k(),\n", - " \"NDCG:\\t%f\" % rank_eval.ndcg_at_k(),\n", - " \"Precision@K:\\t%f\" % rank_eval.precision_at_k(),\n", - " \"Recall@K:\\t%f\" % rank_eval.recall_at_k(), sep='\\n')" - ] + "metadata": {} }, { "cell_type": "markdown", - "metadata": {}, "source": [ "### 5. 
Evaluate rating prediction" - ] + ], + "metadata": {} }, { "cell_type": "code", "execution_count": 13, - "metadata": {}, + "source": [ + "# Generate predicted ratings.\n", + "prediction = model.transform(test)\n", + "prediction.cache().show()\n" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "+------+-------+------+---------+----------+\n", "|UserId|MovieId|Rating|Timestamp|prediction|\n", @@ -436,20 +446,25 @@ ] } ], - "source": [ - "# Generate predicted ratings.\n", - "prediction = model.transform(test)\n", - "prediction.cache().show()\n" - ] + "metadata": {} }, { "cell_type": "code", "execution_count": 14, - "metadata": {}, + "source": [ + "rating_eval = SparkRatingEvaluation(test, prediction, col_user=COL_USER, col_item=COL_ITEM, \n", + " col_rating=COL_RATING, col_prediction=\"prediction\")\n", + "\n", + "print(\"Model:\\tALS rating prediction\",\n", + " \"RMSE:\\t%f\" % rating_eval.rmse(),\n", + " \"MAE:\\t%f\" % rating_eval.mae(),\n", + " \"Explained variance:\\t%f\" % rating_eval.exp_var(),\n", + " \"R squared:\\t%f\" % rating_eval.rsquared(), sep='\\n')" + ], "outputs": [ { - "name": "stdout", "output_type": "stream", + "name": "stdout", "text": [ "Model:\tALS rating prediction\n", "RMSE:\t0.967296\n", @@ -459,22 +474,11 @@ ] } ], - "source": [ - "rating_eval = SparkRatingEvaluation(test, prediction, col_user=\"UserId\", col_item=\"MovieId\", \n", - " col_rating=\"Rating\", col_prediction=\"prediction\")\n", - "\n", - "print(\"Model:\\tALS rating prediction\",\n", - " \"RMSE:\\t%f\" % rating_eval.rmse(),\n", - " \"MAE:\\t%f\" % rating_eval.mae(),\n", - " \"Explained variance:\\t%f\" % rating_eval.exp_var(),\n", - " \"R squared:\\t%f\" % rating_eval.rsquared(), sep='\\n')" - ] + "metadata": {} }, { "cell_type": "code", "execution_count": null, - "metadata": {}, - "outputs": [], "source": [ "if is_jupyter():\n", " # Record results with papermill for tests\n", @@ -490,17 +494,19 @@ " sb.glue(\"rsquared\", rating_eval.rsquared())\n", " sb.glue(\"train_time\", train_time.interval)\n", " sb.glue(\"test_time\", test_time.interval)" - ] + ], + "outputs": [], + "metadata": {} }, { "cell_type": "code", "execution_count": 17, - "metadata": {}, - "outputs": [], "source": [ "# cleanup spark instance\n", "spark.stop()" - ] + ], + "outputs": [], + "metadata": {} } ], "metadata": { diff --git a/tests/unit/examples/test_notebooks_pyspark.py b/tests/unit/examples/test_notebooks_pyspark.py index 15a5a8ad7c..6ccd970492 100644 --- a/tests/unit/examples/test_notebooks_pyspark.py +++ b/tests/unit/examples/test_notebooks_pyspark.py @@ -18,7 +18,13 @@ ) def test_als_pyspark_runs(notebooks, output_notebook, kernel_name): notebook_path = notebooks["als_pyspark"] - pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name) + pm.execute_notebook(notebook_path, output_notebook, kernel_name=kernel_name, + parameters=dict( + MOVIELENS_DATA_SIZE="mock100", + COL_USER=DEFAULT_USER_COL, + COL_ITEM=DEFAULT_ITEM_COL, + COL_RATING=DEFAULT_RATING_COL, + )) @pytest.mark.notebooks From a097dc988bea8438f26363d2639495eedbf83218 Mon Sep 17 00:00:00 2001 From: Jianjie Liu Date: Fri, 8 Oct 2021 16:08:59 +0000 Subject: [PATCH 27/27] Revert "Add pandera as a core dependency" This reverts commit 84d83e2e7d2dfe19440df4e558e5c469ecc96f56. 
--- recommenders/datasets/movielens.py | 11 +++++++---- setup.py | 2 +- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/recommenders/datasets/movielens.py b/recommenders/datasets/movielens.py index cf19874f71..d054bc64fb 100644 --- a/recommenders/datasets/movielens.py +++ b/recommenders/datasets/movielens.py @@ -7,9 +7,6 @@ import shutil import warnings import pandas as pd -import pandera as pa -from pandera import Field -from pandera.typing import Series from typing import Optional from zipfile import ZipFile from recommenders.datasets.download_utils import maybe_download, download_path @@ -24,7 +21,6 @@ DEFAULT_GENRE_COL, ) - try: from pyspark.sql.types import ( StructType, @@ -38,6 +34,13 @@ except ImportError: pass # so the environment without spark doesn't break +try: + import pandera as pa + from pandera import Field + from pandera.typing import Series +except ImportError: + pass # so the environment without recommender['dev'] doesn't break + class _DataFormat: def __init__( diff --git a/setup.py b/setup.py index 51a330d65d..c0d7d67377 100644 --- a/setup.py +++ b/setup.py @@ -45,7 +45,6 @@ "cornac>=1.1.2,<2", "scikit-surprise>=0.19.1,<=1.1.1", "retrying>=1.3.3", - "pandera[strategies]>=0.6.5", # For generating fake datasets ] # shared dependencies @@ -76,6 +75,7 @@ ], "dev": [ "black>=18.6b4,<21", + "pandera[strategies]>=0.6.5", # For generating fake datasets "pytest>=3.6.4", "pytest-cov>=2.12.1", "pytest-mock>=3.6.1", # for access to mock fixtures in pytest
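A short Spark-side sketch of the end state of the series (an illustration, not part of the patches; it assumes an install with the spark and dev extras, and the session setup below is plain PySpark rather than an API from this repository):

from pyspark.sql import SparkSession
from recommenders.datasets.movielens import load_spark_df

# Any local SparkSession will do for the synthesized dataset.
spark = SparkSession.builder.master("local[*]").appName("mock-movielens").getOrCreate()

# size="mock100" synthesizes 100 rows locally; under the hood the Spark path
# round-trips through a temporary CSV inside MockMovielensSchema.get_spark_df.
spark_df = load_spark_df(spark, size="mock100")
assert spark_df.count() == 100
spark_df.show(5)
spark.stop()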