From a0be65232cc1b929b919268310d4474c4274521c Mon Sep 17 00:00:00 2001
From: poteman <946691288@qq.com>
Date: Sat, 14 May 2022 10:56:12 +0800
Subject: [PATCH] MovieLens: data_process, recall, recall and rank.
---
.../datasets/MovieLens_data_process.ipynb | 1051 +++++++++++++++++
.../MovieLens_AutoX_popular_recall.ipynb | 693 +++++++++++
.../MovieLens_AutoX_recall_and_rank.ipynb | 893 ++++++++++++++
3 files changed, 2637 insertions(+)
create mode 100644 autox/autox_recommend/datasets/MovieLens_data_process.ipynb
create mode 100644 autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_popular_recall.ipynb
create mode 100644 autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_recall_and_rank.ipynb
diff --git a/autox/autox_recommend/datasets/MovieLens_data_process.ipynb b/autox/autox_recommend/datasets/MovieLens_data_process.ipynb
new file mode 100644
index 0000000..ae3d10f
--- /dev/null
+++ b/autox/autox_recommend/datasets/MovieLens_data_process.ipynb
@@ -0,0 +1,1051 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## import包"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:12:10.125377Z",
+ "start_time": "2022-05-13T03:12:09.671666Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/pandas/compat/_optional.py:138: UserWarning: Pandas requires version '2.7.0' or newer of 'numexpr' (version '2.6.9' currently installed).\n",
+ " warnings.warn(msg, UserWarning)\n"
+ ]
+ }
+ ],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import os\n",
+ "from tqdm import tqdm"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 处理后的数据包含: \n",
+ "```\n",
+ "1. 交互表(必须, user-item的交互记录, 包括训练集和测试集, 测试集中所有记录的时间都在训练集之后);\n",
+ "2. user表(可选);\n",
+ "3. item表(可选).\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 读数据"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:12:13.926483Z",
+ "start_time": "2022-05-13T03:12:13.922927Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "path = './ml-25m'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:12:21.478010Z",
+ "start_time": "2022-05-13T03:12:21.463303Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['tags.csv',\n",
+ " 'links.csv',\n",
+ " 'README.txt',\n",
+ " 'ratings.csv',\n",
+ " 'genome-tags.csv',\n",
+ " 'genome-scores.csv',\n",
+ " 'movies.csv']"
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "os.listdir(path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:12:39.533274Z",
+ "start_time": "2022-05-13T03:12:22.064321Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "tags = pd.read_csv(f'{path}/tags.csv')\n",
+ "links = pd.read_csv(f'{path}/links.csv')\n",
+ "ratings = pd.read_csv(f'{path}/ratings.csv')\n",
+ "genometags = pd.read_csv(f'{path}/genome-tags.csv')\n",
+ "genomescores = pd.read_csv(f'{path}/genome-scores.csv')\n",
+ "movies = pd.read_csv(f'{path}/movies.csv')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 数据处理"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 交互表"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:12:39.681400Z",
+ "start_time": "2022-05-13T03:12:39.669609Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userId | \n",
+ " movieId | \n",
+ " rating | \n",
+ " timestamp | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 296 | \n",
+ " 5.0 | \n",
+ " 1147880044 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 306 | \n",
+ " 3.5 | \n",
+ " 1147868817 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 307 | \n",
+ " 5.0 | \n",
+ " 1147868828 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 665 | \n",
+ " 5.0 | \n",
+ " 1147878820 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 899 | \n",
+ " 3.5 | \n",
+ " 1147868510 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " userId movieId rating timestamp\n",
+ "0 1 296 5.0 1147880044\n",
+ "1 1 306 3.5 1147868817\n",
+ "2 1 307 5.0 1147868828\n",
+ "3 1 665 5.0 1147878820\n",
+ "4 1 899 3.5 1147868510"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ratings.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:12:43.312792Z",
+ "start_time": "2022-05-13T03:12:42.548009Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "((25000095, 4), (3612474, 4))"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ratings.shape, ratings.loc[ratings['rating'] == 5].shape"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:12:45.738446Z",
+ "start_time": "2022-05-13T03:12:45.281508Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "ratings = ratings.loc[ratings['rating'] == 5]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:12:47.017038Z",
+ "start_time": "2022-05-13T03:12:46.939111Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "ratings.drop('rating', axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:13:43.531608Z",
+ "start_time": "2022-05-13T03:13:30.189616Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import datetime\n",
+ "ratings['time'] = ratings['timestamp'].apply(\n",
+ " lambda ts: datetime.datetime.utcfromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S'))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:13:43.894742Z",
+ "start_time": "2022-05-13T03:13:43.738518Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "ratings.drop('timestamp', axis=1, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:13:44.024141Z",
+ "start_time": "2022-05-13T03:13:44.014738Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userId | \n",
+ " movieId | \n",
+ " time | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 296 | \n",
+ " 2006-05-17 15:34:04 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 307 | \n",
+ " 2006-05-17 12:27:08 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 665 | \n",
+ " 2006-05-17 15:13:40 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 1 | \n",
+ " 1237 | \n",
+ " 2006-05-17 12:27:19 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 1 | \n",
+ " 2632 | \n",
+ " 2006-05-17 15:04:08 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " userId movieId time\n",
+ "0 1 296 2006-05-17 15:34:04\n",
+ "2 1 307 2006-05-17 12:27:08\n",
+ "3 1 665 2006-05-17 15:13:40\n",
+ "8 1 1237 2006-05-17 12:27:19\n",
+ "18 1 2632 2006-05-17 15:04:08"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ratings.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:13:47.725445Z",
+ "start_time": "2022-05-13T03:13:46.769086Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "ratings['time'] = pd.to_datetime(ratings['time'])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:13:47.870602Z",
+ "start_time": "2022-05-13T03:13:47.839219Z"
+ },
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(Timestamp('1995-01-09 11:46:49'), Timestamp('2019-11-21 09:06:53'))"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "ratings['time'].min(), ratings['time'].max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:13:51.542431Z",
+ "start_time": "2022-05-13T03:13:51.364166Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "import datetime\n",
+ "data_used_time = datetime.datetime.strptime('2019-11-15 00:00:00', '%Y-%m-%d %H:%M:%S')\n",
+ "\n",
+ "train = ratings.loc[ratings['time'] < data_used_time]\n",
+ "test = ratings.loc[ratings['time'] >= data_used_time]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:13:51.768791Z",
+ "start_time": "2022-05-13T03:13:51.738452Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(Timestamp('1995-01-09 11:46:49'), Timestamp('2019-11-14 23:20:55'))"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "train['time'].min(), train['time'].max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:13:52.119005Z",
+ "start_time": "2022-05-13T03:13:52.113871Z"
+ },
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "(Timestamp('2019-11-15 00:08:42'), Timestamp('2019-11-21 09:06:53'))"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test['time'].min(), test['time'].max()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### item表"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:14:13.442569Z",
+ "start_time": "2022-05-13T03:14:04.946198Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "temp = genomescores.pivot(index='movieId', columns='tagId', values='relevance')\n",
+ "temp = temp.reset_index()\n",
+ "temp.columns = ['movieId'] + ['tag_' + str(i) for i in range(1, 1128+1)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:14:14.519675Z",
+ "start_time": "2022-05-13T03:14:13.752261Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "movies = movies.merge(temp, on = 'movieId', how = 'left')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:14:15.041132Z",
+ "start_time": "2022-05-13T03:14:14.985591Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " movieId | \n",
+ " title | \n",
+ " genres | \n",
+ " tag_1 | \n",
+ " tag_2 | \n",
+ " tag_3 | \n",
+ " tag_4 | \n",
+ " tag_5 | \n",
+ " tag_6 | \n",
+ " tag_7 | \n",
+ " ... | \n",
+ " tag_1119 | \n",
+ " tag_1120 | \n",
+ " tag_1121 | \n",
+ " tag_1122 | \n",
+ " tag_1123 | \n",
+ " tag_1124 | \n",
+ " tag_1125 | \n",
+ " tag_1126 | \n",
+ " tag_1127 | \n",
+ " tag_1128 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " Toy Story (1995) | \n",
+ " Adventure|Animation|Children|Comedy|Fantasy | \n",
+ " 0.02875 | \n",
+ " 0.02375 | \n",
+ " 0.06250 | \n",
+ " 0.07575 | \n",
+ " 0.14075 | \n",
+ " 0.14675 | \n",
+ " 0.06350 | \n",
+ " ... | \n",
+ " 0.04050 | \n",
+ " 0.01425 | \n",
+ " 0.03050 | \n",
+ " 0.03500 | \n",
+ " 0.14125 | \n",
+ " 0.05775 | \n",
+ " 0.03900 | \n",
+ " 0.02975 | \n",
+ " 0.08475 | \n",
+ " 0.02200 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 2 | \n",
+ " Jumanji (1995) | \n",
+ " Adventure|Children|Fantasy | \n",
+ " 0.04125 | \n",
+ " 0.04050 | \n",
+ " 0.06275 | \n",
+ " 0.08275 | \n",
+ " 0.09100 | \n",
+ " 0.06125 | \n",
+ " 0.06925 | \n",
+ " ... | \n",
+ " 0.05250 | \n",
+ " 0.01575 | \n",
+ " 0.01250 | \n",
+ " 0.02000 | \n",
+ " 0.12225 | \n",
+ " 0.03275 | \n",
+ " 0.02100 | \n",
+ " 0.01100 | \n",
+ " 0.10525 | \n",
+ " 0.01975 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 3 | \n",
+ " Grumpier Old Men (1995) | \n",
+ " Comedy|Romance | \n",
+ " 0.04675 | \n",
+ " 0.05550 | \n",
+ " 0.02925 | \n",
+ " 0.08700 | \n",
+ " 0.04750 | \n",
+ " 0.04775 | \n",
+ " 0.04600 | \n",
+ " ... | \n",
+ " 0.06275 | \n",
+ " 0.01950 | \n",
+ " 0.02225 | \n",
+ " 0.02300 | \n",
+ " 0.12200 | \n",
+ " 0.03475 | \n",
+ " 0.01700 | \n",
+ " 0.01800 | \n",
+ " 0.09100 | \n",
+ " 0.01775 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 4 | \n",
+ " Waiting to Exhale (1995) | \n",
+ " Comedy|Drama|Romance | \n",
+ " 0.03425 | \n",
+ " 0.03800 | \n",
+ " 0.04050 | \n",
+ " 0.03100 | \n",
+ " 0.06500 | \n",
+ " 0.03575 | \n",
+ " 0.02900 | \n",
+ " ... | \n",
+ " 0.05325 | \n",
+ " 0.02800 | \n",
+ " 0.01675 | \n",
+ " 0.03875 | \n",
+ " 0.18200 | \n",
+ " 0.07050 | \n",
+ " 0.01625 | \n",
+ " 0.01425 | \n",
+ " 0.08850 | \n",
+ " 0.01500 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " Father of the Bride Part II (1995) | \n",
+ " Comedy | \n",
+ " 0.04300 | \n",
+ " 0.05325 | \n",
+ " 0.03800 | \n",
+ " 0.04100 | \n",
+ " 0.05400 | \n",
+ " 0.06725 | \n",
+ " 0.02775 | \n",
+ " ... | \n",
+ " 0.05350 | \n",
+ " 0.02050 | \n",
+ " 0.01425 | \n",
+ " 0.02550 | \n",
+ " 0.19225 | \n",
+ " 0.02675 | \n",
+ " 0.01625 | \n",
+ " 0.01300 | \n",
+ " 0.08700 | \n",
+ " 0.01600 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 62418 | \n",
+ " 209157 | \n",
+ " We (2018) | \n",
+ " Drama | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 62419 | \n",
+ " 209159 | \n",
+ " Window of the Soul (2001) | \n",
+ " Documentary | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 62420 | \n",
+ " 209163 | \n",
+ " Bad Poems (2018) | \n",
+ " Comedy|Drama | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 62421 | \n",
+ " 209169 | \n",
+ " A Girl Thing (2001) | \n",
+ " (no genres listed) | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 62422 | \n",
+ " 209171 | \n",
+ " Women of Devil's Island (1962) | \n",
+ " Action|Adventure|Drama | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
62423 rows × 1131 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " movieId title \\\n",
+ "0 1 Toy Story (1995) \n",
+ "1 2 Jumanji (1995) \n",
+ "2 3 Grumpier Old Men (1995) \n",
+ "3 4 Waiting to Exhale (1995) \n",
+ "4 5 Father of the Bride Part II (1995) \n",
+ "... ... ... \n",
+ "62418 209157 We (2018) \n",
+ "62419 209159 Window of the Soul (2001) \n",
+ "62420 209163 Bad Poems (2018) \n",
+ "62421 209169 A Girl Thing (2001) \n",
+ "62422 209171 Women of Devil's Island (1962) \n",
+ "\n",
+ " genres tag_1 tag_2 tag_3 \\\n",
+ "0 Adventure|Animation|Children|Comedy|Fantasy 0.02875 0.02375 0.06250 \n",
+ "1 Adventure|Children|Fantasy 0.04125 0.04050 0.06275 \n",
+ "2 Comedy|Romance 0.04675 0.05550 0.02925 \n",
+ "3 Comedy|Drama|Romance 0.03425 0.03800 0.04050 \n",
+ "4 Comedy 0.04300 0.05325 0.03800 \n",
+ "... ... ... ... ... \n",
+ "62418 Drama NaN NaN NaN \n",
+ "62419 Documentary NaN NaN NaN \n",
+ "62420 Comedy|Drama NaN NaN NaN \n",
+ "62421 (no genres listed) NaN NaN NaN \n",
+ "62422 Action|Adventure|Drama NaN NaN NaN \n",
+ "\n",
+ " tag_4 tag_5 tag_6 tag_7 ... tag_1119 tag_1120 tag_1121 \\\n",
+ "0 0.07575 0.14075 0.14675 0.06350 ... 0.04050 0.01425 0.03050 \n",
+ "1 0.08275 0.09100 0.06125 0.06925 ... 0.05250 0.01575 0.01250 \n",
+ "2 0.08700 0.04750 0.04775 0.04600 ... 0.06275 0.01950 0.02225 \n",
+ "3 0.03100 0.06500 0.03575 0.02900 ... 0.05325 0.02800 0.01675 \n",
+ "4 0.04100 0.05400 0.06725 0.02775 ... 0.05350 0.02050 0.01425 \n",
+ "... ... ... ... ... ... ... ... ... \n",
+ "62418 NaN NaN NaN NaN ... NaN NaN NaN \n",
+ "62419 NaN NaN NaN NaN ... NaN NaN NaN \n",
+ "62420 NaN NaN NaN NaN ... NaN NaN NaN \n",
+ "62421 NaN NaN NaN NaN ... NaN NaN NaN \n",
+ "62422 NaN NaN NaN NaN ... NaN NaN NaN \n",
+ "\n",
+ " tag_1122 tag_1123 tag_1124 tag_1125 tag_1126 tag_1127 tag_1128 \n",
+ "0 0.03500 0.14125 0.05775 0.03900 0.02975 0.08475 0.02200 \n",
+ "1 0.02000 0.12225 0.03275 0.02100 0.01100 0.10525 0.01975 \n",
+ "2 0.02300 0.12200 0.03475 0.01700 0.01800 0.09100 0.01775 \n",
+ "3 0.03875 0.18200 0.07050 0.01625 0.01425 0.08850 0.01500 \n",
+ "4 0.02550 0.19225 0.02675 0.01625 0.01300 0.08700 0.01600 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "62418 NaN NaN NaN NaN NaN NaN NaN \n",
+ "62419 NaN NaN NaN NaN NaN NaN NaN \n",
+ "62420 NaN NaN NaN NaN NaN NaN NaN \n",
+ "62421 NaN NaN NaN NaN NaN NaN NaN \n",
+ "62422 NaN NaN NaN NaN NaN NaN NaN \n",
+ "\n",
+ "[62423 rows x 1131 columns]"
+ ]
+ },
+ "execution_count": 21,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "movies"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 存档"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:14:33.717315Z",
+ "start_time": "2022-05-13T03:14:33.712050Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "output_path = './MovieLens_AutoX/'\n",
+ "os.makedirs(output_path, exist_ok = True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:16:19.154546Z",
+ "start_time": "2022-05-13T03:15:25.381464Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "train.to_csv(output_path + 'inter_df.csv', index = False)\n",
+ "test.to_csv(output_path + 'test.csv', index = False)\n",
+ "movies.to_csv(output_path + 'item_df.csv', index = False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": true
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_popular_recall.ipynb b/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_popular_recall.ipynb
new file mode 100644
index 0000000..342db5c
--- /dev/null
+++ b/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_popular_recall.ipynb
@@ -0,0 +1,693 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## import包"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:18:09.319781Z",
+ "start_time": "2022-05-13T03:18:09.317031Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.append('../AutoX')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:18:15.567203Z",
+ "start_time": "2022-05-13T03:18:10.482510Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/pandas/compat/_optional.py:138: UserWarning: Pandas requires version '2.7.0' or newer of 'numexpr' (version '2.6.9' currently installed).\n",
+ " warnings.warn(msg, UserWarning)\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:30: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " method='lar', copy_X=True, eps=np.finfo(np.float).eps,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:167: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " method='lar', copy_X=True, eps=np.finfo(np.float).eps,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:284: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1101: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1127: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " eps=np.finfo(np.float).eps, positive=False):\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1362: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1602: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1738: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " eps=np.finfo(np.float).eps, copy_X=True, positive=False):\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/decomposition/online_lda.py:29: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " EPS = np.finfo(np.float).eps\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " dtype=np.int):\n"
+ ]
+ }
+ ],
+ "source": [
+ "from autox import AutoXRecommend"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:18:28.275812Z",
+ "start_time": "2022-05-13T03:18:28.272710Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import os\n",
+ "from tqdm import tqdm"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 读数据"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:19:08.888049Z",
+ "start_time": "2022-05-13T03:19:08.884895Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "path = '~/AutoX/autox/autox_recommend/datasets/MovieLens_AutoX/'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:19:21.230686Z",
+ "start_time": "2022-05-13T03:19:09.561508Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "inter_df = pd.read_csv(path + 'inter_df.csv')\n",
+ "item_df = pd.read_csv(path + 'item_df.csv')\n",
+ "test = pd.read_csv(path + 'test.csv')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:19:40.197258Z",
+ "start_time": "2022-05-13T03:19:40.185844Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userId | \n",
+ " movieId | \n",
+ " time | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 296 | \n",
+ " 2006-05-17 15:34:04 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 307 | \n",
+ " 2006-05-17 12:27:08 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 665 | \n",
+ " 2006-05-17 15:13:40 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1237 | \n",
+ " 2006-05-17 12:27:19 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 2632 | \n",
+ " 2006-05-17 15:04:08 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " userId movieId time\n",
+ "0 1 296 2006-05-17 15:34:04\n",
+ "1 1 307 2006-05-17 12:27:08\n",
+ "2 1 665 2006-05-17 15:13:40\n",
+ "3 1 1237 2006-05-17 12:27:19\n",
+ "4 1 2632 2006-05-17 15:04:08"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inter_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 配置参数"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:19:51.178487Z",
+ "start_time": "2022-05-13T03:19:51.175288Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "uid = 'userId'\n",
+ "iid = 'movieId'\n",
+ "time_col = 'time'\n",
+ "recall_num = 100"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 准备测试集结果"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:19:54.191979Z",
+ "start_time": "2022-05-13T03:19:53.511237Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "('1995-01-09 11:46:49', '2019-11-14 23:20:55')"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inter_df[time_col].min(), inter_df[time_col].max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:19:54.302038Z",
+ "start_time": "2022-05-13T03:19:54.295604Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "('2019-11-15 00:08:42', '2019-11-21 09:06:53')"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test[time_col].min(), test[time_col].max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:20:14.914265Z",
+ "start_time": "2022-05-13T03:20:14.587569Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "assert(test[time_col].min() > inter_df[time_col].max())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:20:16.965803Z",
+ "start_time": "2022-05-13T03:20:16.948163Z"
+ },
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "283it [00:00, 168846.09it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total users in test set: 283\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "positive_items_test = test.groupby([uid])[iid].apply(list)\n",
+ "test_users = positive_items_test.keys()\n",
+ "test_items = []\n",
+ "\n",
+ "for i, user in tqdm(enumerate(test_users)):\n",
+ " test_items.append(positive_items_test[user])\n",
+ " \n",
+ "print(\"Total users in test set:\", len(test_users))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 时间列转化"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:20:52.185539Z",
+ "start_time": "2022-05-13T03:20:51.288515Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "inter_df[time_col] = pd.to_datetime(inter_df[time_col])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 执行AutoX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:21:04.735973Z",
+ "start_time": "2022-05-13T03:21:04.676594Z"
+ },
+ "scrolled": false
+ },
+ "outputs": [],
+ "source": [
+ "autoXRecommend = AutoXRecommend()\n",
+ "\n",
+ "autoXRecommend.fit(inter_df = inter_df, user_df = None, item_df = item_df,\n",
+ " uid = uid, iid = iid, time_col = time_col,\n",
+ " recall_num = recall_num, \n",
+ " mode = 'recalls', recall_method = 'popular')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:21:06.926077Z",
+ "start_time": "2022-05-13T03:21:06.613477Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 283/283 [00:00<00:00, 1025.37it/s]\n"
+ ]
+ }
+ ],
+ "source": [
+ "res = autoXRecommend.transform(test_users)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:21:08.623737Z",
+ "start_time": "2022-05-13T03:21:08.598246Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userId | \n",
+ " prediction | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 4 | \n",
+ " [318, 5952, 7153, 4993, 858, 260, 1196, 79132,... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 606 | \n",
+ " [318, 5952, 7153, 4993, 858, 260, 1196, 79132,... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1746 | \n",
+ " [318, 5952, 7153, 4993, 858, 260, 1196, 79132,... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3409 | \n",
+ " [318, 5952, 7153, 4993, 858, 260, 1196, 79132,... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 4037 | \n",
+ " [318, 5952, 7153, 4993, 858, 260, 1196, 79132,... | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 278 | \n",
+ " 159074 | \n",
+ " [318, 5952, 7153, 4993, 858, 260, 1196, 79132,... | \n",
+ "
\n",
+ " \n",
+ " 279 | \n",
+ " 159388 | \n",
+ " [318, 5952, 7153, 4993, 858, 260, 1196, 79132,... | \n",
+ "
\n",
+ " \n",
+ " 280 | \n",
+ " 159523 | \n",
+ " [318, 5952, 7153, 4993, 858, 260, 1196, 79132,... | \n",
+ "
\n",
+ " \n",
+ " 281 | \n",
+ " 161485 | \n",
+ " [318, 5952, 7153, 4993, 858, 260, 1196, 79132,... | \n",
+ "
\n",
+ " \n",
+ " 282 | \n",
+ " 162190 | \n",
+ " [318, 5952, 7153, 4993, 858, 260, 1196, 79132,... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
283 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " userId prediction\n",
+ "0 4 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n",
+ "1 606 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n",
+ "2 1746 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n",
+ "3 3409 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n",
+ "4 4037 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n",
+ ".. ... ...\n",
+ "278 159074 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n",
+ "279 159388 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n",
+ "280 159523 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n",
+ "281 161485 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n",
+ "282 162190 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n",
+ "\n",
+ "[283 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 20,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "res"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 结果查看"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:21:10.832933Z",
+ "start_time": "2022-05-13T03:21:10.825779Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def apk(actual, predicted, k=12):\n",
+ " if len(predicted)>k:\n",
+ " predicted = predicted[:k]\n",
+ "\n",
+ " score = 0.0\n",
+ " num_hits = 0.0\n",
+ "\n",
+ " for i,p in enumerate(predicted):\n",
+ " if p in actual and p not in predicted[:i]:\n",
+ " num_hits += 1.0\n",
+ " score += num_hits / (i+1.0)\n",
+ "\n",
+ " if not actual:\n",
+ " return 0.0\n",
+ "\n",
+ " return score / min(len(actual), k)\n",
+ "\n",
+ "def mapk(actual, predicted, k=12):\n",
+ " return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T03:21:22.872923Z",
+ "start_time": "2022-05-13T03:21:22.866137Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "mAP Score on Test set: 0.0473412592184677\n"
+ ]
+ }
+ ],
+ "source": [
+ "outputs = res['prediction']\n",
+ "print(\"mAP Score on Test set:\", mapk(test_items, outputs))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_recall_and_rank.ipynb b/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_recall_and_rank.ipynb
new file mode 100644
index 0000000..b3bfe9c
--- /dev/null
+++ b/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_recall_and_rank.ipynb
@@ -0,0 +1,893 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## import包"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:24.337522Z",
+ "start_time": "2022-05-13T11:20:24.334714Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "sys.path.append('../AutoX')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:29.549828Z",
+ "start_time": "2022-05-13T11:20:24.339756Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/pandas/compat/_optional.py:138: UserWarning: Pandas requires version '2.7.0' or newer of 'numexpr' (version '2.6.9' currently installed).\n",
+ " warnings.warn(msg, UserWarning)\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:30: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " method='lar', copy_X=True, eps=np.finfo(np.float).eps,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:167: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " method='lar', copy_X=True, eps=np.finfo(np.float).eps,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:284: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1101: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1127: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " eps=np.finfo(np.float).eps, positive=False):\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1362: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1602: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1738: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " eps=np.finfo(np.float).eps, copy_X=True, positive=False):\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/decomposition/online_lda.py:29: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " EPS = np.finfo(np.float).eps\n",
+ "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n",
+ "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
+ " dtype=np.int):\n"
+ ]
+ }
+ ],
+ "source": [
+ "from autox import AutoXRecommend"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:29.555355Z",
+ "start_time": "2022-05-13T11:20:29.552439Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import os\n",
+ "from tqdm import tqdm"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 读取数据"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:29.604407Z",
+ "start_time": "2022-05-13T11:20:29.557484Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "path = '~/AutoX/autox/autox_recommend/datasets/MovieLens_AutoX/'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:41.797301Z",
+ "start_time": "2022-05-13T11:20:29.606616Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "inter_df = pd.read_csv(path + 'inter_df.csv')\n",
+ "item_df = pd.read_csv(path + 'item_df.csv')\n",
+ "test = pd.read_csv(path + 'test.csv')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 配置参数"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:41.803162Z",
+ "start_time": "2022-05-13T11:20:41.800065Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "uid = 'userId'\n",
+ "iid = 'movieId'\n",
+ "time_col = 'time'\n",
+ "recall_num = 100"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 准备测试集结果"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:42.474990Z",
+ "start_time": "2022-05-13T11:20:41.805384Z"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "('1995-01-09 11:46:49', '2019-11-14 23:20:55')"
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inter_df[time_col].min(), inter_df[time_col].max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:42.483575Z",
+ "start_time": "2022-05-13T11:20:42.477798Z"
+ },
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "('2019-11-15 00:08:42', '2019-11-21 09:06:53')"
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "test[time_col].min(), test[time_col].max()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:42.825682Z",
+ "start_time": "2022-05-13T11:20:42.485772Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "assert(test[time_col].min() > inter_df[time_col].max())"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:42.844719Z",
+ "start_time": "2022-05-13T11:20:42.827758Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "283it [00:00, 177692.82it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Total users in test set: 283\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "positive_items_test = test.groupby([uid])[iid].apply(list)\n",
+ "test_users = positive_items_test.keys()\n",
+ "test_items = []\n",
+ "\n",
+ "for i, user in tqdm(enumerate(test_users)):\n",
+ " test_items.append(positive_items_test[user])\n",
+ " \n",
+ "print(\"Total users in test set:\", len(test_users))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 时间列转化"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:43.862130Z",
+ "start_time": "2022-05-13T11:20:42.846547Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "inter_df[time_col] = pd.to_datetime(inter_df[time_col])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:20:43.874309Z",
+ "start_time": "2022-05-13T11:20:43.864199Z"
+ },
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " userId | \n",
+ " movieId | \n",
+ " time | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 296 | \n",
+ " 2006-05-17 15:34:04 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 307 | \n",
+ " 2006-05-17 12:27:08 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 665 | \n",
+ " 2006-05-17 15:13:40 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1237 | \n",
+ " 2006-05-17 12:27:19 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 2632 | \n",
+ " 2006-05-17 15:04:08 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " userId movieId time\n",
+ "0 1 296 2006-05-17 15:34:04\n",
+ "1 1 307 2006-05-17 12:27:08\n",
+ "2 1 665 2006-05-17 15:13:40\n",
+ "3 1 1237 2006-05-17 12:27:19\n",
+ "4 1 2632 2006-05-17 15:04:08"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "inter_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 执行AutoX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T11:41:40.981180Z",
+ "start_time": "2022-05-13T11:20:43.876240Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "popular_recall\n",
+ "train\n",
+ "2019-11-01 00:21:32 2019-11-07 23:45:48\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 331/331 [00:00<00:00, 23444.92it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "HIT: 0.27881950171515285\n",
+ "valid\n",
+ "2019-11-08 00:01:28 2019-11-14 23:20:55\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 310/310 [00:00<00:00, 1136.14it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "HIT: 0.27728749446707573\n",
+ "\n",
+ "history_recall\n",
+ "train\n",
+ "2019-11-01 00:21:32 2019-11-07 23:45:48\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 221/221 [00:00<00:00, 58868.36it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "valid\n",
+ "2019-11-08 00:01:28 2019-11-14 23:20:55\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 213/213 [00:00<00:00, 50433.94it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "itemcf_recall\n",
+ "train\n",
+ "valid\n",
+ "\n",
+ "binary_recall\n",
+ "train\n",
+ "valid\n",
+ "\n",
+ "merge recalls\n",
+ "train\n",
+ "valid\n",
+ "\n",
+ "feature engineer\n",
+ "train\n",
+ "customer feature engineer\n",
+ "interact feature engineer\n",
+ "valid\n",
+ "customer feature engineer\n",
+ "interact feature engineer\n",
+ "train_fe shape: (76417, 1147)\n",
+ "valid_fe shape: (71125, 1147)\n",
+ "\n",
+ "ranker\n",
+ "Training until validation scores don't improve for 100 rounds.\n",
+ "[100]\tvalid_0's map@12: 0.490571\n",
+ "Early stopping, best iteration is:\n",
+ "[93]\tvalid_0's map@12: 0.492638\n",
+ "defaultdict(, {'valid_0': {'map@12': 0.4926380419869894}})\n",
+ " feature importance\n",
+ "2 n_purchase 314\n",
+ "1140 latest_purchase_time_sub 289\n",
+ "1 binary_score 142\n",
+ "3 n_purchase_nunique 124\n",
+ "0 itemcf_score 100\n",
+ "1132 purchase_corr_item_max_time 85\n",
+ "1141 movieId_idx 52\n",
+ "766 tag_763 31\n",
+ "1133 purchase_corr_item_cnt 25\n",
+ "944 tag_941 24\n",
+ "384 tag_381 12\n",
+ "1067 tag_1064 12\n",
+ "755 tag_752 12\n",
+ "43 tag_40 11\n",
+ "277 tag_274 11\n",
+ "474 tag_471 10\n",
+ "995 tag_992 10\n",
+ "811 tag_808 10\n",
+ "97 tag_94 9\n",
+ "454 tag_451 9\n",
+ "\n",
+ "local result calculation\n",
+ "2019-11-08 00:01:28 2019-11-14 23:20:55\n",
+ "mAP Score on Validation set: 0.07809885619482743\n",
+ "##############################\n",
+ "retrain\n",
+ "\n",
+ "popular_recall\n",
+ "2019-11-08 00:01:28 2019-11-14 23:20:55\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 310/310 [00:00<00:00, 12180.86it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "HIT: 0.27728749446707573\n",
+ "\n",
+ "history_recall\n",
+ "2019-11-08 00:01:28 2019-11-14 23:20:55\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 213/213 [00:00<00:00, 50091.77it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "itemcf_recall\n",
+ "2019-11-08 00:01:28 2019-11-14 23:20:55\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "calculate similarity\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 154824/154824 [09:36<00:00, 268.37it/s] \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "ItemCF recommend\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 213/213 [01:51<00:00, 1.92it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(21300, 3)\n",
+ "ItemCF recall: (21300, 4)\n",
+ "mean: 0.002347417840375587\n",
+ "sum: 50.0\n",
+ "\n",
+ "binary_recall\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 27045/27045 [05:13<00:00, 86.28it/s] \n",
+ "100%|██████████| 310/310 [01:44<00:00, 2.96it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(21300, 3)\n",
+ "BinaryNet recall: (21300, 4)\n",
+ "0.0032863849765258214\n",
+ "\n",
+ "merge recalls\n",
+ "\n",
+ "feature engineer\n",
+ "customer feature engineer\n",
+ "interact feature engineer\n",
+ "train_fe shape: (71125, 1147)\n",
+ "\n",
+ "ranker\n",
+ "defaultdict(, {'valid_0': {'map@12': 0.5546064728628005}})\n",
+ " feature importance\n",
+ "2 n_purchase 314\n",
+ "1140 latest_purchase_time_sub 264\n",
+ "3 n_purchase_nunique 121\n",
+ "1 binary_score 113\n",
+ "1132 purchase_corr_item_max_time 94\n",
+ "0 itemcf_score 73\n",
+ "1141 movieId_idx 67\n",
+ "1133 purchase_corr_item_cnt 35\n",
+ "24 tag_21 31\n",
+ "995 tag_992 21\n",
+ "210 tag_207 12\n",
+ "75 tag_72 12\n",
+ "340 tag_337 10\n",
+ "50 tag_47 10\n",
+ "1127 tag_1124 9\n",
+ "732 tag_729 9\n",
+ "70 tag_67 8\n",
+ "932 tag_929 8\n",
+ "792 tag_789 8\n",
+ "1028 tag_1025 8\n"
+ ]
+ }
+ ],
+ "source": [
+ "autoXRecommend = AutoXRecommend()\n",
+ "\n",
+ "autoXRecommend.fit(inter_df = inter_df, user_df = None, item_df = item_df,\n",
+ " uid = uid, iid = iid, time_col = time_col,\n",
+ " recall_num = recall_num,\n",
+ " time_decay = 0.99,\n",
+ " debug = True, debug_save_path = './temp_MovieLens')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T12:00:06.970676Z",
+ "start_time": "2022-05-13T11:41:40.983194Z"
+ },
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "popular recall, test\n",
+ "2019-11-08 00:01:28 2019-11-14 23:20:55\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 283/283 [00:00<00:00, 45743.11it/s]"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "history recall, test\n",
+ "\n",
+ "itemcf recall, test\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "100%|██████████| 154921/154921 [08:46<00:00, 294.04it/s]\n",
+ "100%|██████████| 202/202 [02:25<00:00, 1.39it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "binary recall, test\n"
+ ]
+ },
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "100%|██████████| 27102/27102 [05:11<00:00, 86.89it/s] \n",
+ "100%|██████████| 283/283 [01:31<00:00, 3.10it/s]\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "merge recalls\n",
+ "\n",
+ "feature engineer\n",
+ "customer feature engineer\n",
+ "interact feature engineer\n",
+ "test_fe shape: (66920, 1146)\n",
+ "\n",
+ "inference\n",
+ "[1/1]\n",
+ "(66920, 1146)\n"
+ ]
+ }
+ ],
+ "source": [
+ "res = autoXRecommend.transform(test_users)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 查看结果"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-13T12:00:06.979194Z",
+ "start_time": "2022-05-13T12:00:06.972819Z"
+ }
+ },
+ "outputs": [],
+ "source": [
+ "def apk(actual, predicted, k=12):\n",
+ " if len(predicted)>k:\n",
+ " predicted = predicted[:k]\n",
+ "\n",
+ " score = 0.0\n",
+ " num_hits = 0.0\n",
+ "\n",
+ " for i,p in enumerate(predicted):\n",
+ " if p in actual and p not in predicted[:i]:\n",
+ " num_hits += 1.0\n",
+ " score += num_hits / (i+1.0)\n",
+ "\n",
+ " if not actual:\n",
+ " return 0.0\n",
+ "\n",
+ " return score / min(len(actual), k)\n",
+ "\n",
+ "def mapk(actual, predicted, k=12):\n",
+ " return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "ExecuteTime": {
+ "end_time": "2022-05-14T00:00:06.385747Z",
+ "start_time": "2022-05-14T00:00:06.377280Z"
+ }
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "mAP Score on Test set: 0.08030425675382308\n"
+ ]
+ }
+ ],
+ "source": [
+ "outputs = res['prediction']\n",
+ "print(\"mAP Score on Test set:\", mapk(test_items, outputs))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ },
+ "toc": {
+ "base_numbering": 1,
+ "nav_menu": {},
+ "number_sections": true,
+ "sideBar": true,
+ "skip_h1_title": false,
+ "title_cell": "Table of Contents",
+ "title_sidebar": "Contents",
+ "toc_cell": false,
+ "toc_position": {},
+ "toc_section_display": true,
+ "toc_window_display": false
+ },
+ "varInspector": {
+ "cols": {
+ "lenName": 16,
+ "lenType": 16,
+ "lenVar": 40
+ },
+ "kernels_config": {
+ "python": {
+ "delete_cmd_postfix": "",
+ "delete_cmd_prefix": "del ",
+ "library": "var_list.py",
+ "varRefreshCmd": "print(var_dic_list())"
+ },
+ "r": {
+ "delete_cmd_postfix": ") ",
+ "delete_cmd_prefix": "rm(",
+ "library": "var_list.r",
+ "varRefreshCmd": "cat(var_dic_list()) "
+ }
+ },
+ "types_to_exclude": [
+ "module",
+ "function",
+ "builtin_function_or_method",
+ "instance",
+ "_Feature"
+ ],
+ "window_display": false
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}