diff --git a/autox/autox_recommend/datasets/MovieLens_data_process.ipynb b/autox/autox_recommend/datasets/MovieLens_data_process.ipynb new file mode 100644 index 0000000..ae3d10f --- /dev/null +++ b/autox/autox_recommend/datasets/MovieLens_data_process.ipynb @@ -0,0 +1,1051 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## import包" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T03:12:10.125377Z", + "start_time": "2022-05-13T03:12:09.671666Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/pandas/compat/_optional.py:138: UserWarning: Pandas requires version '2.7.0' or newer of 'numexpr' (version '2.6.9' currently installed).\n", + " warnings.warn(msg, UserWarning)\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 处理后的数据包含: \n", + "```\n", + "1. 交互表(必须, user-item的交互记录, 包括训练集和测试集, 测试集中所有记录的时间都在训练集之后);\n", + "2. user表(可选);\n", + "3. 
# Tail of the preceding markdown list: item table (optional).

# %% [markdown]
# ## Load data

# %%
# Directory holding the extracted MovieLens CSV files, relative to the notebook.
path = './ml-25m'

# %%
os.listdir(path)

# %%
# Load every CSV of the archive into its own frame. The tuple on the left and
# the file names on the right are in matching order.
tags, links, ratings, genometags, genomescores, movies = (
    pd.read_csv(f'{path}/{name}')
    for name in (
        'tags.csv',
        'links.csv',
        'ratings.csv',
        'genome-tags.csv',
        'genome-scores.csv',
        'movies.csv',
    )
)

# %% [markdown]
# ## Data processing

# %% [markdown]
# ### Interaction table
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdmovieIdratingtimestamp
012965.01147880044
113063.51147868817
213075.01147868828
316655.01147878820
418993.51147868510
# %%
ratings.head()

# %%
# Rating value that counts as a positive user-item interaction.
POSITIVE_RATING = 5

# Total number of ratings vs. number of positive (5-star) ratings.
ratings.shape, ratings.loc[ratings['rating'] == POSITIVE_RATING].shape

# %%
# Build the interaction table in one chained expression:
#   1. keep only the positive ratings,
#   2. drop the now-constant `rating` column,
#   3. convert the epoch-second `timestamp` into a naive-UTC datetime `time`,
#   4. drop the raw `timestamp`.
# Every step returns a new frame, so nothing is mutated in place on a `.loc`
# slice (the source of SettingWithCopyWarning), and the conversion is a single
# vectorized call instead of a per-row Python lambda that formats to text.
# Resulting columns: userId, movieId, time.
ratings = (
    ratings.loc[ratings['rating'] == POSITIVE_RATING]
    .drop(columns='rating')
    .assign(time=lambda df: pd.to_datetime(df['timestamp'], unit='s'))
    .drop(columns='timestamp')
)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdmovieIdtime
012962006-05-17 15:34:04
213072006-05-17 12:27:08
316652006-05-17 15:13:40
8112372006-05-17 12:27:19
18126322006-05-17 15:04:08
# %%
ratings.head()

# %%
# Make sure `time` is datetime64: parses it when it is still text and is a
# no-op when it is already datetime. `assign` returns a new frame instead of
# writing a column into what may be a slice of the raw ratings.
ratings = ratings.assign(time=pd.to_datetime(ratings['time']))

# %%
ratings['time'].min(), ratings['time'].max()

# %%
# Temporal hold-out: interactions strictly before the cutoff form the training
# set, interactions at or after it form the test set.
data_used_time = pd.Timestamp('2019-11-15 00:00:00')

train = ratings.loc[ratings['time'] < data_used_time]
test = ratings.loc[ratings['time'] >= data_used_time]

# The two masks are complementary, so every interaction must land in exactly
# one split; a mismatch means some `time` values could not be compared (NaT).
assert len(train) + len(test) == len(ratings), 'train/test split lost rows'

# %%
train['time'].min(), train['time'].max()

# %%
test['time'].min(), test['time'].max()

# %% [markdown]
# ### Item table

# %%
# One row per movie and one relevance column per genome tag. Column names are
# derived from the tag ids actually present in the pivot, not from a
# hard-coded 1..1128 range, so each label always matches its column's tag id.
tag_features = genomescores.pivot(index='movieId', columns='tagId', values='relevance')
tag_features.columns = [f'tag_{tag_id}' for tag_id in tag_features.columns]
tag_features = tag_features.reset_index()

# %%
# Left join: movies without genome scores keep NaN in every tag column.
movies = movies.merge(tag_features, on='movieId', how='left')
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
movieIdtitlegenrestag_1tag_2tag_3tag_4tag_5tag_6tag_7...tag_1119tag_1120tag_1121tag_1122tag_1123tag_1124tag_1125tag_1126tag_1127tag_1128
01Toy Story (1995)Adventure|Animation|Children|Comedy|Fantasy0.028750.023750.062500.075750.140750.146750.06350...0.040500.014250.030500.035000.141250.057750.039000.029750.084750.02200
12Jumanji (1995)Adventure|Children|Fantasy0.041250.040500.062750.082750.091000.061250.06925...0.052500.015750.012500.020000.122250.032750.021000.011000.105250.01975
23Grumpier Old Men (1995)Comedy|Romance0.046750.055500.029250.087000.047500.047750.04600...0.062750.019500.022250.023000.122000.034750.017000.018000.091000.01775
34Waiting to Exhale (1995)Comedy|Drama|Romance0.034250.038000.040500.031000.065000.035750.02900...0.053250.028000.016750.038750.182000.070500.016250.014250.088500.01500
45Father of the Bride Part II (1995)Comedy0.043000.053250.038000.041000.054000.067250.02775...0.053500.020500.014250.025500.192250.026750.016250.013000.087000.01600
..................................................................
62418209157We (2018)DramaNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
62419209159Window of the Soul (2001)DocumentaryNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
62420209163Bad Poems (2018)Comedy|DramaNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
62421209169A Girl Thing (2001)(no genres listed)NaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
62422209171Women of Devil's Island (1962)Action|Adventure|DramaNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

62423 rows × 1131 columns

\n", + "
" + ], + "text/plain": [ + " movieId title \\\n", + "0 1 Toy Story (1995) \n", + "1 2 Jumanji (1995) \n", + "2 3 Grumpier Old Men (1995) \n", + "3 4 Waiting to Exhale (1995) \n", + "4 5 Father of the Bride Part II (1995) \n", + "... ... ... \n", + "62418 209157 We (2018) \n", + "62419 209159 Window of the Soul (2001) \n", + "62420 209163 Bad Poems (2018) \n", + "62421 209169 A Girl Thing (2001) \n", + "62422 209171 Women of Devil's Island (1962) \n", + "\n", + " genres tag_1 tag_2 tag_3 \\\n", + "0 Adventure|Animation|Children|Comedy|Fantasy 0.02875 0.02375 0.06250 \n", + "1 Adventure|Children|Fantasy 0.04125 0.04050 0.06275 \n", + "2 Comedy|Romance 0.04675 0.05550 0.02925 \n", + "3 Comedy|Drama|Romance 0.03425 0.03800 0.04050 \n", + "4 Comedy 0.04300 0.05325 0.03800 \n", + "... ... ... ... ... \n", + "62418 Drama NaN NaN NaN \n", + "62419 Documentary NaN NaN NaN \n", + "62420 Comedy|Drama NaN NaN NaN \n", + "62421 (no genres listed) NaN NaN NaN \n", + "62422 Action|Adventure|Drama NaN NaN NaN \n", + "\n", + " tag_4 tag_5 tag_6 tag_7 ... tag_1119 tag_1120 tag_1121 \\\n", + "0 0.07575 0.14075 0.14675 0.06350 ... 0.04050 0.01425 0.03050 \n", + "1 0.08275 0.09100 0.06125 0.06925 ... 0.05250 0.01575 0.01250 \n", + "2 0.08700 0.04750 0.04775 0.04600 ... 0.06275 0.01950 0.02225 \n", + "3 0.03100 0.06500 0.03575 0.02900 ... 0.05325 0.02800 0.01675 \n", + "4 0.04100 0.05400 0.06725 0.02775 ... 0.05350 0.02050 0.01425 \n", + "... ... ... ... ... ... ... ... ... \n", + "62418 NaN NaN NaN NaN ... NaN NaN NaN \n", + "62419 NaN NaN NaN NaN ... NaN NaN NaN \n", + "62420 NaN NaN NaN NaN ... NaN NaN NaN \n", + "62421 NaN NaN NaN NaN ... NaN NaN NaN \n", + "62422 NaN NaN NaN NaN ... 
# %%
movies

# %% [markdown]
# ## Save

# %%
# Destination directory for the processed tables; created if missing.
output_path = './MovieLens_AutoX/'
os.makedirs(output_path, exist_ok=True)

# %%
# Write each table next to the others, without the pandas index column.
# Order matches the original cell: train interactions, test interactions, items.
for frame, filename in (
    (train, 'inter_df.csv'),
    (test, 'test.csv'),
    (movies, 'item_df.csv'),
):
    frame.to_csv(output_path + filename, index=False)
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": true + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_popular_recall.ipynb b/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_popular_recall.ipynb new file mode 100644 index 0000000..342db5c --- /dev/null +++ b/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_popular_recall.ipynb @@ -0,0 +1,693 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## import包" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T03:18:09.319781Z", + "start_time": "2022-05-13T03:18:09.317031Z" + } + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../AutoX')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T03:18:15.567203Z", + "start_time": "2022-05-13T03:18:10.482510Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": 
"stream", + "text": [ + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/pandas/compat/_optional.py:138: UserWarning: Pandas requires version '2.7.0' or newer of 'numexpr' (version '2.6.9' currently installed).\n", + " warnings.warn(msg, UserWarning)\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:30: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " method='lar', copy_X=True, eps=np.finfo(np.float).eps,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:167: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " method='lar', copy_X=True, eps=np.finfo(np.float).eps,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:284: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1101: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1127: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " eps=np.finfo(np.float).eps, positive=False):\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1362: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1602: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1738: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " eps=np.finfo(np.float).eps, copy_X=True, positive=False):\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/decomposition/online_lda.py:29: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " EPS = np.finfo(np.float).eps\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. 
# %%
from autox import AutoXRecommend

# %%
import os

import numpy as np
import pandas as pd
from tqdm import tqdm

# %% [markdown]
# ## Load data

# %%
# Location of the processed MovieLens tables. The trailing slash matters
# because file names are appended to this string directly.
path = '~/AutoX/autox/autox_recommend/datasets/MovieLens_AutoX/'

# %%
# Training interactions, item features and held-out test interactions,
# read in that order.
inter_df, item_df, test = (
    pd.read_csv(path + filename)
    for filename in ('inter_df.csv', 'item_df.csv', 'test.csv')
)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdmovieIdtime
012962006-05-17 15:34:04
113072006-05-17 12:27:08
216652006-05-17 15:13:40
3112372006-05-17 12:27:19
4126322006-05-17 15:04:08
# %%
inter_df.head()

# %% [markdown]
# ## Configuration

# %%
uid = 'userId'       # user id column
iid = 'movieId'      # item id column
time_col = 'time'    # interaction time column
recall_num = 100     # passed to AutoXRecommend.fit as `recall_num`

# %% [markdown]
# ## Time column conversion

# %%
# Parse the timestamps before anything compares them: straight after
# `read_csv` the column holds text, and comparing text is only correct by
# accident of the zero-padded 'YYYY-MM-DD HH:MM:SS' format.
inter_df[time_col] = pd.to_datetime(inter_df[time_col])
# `test` itself is left untouched; only a parsed view of its time column is used.
test_time = pd.to_datetime(test[time_col])

# %% [markdown]
# ## Prepare the test-set ground truth

# %%
inter_df[time_col].min(), inter_df[time_col].max()

# %%
test_time.min(), test_time.max()

# %%
# Leakage check: every test interaction must be later than all training ones.
assert test_time.min() > inter_df[time_col].max(), (
    'test interactions must all be later than the training interactions'
)

# %%
# Ground truth per test user: the list of items the user interacted with.
positive_items_test = test.groupby([uid])[iid].apply(list)
test_users = positive_items_test.keys()
# Same order as `test_users`, one list of item ids per user.
test_items = positive_items_test.tolist()

print("Total users in test:", len(test_users))

# %% [markdown]
# ## Run AutoX

# %%
autoXRecommend = AutoXRecommend()

autoXRecommend.fit(inter_df=inter_df, user_df=None, item_df=item_df,
                   uid=uid, iid=iid, time_col=time_col,
                   recall_num=recall_num,
                   mode='recalls', recall_method='popular')

# %%
res = autoXRecommend.transform(test_users)
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdprediction
04[318, 5952, 7153, 4993, 858, 260, 1196, 79132,...
1606[318, 5952, 7153, 4993, 858, 260, 1196, 79132,...
21746[318, 5952, 7153, 4993, 858, 260, 1196, 79132,...
33409[318, 5952, 7153, 4993, 858, 260, 1196, 79132,...
44037[318, 5952, 7153, 4993, 858, 260, 1196, 79132,...
.........
278159074[318, 5952, 7153, 4993, 858, 260, 1196, 79132,...
279159388[318, 5952, 7153, 4993, 858, 260, 1196, 79132,...
280159523[318, 5952, 7153, 4993, 858, 260, 1196, 79132,...
281161485[318, 5952, 7153, 4993, 858, 260, 1196, 79132,...
282162190[318, 5952, 7153, 4993, 858, 260, 1196, 79132,...
\n", + "

283 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " userId prediction\n", + "0 4 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n", + "1 606 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n", + "2 1746 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n", + "3 3409 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n", + "4 4037 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n", + ".. ... ...\n", + "278 159074 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n", + "279 159388 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n", + "280 159523 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n", + "281 161485 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n", + "282 162190 [318, 5952, 7153, 4993, 858, 260, 1196, 79132,...\n", + "\n", + "[283 rows x 2 columns]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "res" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 结果查看" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T03:21:10.832933Z", + "start_time": "2022-05-13T03:21:10.825779Z" + } + }, + "outputs": [], + "source": [ + "def apk(actual, predicted, k=12):\n", + " if len(predicted)>k:\n", + " predicted = predicted[:k]\n", + "\n", + " score = 0.0\n", + " num_hits = 0.0\n", + "\n", + " for i,p in enumerate(predicted):\n", + " if p in actual and p not in predicted[:i]:\n", + " num_hits += 1.0\n", + " score += num_hits / (i+1.0)\n", + "\n", + " if not actual:\n", + " return 0.0\n", + "\n", + " return score / min(len(actual), k)\n", + "\n", + "def mapk(actual, predicted, k=12):\n", + " return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T03:21:22.872923Z", + "start_time": "2022-05-13T03:21:22.866137Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "mAP Score on Validation set: 0.0473412592184677\n" + ] + } + ], + "source": [ + "outputs = res['prediction']\n", + "print(\"mAP Score on Validation set:\", mapk(test_items, outputs))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + "delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_recall_and_rank.ipynb b/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_recall_and_rank.ipynb new file mode 100644 index 0000000..b3bfe9c --- /dev/null +++ b/autox/autox_recommend/demo/MovieLens/MovieLens_AutoX_recall_and_rank.ipynb @@ -0,0 +1,893 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"## import包" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:20:24.337522Z", + "start_time": "2022-05-13T11:20:24.334714Z" + } + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../AutoX')" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:20:29.549828Z", + "start_time": "2022-05-13T11:20:24.339756Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/pandas/compat/_optional.py:138: UserWarning: Pandas requires version '2.7.0' or newer of 'numexpr' (version '2.6.9' currently installed).\n", + " warnings.warn(msg, UserWarning)\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:30: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " method='lar', copy_X=True, eps=np.finfo(np.float).eps,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:167: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " method='lar', copy_X=True, eps=np.finfo(np.float).eps,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:284: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " eps=np.finfo(np.float).eps, copy_Gram=True, verbose=0,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:862: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1101: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " eps=np.finfo(np.float).eps, copy_X=True, fit_path=True,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1127: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " eps=np.finfo(np.float).eps, positive=False):\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1362: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1602: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " max_n_alphas=1000, n_jobs=None, eps=np.finfo(np.float).eps,\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/least_angle.py:1738: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " eps=np.finfo(np.float).eps, copy_X=True, positive=False):\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/decomposition/online_lda.py:29: DeprecationWarning: `np.float` is a deprecated alias for the builtin `float`. To silence this warning, use `float` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.float64` here.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " EPS = np.finfo(np.float).eps\n", + "/home/caihengxing/anaconda3/lib/python3.7/site-packages/sklearn/feature_extraction/image.py:167: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. 
If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " dtype=np.int):\n" + ] + } + ], + "source": [ + "from autox import AutoXRecommend" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:20:29.555355Z", + "start_time": "2022-05-13T11:20:29.552439Z" + } + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "import os\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 读取数据" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:20:29.604407Z", + "start_time": "2022-05-13T11:20:29.557484Z" + } + }, + "outputs": [], + "source": [ + "path = '~/AutoX/autox/autox_recommend/datasets/MovieLens_AutoX/'" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:20:41.797301Z", + "start_time": "2022-05-13T11:20:29.606616Z" + }, + "scrolled": true + }, + "outputs": [], + "source": [ + "inter_df = pd.read_csv(path + 'inter_df.csv')\n", + "item_df = pd.read_csv(path + 'item_df.csv')\n", + "test = pd.read_csv(path + 'test.csv')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 配置参数" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:20:41.803162Z", + "start_time": "2022-05-13T11:20:41.800065Z" + } + }, + "outputs": [], + "source": [ + "uid = 'userId'\n", + "iid = 'movieId'\n", + "time_col = 'time'\n", + "recall_num = 100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 准备测试集结果" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "ExecuteTime": { + 
"end_time": "2022-05-13T11:20:42.474990Z", + "start_time": "2022-05-13T11:20:41.805384Z" + } + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('1995-01-09 11:46:49', '2019-11-14 23:20:55')" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inter_df[time_col].min(), inter_df[time_col].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:20:42.483575Z", + "start_time": "2022-05-13T11:20:42.477798Z" + }, + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "('2019-11-15 00:08:42', '2019-11-21 09:06:53')" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "test[time_col].min(), test[time_col].max()" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:20:42.825682Z", + "start_time": "2022-05-13T11:20:42.485772Z" + } + }, + "outputs": [], + "source": [ + "assert(test[time_col].min() > inter_df[time_col].max())" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:20:42.844719Z", + "start_time": "2022-05-13T11:20:42.827758Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "283it [00:00, 177692.82it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Total users in testidation: 283\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + } + ], + "source": [ + "positive_items_test = test.groupby([uid])[iid].apply(list)\n", + "test_users = positive_items_test.keys()\n", + "test_items = []\n", + "\n", + "for i, user in tqdm(enumerate(test_users)):\n", + " test_items.append(positive_items_test[user])\n", + " \n", + "print(\"Total users in testidation:\", len(test_users))" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## 时间列转化" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:20:43.862130Z", + "start_time": "2022-05-13T11:20:42.846547Z" + } + }, + "outputs": [], + "source": [ + "inter_df[time_col] = pd.to_datetime(inter_df[time_col])" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:20:43.874309Z", + "start_time": "2022-05-13T11:20:43.864199Z" + }, + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
userIdmovieIdtime
012962006-05-17 15:34:04
113072006-05-17 12:27:08
216652006-05-17 15:13:40
3112372006-05-17 12:27:19
4126322006-05-17 15:04:08
\n", + "
" + ], + "text/plain": [ + " userId movieId time\n", + "0 1 296 2006-05-17 15:34:04\n", + "1 1 307 2006-05-17 12:27:08\n", + "2 1 665 2006-05-17 15:13:40\n", + "3 1 1237 2006-05-17 12:27:19\n", + "4 1 2632 2006-05-17 15:04:08" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "inter_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 执行AutoX" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T11:41:40.981180Z", + "start_time": "2022-05-13T11:20:43.876240Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "popular_recall\n", + "train\n", + "2019-11-01 00:21:32 2019-11-07 23:45:48\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 331/331 [00:00<00:00, 23444.92it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HIT: 0.27881950171515285\n", + "valid\n", + "2019-11-08 00:01:28 2019-11-14 23:20:55\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 310/310 [00:00<00:00, 1136.14it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HIT: 0.27728749446707573\n", + "\n", + "history_recall\n", + "train\n", + "2019-11-01 00:21:32 2019-11-07 23:45:48\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 221/221 [00:00<00:00, 58868.36it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "valid\n", + "2019-11-08 00:01:28 2019-11-14 23:20:55\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 213/213 [00:00<00:00, 50433.94it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "itemcf_recall\n", + "train\n", + "valid\n", + "\n", + "binary_recall\n", + 
"train\n", + "valid\n", + "\n", + "merge recalls\n", + "train\n", + "valid\n", + "\n", + "feature engineer\n", + "train\n", + "customer feature engineer\n", + "interact feature engineer\n", + "valid\n", + "customer feature engineer\n", + "interact feature engineer\n", + "train_fe shape: (76417, 1147)\n", + "valid_fe shape: (71125, 1147)\n", + "\n", + "ranker\n", + "Training until validation scores don't improve for 100 rounds.\n", + "[100]\tvalid_0's map@12: 0.490571\n", + "Early stopping, best iteration is:\n", + "[93]\tvalid_0's map@12: 0.492638\n", + "defaultdict(, {'valid_0': {'map@12': 0.4926380419869894}})\n", + " feature importance\n", + "2 n_purchase 314\n", + "1140 latest_purchase_time_sub 289\n", + "1 binary_score 142\n", + "3 n_purchase_nunique 124\n", + "0 itemcf_score 100\n", + "1132 purchase_corr_item_max_time 85\n", + "1141 movieId_idx 52\n", + "766 tag_763 31\n", + "1133 purchase_corr_item_cnt 25\n", + "944 tag_941 24\n", + "384 tag_381 12\n", + "1067 tag_1064 12\n", + "755 tag_752 12\n", + "43 tag_40 11\n", + "277 tag_274 11\n", + "474 tag_471 10\n", + "995 tag_992 10\n", + "811 tag_808 10\n", + "97 tag_94 9\n", + "454 tag_451 9\n", + "\n", + "local result calculation\n", + "2019-11-08 00:01:28 2019-11-14 23:20:55\n", + "mAP Score on Validation set: 0.07809885619482743\n", + "##############################\n", + "retrain\n", + "\n", + "popular_recall\n", + "2019-11-08 00:01:28 2019-11-14 23:20:55\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 310/310 [00:00<00:00, 12180.86it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "HIT: 0.27728749446707573\n", + "\n", + "history_recall\n", + "2019-11-08 00:01:28 2019-11-14 23:20:55\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 213/213 [00:00<00:00, 50091.77it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "itemcf_recall\n", + "2019-11-08 
00:01:28 2019-11-14 23:20:55\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "calculate similarity\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 154824/154824 [09:36<00:00, 268.37it/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ItemCF recommend\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 213/213 [01:51<00:00, 1.92it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(21300, 3)\n", + "ItemCF recall: (21300, 4)\n", + "mean: 0.002347417840375587\n", + "sum: 50.0\n", + "\n", + "binary_recall\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 27045/27045 [05:13<00:00, 86.28it/s] \n", + "100%|██████████| 310/310 [01:44<00:00, 2.96it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(21300, 3)\n", + "BinaryNet recall: (21300, 4)\n", + "0.0032863849765258214\n", + "\n", + "merge recalls\n", + "\n", + "feature engineer\n", + "customer feature engineer\n", + "interact feature engineer\n", + "train_fe shape: (71125, 1147)\n", + "\n", + "ranker\n", + "defaultdict(, {'valid_0': {'map@12': 0.5546064728628005}})\n", + " feature importance\n", + "2 n_purchase 314\n", + "1140 latest_purchase_time_sub 264\n", + "3 n_purchase_nunique 121\n", + "1 binary_score 113\n", + "1132 purchase_corr_item_max_time 94\n", + "0 itemcf_score 73\n", + "1141 movieId_idx 67\n", + "1133 purchase_corr_item_cnt 35\n", + "24 tag_21 31\n", + "995 tag_992 21\n", + "210 tag_207 12\n", + "75 tag_72 12\n", + "340 tag_337 10\n", + "50 tag_47 10\n", + "1127 tag_1124 9\n", + "732 tag_729 9\n", + "70 tag_67 8\n", + "932 tag_929 8\n", + "792 tag_789 8\n", + "1028 tag_1025 8\n" + ] + } + ], + "source": [ + "autoXRecommend = AutoXRecommend()\n", + "\n", + 
"autoXRecommend.fit(inter_df = inter_df, user_df = None, item_df = item_df,\n", + " uid = uid, iid = iid, time_col = time_col,\n", + " recall_num = recall_num,\n", + " time_decay = 0.99,\n", + " debug = True, debug_save_path = './temp_MovieLens')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T12:00:06.970676Z", + "start_time": "2022-05-13T11:41:40.983194Z" + }, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "popular recall, test\n", + "2019-11-08 00:01:28 2019-11-14 23:20:55\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 283/283 [00:00<00:00, 45743.11it/s]" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "history recall, test\n", + "\n", + "itemcf recall, test\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "100%|██████████| 154921/154921 [08:46<00:00, 294.04it/s]\n", + "100%|██████████| 202/202 [02:25<00:00, 1.39it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "binary recall, test\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 27102/27102 [05:11<00:00, 86.89it/s] \n", + "100%|██████████| 283/283 [01:31<00:00, 3.10it/s]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "merge recalls\n", + "\n", + "feature engineer\n", + "customer feature engineer\n", + "interact feature engineer\n", + "test_fe shape: (66920, 1146)\n", + "\n", + "inference\n", + "[1/1]\n", + "(66920, 1146)\n" + ] + } + ], + "source": [ + "res = autoXRecommend.transform(test_users)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 查看结果" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-13T12:00:06.979194Z", + "start_time": 
"2022-05-13T12:00:06.972819Z" + } + }, + "outputs": [], + "source": [ + "def apk(actual, predicted, k=12):\n", + " if len(predicted)>k:\n", + " predicted = predicted[:k]\n", + "\n", + " score = 0.0\n", + " num_hits = 0.0\n", + "\n", + " for i,p in enumerate(predicted):\n", + " if p in actual and p not in predicted[:i]:\n", + " num_hits += 1.0\n", + " score += num_hits / (i+1.0)\n", + "\n", + " if not actual:\n", + " return 0.0\n", + "\n", + " return score / min(len(actual), k)\n", + "\n", + "def mapk(actual, predicted, k=12):\n", + " return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "ExecuteTime": { + "end_time": "2022-05-14T00:00:06.385747Z", + "start_time": "2022-05-14T00:00:06.377280Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "mAP Score on Validation set: 0.08030425675382308\n" + ] + } + ], + "source": [ + "outputs = res['prediction']\n", + "print(\"mAP Score on Validation set:\", mapk(test_items, outputs))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + }, + "varInspector": { + "cols": { + "lenName": 16, + "lenType": 16, + "lenVar": 40 + }, + "kernels_config": { + "python": { + 
"delete_cmd_postfix": "", + "delete_cmd_prefix": "del ", + "library": "var_list.py", + "varRefreshCmd": "print(var_dic_list())" + }, + "r": { + "delete_cmd_postfix": ") ", + "delete_cmd_prefix": "rm(", + "library": "var_list.r", + "varRefreshCmd": "cat(var_dic_list()) " + } + }, + "types_to_exclude": [ + "module", + "function", + "builtin_function_or_method", + "instance", + "_Feature" + ], + "window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}