-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathdropouts.py
163 lines (124 loc) · 5.98 KB
/
dropouts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#! /usr/local/bin/python3
# -*- coding: utf-8 -*-
"""
Dropout-related features. Total: 12

0~2: number of users active during the course's last ten days, as a ratio of
     the number of enrolled users — computed over both the last ten days the
     course was updated and the last ten days the course saw any activity
3~6: the user's average number of dropouts per course; the dropout count in
     this course; its ratio to the per-course average dropout count;
     total dropout duration as a ratio of the course duration
7~11: mean, std, max and min — over all of the user's courses — of the ratio
      of total dropout duration to course duration; fraction of the user's
      enrolled courses that show any dropout behaviour
"""
import logging
import sys
import os
from datetime import timedelta
import numpy as np
import pandas as pd
# Project-local modules: cached dataset I/O, cache-path helpers, and
# DataFrame sanity-check utilities.
import IO
import Path
import Util
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG,
                    format='%(asctime)s %(name)s %(levelname)s\t%(message)s')
logger = logging.getLogger(os.path.basename(__file__))
def extract(base_date):
    """Build 12 dropout-related features per enrollment, as of ``base_date``.

    Feature layout (after ``enrollment_id``):
      0-2:  ratio of users active in the course's last ten days to enrolled
            users, over (a) the last ten days the course content was updated
            and (b) the last ten days the course saw any log event
      3-6:  the user's average dropout count per course; the dropout count in
            this course; that count's ratio to the user's average; total
            dropout duration as a ratio of the course duration
      7-11: mean/std/max/min over the user's courses of the dropout-duration
            ratio, plus the fraction of the user's courses with any dropout

    The result DataFrame is cached on disk keyed by ``base_date``; a cache
    hit returns immediately.
    """
    pkl_path = Path.of_cache('dropouts.%s.pkl' % base_date)
    X = IO.fetch_cache(pkl_path)
    if X is not None:
        logger.debug('cache hit')
        return X

    logger.debug('cache missed')
    logger.debug('prepare datasets ...')
    enroll_all = IO.load_enrollments()
    log_all = IO.load_logs()
    obj_all = IO.load_object()
    # Restrict to data visible at base_date so no future events leak in.
    log_all = log_all[log_all['time'] <= base_date]
    obj_all = obj_all[obj_all['start'] <= base_date]
    logger.debug('datasets prepared')

    # Number of distinct users enrolled in each course.
    user_count_of_course = pd.merge(
        log_all, enroll_all, how='left', on='enrollment_id')\
        .groupby('course_id').agg({'username': lambda us: len(np.unique(us))})\
        .reset_index().rename(columns={'username': 'user_count'})

    import time_related
    # st/et: course span from module release dates (content updates);
    # st2/et2: course span from the first/last observed log event.
    course_t1 = time_related.course_duration(log_all, obj_all, enroll_all)
    course_t2 = pd.merge(log_all, enroll_all, how='left', on='enrollment_id')\
        .groupby('course_id').agg({'time': [np.min, np.max]})
    course_t2.columns = ['st2', 'et2']
    course_t2.reset_index(inplace=True)
    log_all = pd.merge(
        pd.merge(log_all, enroll_all, how='left', on='enrollment_id'),
        course_t1, how='left', on='course_id')
    log_all = pd.merge(log_all, course_t2, how='left', on='course_id')

    # Distinct users with at least one event in the last ten days of each
    # course span.
    remain_log1 = log_all[log_all['time'] > log_all['et'] - timedelta(days=10)]
    remain_count_of_course1 = remain_log1.groupby('course_id')\
        .agg({'username': lambda us: len(np.unique(us))})\
        .reset_index().rename(columns={'username': 'remain_count1'})
    remain_log2 = log_all[
        log_all['time'] > log_all['et2'] - timedelta(days=10)]
    remain_count_of_course2 = remain_log2.groupby('course_id')\
        .agg({'username': lambda us: len(np.unique(us))})\
        .reset_index().rename(columns={'username': 'remain_count2'})
    D_last = pd.merge(user_count_of_course, remain_count_of_course1,
                      how='left', on='course_id')
    D_last = pd.merge(D_last, remain_count_of_course2,
                      how='left', on='course_id')
    D_last.fillna(0, inplace=True)
    D_last['remain_count1'] = D_last['remain_count1'] / D_last['user_count']
    D_last['remain_count2'] = D_last['remain_count2'] / D_last['user_count']
    # 0~2: ratio of users active in the last ten days to enrolled users,
    # for both the last-updated and last-active ten-day windows
    logger.debug('0~2')

    def __get_dropout_count__(group):
        # Per-enrollment dropout stats derived from day gaps between
        # consecutive log events.
        course_et = group['et'].max()
        course_st = group['st'].min()
        # FIX: Series.sort() was deprecated in pandas 0.17 and removed in
        # 0.20; sort_values() is the supported, non-mutating equivalent.
        group_t = group['time'].sort_values()
        # FIX: diff() yields the same consecutive differences as the
        # original shifted reset_index subtraction, without relying on
        # positional realignment of two reindexed frames.
        dd = group_t.diff().dropna().dt.days.values
        dc = np.sum(dd >= 10)  # a gap of >= 10 days counts as one dropout
        # NOTE(review): sums *all* gaps (not only >= 10-day ones), matching
        # the original behaviour — confirm this is the intended "total
        # dropout duration".
        dd_sum = np.sum(dd)
        # NOTE(review): assumes the course span is at least one day
        # (et > st) — confirm upstream guarantees this, else ZeroDivision.
        return pd.Series([dc, dd_sum / (course_et - course_st).days],
                         index=['dropout_count', 'dt_ratio'])

    D_count = log_all.groupby('enrollment_id').apply(__get_dropout_count__)\
        .reset_index()
    D_count = pd.merge(D_count, enroll_all, how='left', on='enrollment_id')
    avg_count = D_count.groupby('username')\
        .agg({'dropout_count': np.average})\
        .rename(columns={'dropout_count': 'avg_dc'}).reset_index()
    D_count = pd.merge(D_count, avg_count, how='left', on='username')
    D_count['dc_ratio'] = D_count['dropout_count'] / D_count['avg_dc']
    # 3~6: the user's average dropout count per course; the dropout count in
    # this course and its ratio to that average; total dropout duration over
    # the course duration
    X1 = D_count.copy()
    del X1['username']
    del X1['course_id']
    logger.debug('3~6')

    # 7~11: mean/std/max/min over the user's courses of the dropout-duration
    # ratio; fraction of the user's courses that show any dropout
    X2 = D_count.groupby('username')\
        .agg({'dt_ratio': [np.average, np.std, np.max, np.min]})\
        .reset_index()
    # Flatten the (column, aggfunc) MultiIndex into single string labels.
    X2.columns = [' '.join(c).strip() for c in X2.columns.values]
    DC_count = D_count[D_count['dropout_count'] > 0].groupby('username')\
        .agg({'course_id': lambda cs: len(np.unique(cs))})\
        .rename(columns={'course_id': 'd_course'}).reset_index()
    UC_count = log_all.groupby('username')\
        .agg({'course_id': lambda cs: len(np.unique(cs))})\
        .reset_index().rename(columns={'course_id': 'course_count'})
    X3 = pd.merge(DC_count, UC_count, how='left', on='username')
    X3['drop_course_ratio'] = X3['d_course'] / X3['course_count']
    del X3['d_course']
    del X3['course_count']
    logger.debug('7~11')

    # Assemble the per-enrollment feature matrix; enrollments with no
    # matching stats get 0 via the final fillna.
    check_dataframe = Util.dataframe_checker(logger)
    check_dataframe(D_last, 'D_last')
    X = pd.merge(enroll_all, D_last, how='left', on='course_id')
    check_dataframe(X1, 'X1')
    X = pd.merge(X, X1, how='left', on='enrollment_id')
    check_dataframe(X2, 'X2')
    X = pd.merge(X, X2, how='left', on='username')
    check_dataframe(X3, 'X3')
    X = pd.merge(X, X3, how='left', on='username')
    del X['username']
    del X['course_id']
    X.fillna(0, inplace=True)
    check_dataframe(X, 'X')

    IO.cache(X, pkl_path)
    return X