# -*- coding: utf-8 -*-
"""MLchallenge_final.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1ll-6Ye4B4yFdaba6cBjirB1d1wVKEQFf
# **MACHINE LEARNING CHALLENGE - GROUP 39**
NOTE: this script was written in the Google Colab notebook IDE. We chose Colab because it makes collaboration easier and lets us run the code on a faster machine rented from Google.
To read more about Google Colab and how it works, please refer to this page: https://colab.research.google.com/notebooks/intro.ipynb
However, if you want to run this code in Jupyter or another Python IDE, please make sure:
* To change the working directory (and the file paths below) to where you store train-1.json and test.json
* Not to import the packages from google.colab
* To save predicted.json locally instead of downloading it through google.colab.files
Thank you for reading!
"""
#Use this code to mount your Google Drive in Google Colab if needed
from google.colab import drive
drive.mount('/content/drive')
#import all packages needed
import pandas as pd
import numpy as np
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords # Import the stop word list
from tabulate import tabulate #to print dataframe prettier and easier to look at
from sklearn import linear_model
from sklearn.model_selection import KFold #to import k-fold cross validation
from sklearn.model_selection import train_test_split #to split train dataset into train + validation set
from sklearn.ensemble import RandomForestRegressor #to input random forest
from sklearn.metrics import r2_score #to calculate r2_score
from sklearn.preprocessing import StandardScaler #to standardize numerical features (zero mean, unit variance)
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import MultiLabelBinarizer #to encode categorical variables such as topics, fields of study
"""**STEP 1**: Processing the Train and the Test set
(Note: please make sure to have train-1.json and test.json in your working directory)
"""
#1) Load the train dataset
import json
t = open("/content/drive/MyDrive/Colab Notebooks/train-1.json")
train = json.load(t)
#convert the train dataset into a pandas DataFrame
df = pd.DataFrame(train)
#find duplicate values
print(pd.concat(g for _, g in df.groupby('doi') if len(g) > 1)) #credit: https://stackoverflow.com/questions/14657241/how-do-i-get-a-list-of-all-the-duplicate-items-using-pandas-in-python
#remove duplicate values: keep the first occurrence
df.drop_duplicates(subset = 'doi',keep = 'first',inplace = True)
#drop NAs (dropna() returns a new DataFrame, so reassign to keep the result)
df = df.dropna()
#glance at train df
print(tabulate(df.head(), headers='keys', tablefmt='psql')) #credit: https://stackoverflow.com/questions/18528533/pretty-printing-a-pandas-dataframe
#2) Processing the test dataset
#load the test dataset
t1 = open("/content/drive/MyDrive/Colab Notebooks/test.json")
test = json.load(t1)
#convert the test dataset into a pandas DataFrame
df1 = pd.DataFrame(test)
#remove duplicate values: keep the first occurrence
df1.drop_duplicates(subset = 'doi',keep = 'first',inplace = True)
#drop NAs (dropna() returns a new DataFrame, so reassign to keep the result)
df1 = df1.dropna()
#glance at test df
print(tabulate(df1.head(), headers='keys', tablefmt='psql')) #credit: https://stackoverflow.com/questions/18528533/pretty-printing-a-pandas-dataframe
"""**STEP 2:** Pre-processing the text data"""
#1) Define the clean text function
def clean_text(text_to_clean):
    text_to_clean = re.sub(r'[^a-zA-Z0-9]', ' ', str(text_to_clean)) # replace every non-alphanumeric character with a space
    text_to_clean = re.sub(r'\s+', ' ', str(text_to_clean)).strip() # collapse tabs, newlines and other whitespace into single spaces
    words = text_to_clean.lower().split() # convert to lowercase and split into individual words
    stops = set(stopwords.words('english')) # convert the stop word list to a set for fast lookup
    meaningful_words = [w for w in words if w not in stops] # remove stop words
    return " ".join(meaningful_words)
#credit: https://github.com/berkurka/Reddit-Classifier/blob/master/Notebooks/02%20EDA.ipynb
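#Illustrative example (not part of the original pipeline) of what clean_text produces:
#clean_text("The Quick\tBrown Foxes, 2021!") -> "quick brown foxes 2021"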
#2) clean title, abstract, venue of the train df and make a list of each variable:
df['title'] = df['title'].apply(lambda x: clean_text(x))
title = df['title'].tolist()
df['abstract'] = df['abstract'].apply(lambda x: clean_text(x))
abstract = df['abstract'].tolist()
df['venue'] = df['venue'].apply(lambda x: clean_text(x))
venue = df['venue'].tolist()
#3) clean title, abstract, venue of the test df and make a list of each variable:
df1['title'] = df1['title'].apply(lambda x: clean_text(x))
title_test = df1['title'].tolist()
df1['abstract'] = df1['abstract'].apply(lambda x: clean_text(x))
abstract_test = df1['abstract'].tolist()
df1['venue'] = df1['venue'].apply(lambda x: clean_text(x))
venue_test = df1['venue'].tolist()
#4) combine train and test df title + abstract for tf-idf (later step)
title_combined = title + title_test
abstract_combined = abstract + abstract_test
#5) get a list of authors in train df
authors = df['authors'].tolist()
#6) get a list of authors in test df
authors_test = df1['authors'].tolist()
#7) clean text for topic in train df
topics = df['topics'].tolist()
topics_1 = []
for i in topics:
    item = []
    for y in i:
        item.append(clean_text(y))
    topics_1.append(item)
topics = topics_1
#8) clean text for topic in test df
topics_test = df1['topics'].tolist()
topics_2 = []
for i in topics_test:
    item = []
    for y in i:
        item.append(clean_text(y))
    topics_2.append(item)
topics_test = topics_2
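#A more compact, equivalent form of the two loops above (sketch only, not used by the script):
#topics = [[clean_text(t) for t in doc] for doc in topics]
#topics_test = [[clean_text(t) for t in doc] for doc in topics_test]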
"""**STEP 3:** Processing categorical variable: Venue, Topics, Fields of study
**Venue**
"""
#1) Venue in train df
df['venue'] = df['venue'].astype("category")
df['venue'] = df['venue'].cat.codes
#2) Venue in test_df
df1['venue'] = df1['venue'].astype("category")
df1['venue'] = df1['venue'].cat.codes
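#Illustrative note: .cat.codes assigns each distinct venue string an integer from that frame's
#own category ordering, e.g. pd.Series(["icml", "neurips", "icml"]).astype("category").cat.codes -> 0, 1, 0.
#Because the train and test venues are encoded separately here, the same venue string may
#receive different codes in the two frames.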
"""**Topics** removed since it decreased performance"""
# #0) read the shape of train df and test df1 -> use this index to split the df later
# print(df.shape)
# print(df1.shape)
# #1) combine 2 list of topics + topics_test
# topics_joined = topics + topics_test
# #2) stack train df and test df1 on top of each other
# pieces = (df,df1)
# df_concat = pd.concat(pieces)
# #3) impute topics_joined into concat df
# df_concat['topics'] = topics_joined
# topics_snip = df_concat['topics']
# #4) apply MultiLabelBinarizer() for all topics in topics_snip df
# mlb = MultiLabelBinarizer()
# topics_df_concat = pd.DataFrame(mlb.fit_transform(topics_snip), columns = mlb.classes_)
# #5) split back the train and test df topics with multilabelbinarizer application
# topics_df_train = topics_df_concat.iloc[:9657,]
# topics_df_test = topics_df_concat.iloc[9657:,]
# topics_df_test = topics_df_test.reset_index()
# topics_df_test = topics_df_test.drop(columns=['index'])
# print(topics_df_train.shape) #print the shape of multilabelbinarizer df for train df
# print(topics_df_test.shape) #print the shape of multilabelbinarizer df for test df
"""**Fields of study** removed since it decreased performance """
# #1) extract the fields of study column from concat df + fill NaN with 0
# fields_of_study_snip = df_concat['fields_of_study']
# fields_of_study_snip = fields_of_study_snip.fillna("0")
# #2) apply MultiLabelBinarizer
# fields_of_study_df_concat = pd.DataFrame(mlb.fit_transform(fields_of_study_snip), columns = mlb.classes_)
# #3) split back the train and test df after the MultiLabelBinarizer application + drop the "0" column (the placeholder used for NaN entries)
# fields_of_study_df_train = fields_of_study_df_concat.iloc[:9657,]
# fields_of_study_df_train = fields_of_study_df_train.drop(columns=['0'])
# fields_of_study_df_test = fields_of_study_df_concat.iloc[9657:,]
# fields_of_study_df_test = fields_of_study_df_test.drop(columns=['0'])
# fields_of_study_df_test = fields_of_study_df_test.reset_index()
# fields_of_study_df_test = fields_of_study_df_test.drop(columns=['index'])
# print(fields_of_study_df_train.shape) #print the shape of multilabelbinarizer df for train df
# print(fields_of_study_df_test.shape) #print the shape of multilabelbinarizer df for test df
"""**STEP 4:** tf-idf for cleaned words in title + abstract in both train and test dataset"""
#1) feature vector title -> df
vec1 = TfidfVectorizer(min_df = 500) #keep only title terms that appear in at least 500 documents
vec2 = TfidfVectorizer(min_df = 0.1) #keep only abstract terms that appear in at least 10% of the documents
test_counts_title_vectors = vec1.fit_transform(title_combined)
title_df = pd.DataFrame(test_counts_title_vectors.todense(), columns=vec1.get_feature_names())
title_df.replace([np.inf, -np.inf], np.nan, inplace=True)
title_df.fillna(0, inplace=True)
#2) feature vector abstract -> df
test_counts_abstract_vectors = vec2.fit_transform(abstract_combined)
abstract_df = pd.DataFrame(test_counts_abstract_vectors.todense(), columns=vec2.get_feature_names())
abstract_df.replace([np.inf, -np.inf], np.nan, inplace=True)
abstract_df.fillna(0, inplace=True)
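#Descriptive note: title_df and abstract_df hold one row per document of the combined corpus,
#train documents first (rows 0..len(title)-1) followed by the test documents; STEP 5 relies on this ordering.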
"""**STEP 5:** Generate train and test features dataset"""
#1) X for train dataset
#Feature selection: title length, abstract length, number of authors, number of topics, number of references, year, is open access, venue, title tf-idf features, abstract tf-idf features
feature_name = ['title_length','abstract_length','authors_number','topics_number','references','year','is_open_access','venue']
#title length
title_length = []
for i in title:
    title_length.append(len(i))
#abstract length
abstract_length = []
for i in abstract:
    abstract_length.append(len(i))
#number of authors
authors_number = []
for i in authors:
    authors_number.append(len(i))
#number of topics
topics_number = []
for i in topics:
    topics_number.append(len(i))
#create feature dataframe for the lists above + references, year, is_open_access, venue
train1 = pd.DataFrame(list(zip(title_length,abstract_length,authors_number,topics_number,df['references'],df['year'],df['is_open_access'],df['venue'])),columns = feature_name)
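#Descriptive note: the inner join below is on the row index; train1 has index 0..n_train-1,
#which lines up with the train portion of title_df/abstract_df (train rows come first in the combined corpus).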
train1 = pd.concat([train1,title_df,abstract_df],join='inner',axis = 1)
train1.replace([np.inf, -np.inf], np.nan, inplace=True)
train1.fillna(0, inplace=True)
# print(tabulate(train1.head(), headers='keys', tablefmt='psql')) -> to glance at the train features if needed
#feature
X_train = train1
#standardize the numerical features (zero mean, unit variance) so they are on a comparable scale
scale = StandardScaler()
X_train[['title_length','abstract_length','authors_number','topics_number','references','year']] = scale.fit_transform(X_train[['title_length','abstract_length','authors_number','topics_number','references','year']])
X_train = np.array(X_train).astype(float)
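#Descriptive note: the scaler fitted on the train features above is reused on the test features
#in part 2) below via scale.transform, so both sets share the same scaling parameters.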
#2) X for test dataset
#title length
title_length_test = []
for i in title_test:
    title_length_test.append(len(i))
#abstract length
abstract_length_test = []
for i in abstract_test:
    abstract_length_test.append(len(i))
#number of authors
authors_number_test = []
for i in authors_test:
    authors_number_test.append(len(i))
#number of topics
topics_number_test = []
for i in topics_test:
    topics_number_test.append(len(i))
#create feature dataframe for the lists above + references, year, is_open_access, venue
test1 = pd.DataFrame(list(zip(title_length_test,abstract_length_test,authors_number_test,topics_number_test,df1['references'],df1['year'],df1['is_open_access'],df1['venue'])),columns = feature_name)
#use the test portion of the combined tf-idf frames (train rows come first), realigned to test1's index
test1 = pd.concat([test1, title_df.iloc[len(title):].reset_index(drop=True), abstract_df.iloc[len(abstract):].reset_index(drop=True)], join='inner', axis=1)
test1.replace([np.inf, -np.inf], np.nan, inplace=True)
test1.fillna(0, inplace=True)
# print(tabulate(test1.head(), headers='keys', tablefmt='psql')) -> to glance at the test features if needed
#feature
X_test = test1
#standardize the numerical test features using the scaler fitted on the train features
X_test[['title_length','abstract_length','authors_number','topics_number','references','year']] = scale.transform(X_test[['title_length','abstract_length','authors_number','topics_number','references','year']])
X_test = np.array(X_test).astype(float)
#print the shape of the train and test feature
print(X_train.shape)
print(X_test.shape)
"""**STEP 6:** Setting up Y_train and Y_test"""
#reshape Y_train and apply log1p to dampen the effect of outliers
Y_train = np.array(df['citations']).reshape(-1,1)
Y_train = np.log1p(Y_train)
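#Illustrative check of the transform pair used here: np.expm1 inverts np.log1p,
#e.g. np.expm1(np.log1p(10.0)) is approximately 10.0, so STEP 8 can map predictions back to citation counts.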
"""**STEP 7:** Hyperparameter tunning to get the best parameters """
#running this search takes hours, therefore we made a separate random forest below with the parameters found by the randomized search
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 5000, num = 10)]
max_depth = [int(x) for x in np.linspace(1, 200, num = 11)]
max_features = [int(x) for x in np.linspace(start = 5, stop = 90, num = 10)]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}
rf1 = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf1, param_distributions = random_grid, n_iter = 20, cv = 5,random_state = 42)
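#To actually run the search (expect several hours on this feature set), one would call for example:
#rf_random.fit(X_train, np.ravel(Y_train))
#print(rf_random.best_params_)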
"""**STEP 8:** Making predictions with the best model"""
#random forest model
rf = RandomForestRegressor(n_estimators=1733, max_depth=27,max_features=30, random_state=42) #parameters found by the randomized search above
rf.fit(X_train,np.ravel(Y_train))
Y_test = rf.predict(X_test)
#convert the log-transformed predictions back to the original citation scale
Y_test = np.expm1(Y_test)
#import the google.colab files helper to download the output (Colab only)
from google.colab import files
#write the predictions to a JSON file
predicted = pd.DataFrame(zip(df1['doi'],Y_test), columns = ['doi','citations'])
predicted.to_json("predicted.json",orient='records') #to_json writes the file and returns None when given a path
#download the file
files.download("predicted.json")
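#Note for non-Colab runs: predicted.json is already written to the working directory by to_json above,
#so the files.download call can simply be skipped outside Colab.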