Skip to content

Commit

Permalink
<refactor> uses different machine learning pipeline
Browse files Browse the repository at this point in the history
app/run.py
- adds class StartingVerbExtractor

models/train_classifier.py
- uses SGDClassifier
- adds StartingVerbExtractor to Feature Union Pipeline
- adds Grid search to optimize parameters
  • Loading branch information
msrlab committed Sep 23, 2019
1 parent 98caf18 commit 83244dd
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 17 deletions.
31 changes: 30 additions & 1 deletion app/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,46 @@
import pandas as pd

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import word_tokenize, sent_tokenize

from flask import Flask
from flask import render_template, request, jsonify
from plotly.graph_objs import Bar, Scatter
from sklearn.externals import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sqlalchemy import create_engine
import nltk
#nltk.download(['punkt', 'wordnet','stopwords'])


app = Flask(__name__)

class StartingVerbExtractor(BaseEstimator, TransformerMixin):
    '''
    Transformer that flags texts in which a sentence starts with a verb.

    Each input text is mapped to 1 when the first token of one of its
    sentences is POS-tagged 'VB' or 'VBP' (or equals 'RT'), otherwise 0.
    Stateless: fit() learns nothing, so it can be dropped into an sklearn
    Pipeline / FeatureUnion as a feature step.
    '''

    def starting_verb(self, text):
        '''
        Check whether a sentence of `text` starts with a verb.

        text: str, split into sentences with nltk.sent_tokenize
        returns: 1 if a sentence's first token is tagged 'VB'/'VBP' or is
                 the literal 'RT', else 0
        '''
        for sentence in nltk.sent_tokenize(text):
            pos_tags = nltk.pos_tag(tokenize(sentence))

            # A sentence can yield no tokens at all. The previous
            # implementation reached this case through a bare `except:` and
            # returned 0 immediately; the same result is kept on purpose so
            # the feature stays identical for already-trained models, but
            # without swallowing unrelated errors (e.g. KeyboardInterrupt).
            if not pos_tags:
                return 0

            first_word, first_tag = pos_tags[0]
            # NOTE(review): 'RT' is presumably the retweet marker; if
            # tokenize() lower-cases its tokens this comparison can never
            # match - confirm against tokenize().
            if first_tag in ('VB', 'VBP') or first_word == 'RT':
                return 1
        return 0

    def fit(self, x, y=None):
        '''
        No-op fit, present to satisfy the sklearn estimator interface.

        returns: self
        '''
        return self

    def transform(self, X):
        '''
        Apply starting_verb to every text in X.

        X: iterable of str
        returns: pandas DataFrame holding the 0/1 flag for each text
        '''
        flags = pd.Series(X).apply(self.starting_verb)
        return pd.DataFrame(flags)

def tokenize(text):
tokens = word_tokenize(text)
lemmatizer = WordNetLemmatizer()
Expand Down
Binary file modified models/classifier.pkl
Binary file not shown.
Binary file modified models/classifier_metrics.pkl
Binary file not shown.
52 changes: 36 additions & 16 deletions models/train_classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputClassifier
Expand All @@ -24,7 +24,8 @@
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.externals import joblib

import nltk
nltk.download(['punkt', 'wordnet','stopwords'])
## define some custom stopwords
#full stopwords from nltk
stopwords_a= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
Expand Down Expand Up @@ -118,7 +119,7 @@ def tokenize(text):
text: str that will be tokenized
returns new_tokens (list of extracted tokens)
'''
'''

#remove punctuation
text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
Expand Down Expand Up @@ -159,21 +160,40 @@ def transform(self, X):

def build_model():
    '''
    define pipeline and gridsearch object for feature extraction and training classifier

    The pipeline unions two feature sources (tf-idf weighted token counts
    and the StartingVerbExtractor flag) and feeds them into one
    SGDClassifier per output column via MultiOutputClassifier.

    returns gridsearch object wrapping the pipeline
    '''
    pipeline = Pipeline([
        ('features', FeatureUnion([
            ('tfidf_pipeline', Pipeline([
                ('vect', CountVectorizer(tokenizer=tokenize)),
                ('tfidf', TfidfTransformer())
            ])),
            ('starting_verb', StartingVerbExtractor()),
        ])),
        ('clf', MultiOutputClassifier(SGDClassifier()))
    ])

    # Wider grid kept for reference; swap it in to re-run the full search.
    # (max_df must be the float 1.0 to mean "all documents" - an int is
    # interpreted by CountVectorizer as an absolute document count.)
    #parameters = {'features__tfidf_pipeline__vect__max_df': (0.6, 0.8, 1.0),
    #              'features__tfidf_pipeline__vect__ngram_range': ((1,1),(1, 2)),
    #              'features__tfidf_pipeline__vect__stop_words': (stopwords_a,stopwords_b),
    #              'features__tfidf_pipeline__vect__max_features': (None, 10000),
    #              'clf__estimator__max_iter': (50,),
    #              'clf__estimator__alpha': (0.00001,),
    #              'clf__estimator__penalty': ('elasticnet','l2')}

    # Single-candidate grid: one value per parameter, so the search only
    # cross-validates this one configuration.
    parameters = {'features__tfidf_pipeline__vect__max_df': (0.6,),
                  'features__tfidf_pipeline__vect__ngram_range': ((1, 2),),
                  'features__tfidf_pipeline__vect__stop_words': (stopwords_a,),
                  'features__tfidf_pipeline__vect__max_features': (None,),
                  'clf__estimator__max_iter': (50,),
                  'clf__estimator__alpha': (0.00001,),
                  'clf__estimator__penalty': ('elasticnet',)}
    cv = GridSearchCV(pipeline, param_grid=parameters, cv=5, n_jobs=1,
                      verbose=2, scoring=make_scorer(roc_auc_score))

    # Return `pipeline` instead to train without cross-validated search.
    return cv

def evaluate_model(model, X_test, Y_test, category_names):
'''
Expand All @@ -186,7 +206,7 @@ def get_metrics (y_test, y_pred):
y_test: dataframe with true labels (binary)
y_pred: numpy array with predicted labels (y_pred = XXXX.predict(X_test) from an sklearn estimator)
returns: dataframe with accuracy, precision, f1, recall, tp, tn, fp, fn, roc_auc
returns: dataframe with accuracy, precision, f1, recall, tp, tn, fp, fn, roc_auc
Expand Down

0 comments on commit 83244dd

Please sign in to comment.