<refactor> uses different machine learning pipeline

app/run.py: adds class StartingVerbExtractor. models/train_classifier.py: uses SGDClassifier; adds StartingVerbExtractor to the FeatureUnion pipeline; adds grid search to optimize parameters.
msrlab · Sep 23, 2019 · 83244dd · 83244dd
1 parent 98caf18
commit 83244dd
Show file tree

Hide file tree

Showing 4 changed files with 66 additions and 17 deletions.
diff --git a/app/run.py b/app/run.py
@@ -3,17 +3,46 @@
 import pandas as pd
 
 from nltk.stem import WordNetLemmatizer
-from nltk.tokenize import word_tokenize
+from nltk.tokenize import word_tokenize, sent_tokenize
 
 from flask import Flask
 from flask import render_template, request, jsonify
 from plotly.graph_objs import Bar, Scatter
 from sklearn.externals import joblib
+from sklearn.base import BaseEstimator, TransformerMixin
 from sqlalchemy import create_engine
+import nltk
+#nltk.download(['punkt', 'wordnet','stopwords'])
 
 
 app = Flask(__name__)
 
+class StartingVerbExtractor(BaseEstimator, TransformerMixin):
+    '''
+    Flag texts in which some sentence starts with a verb (tags VB/VBP) or 'RT'.
+    Usable as a feature step in an sklearn Pipeline / FeatureUnion; transform()
+    returns a one-column DataFrame of 0/1 flags.
+    '''
+    def starting_verb(self, text):
+        '''Return 1 if any sentence of text starts with a verb or 'RT', else 0.'''
+        for sentence in nltk.sent_tokenize(text):
+            pos_tags = nltk.pos_tag(tokenize(sentence))
+            if not pos_tags:
+                # no tokens in this sentence; keep checking the remaining ones
+                continue
+            first_word, first_tag = pos_tags[0]
+            if first_tag in ('VB', 'VBP') or first_word == 'RT':
+                return 1
+        return 0
+
+    def fit(self, x, y=None):
+        '''Nothing to learn; return self.'''
+        return self
+
+    def transform(self, X):
+        '''Apply starting_verb to every text in X.'''
+        return pd.DataFrame(pd.Series(X).apply(self.starting_verb))
+
 def tokenize(text):
     tokens = word_tokenize(text)
     lemmatizer = WordNetLemmatizer()

diff --git a/models/classifier.pkl b/models/classifier.pkl
diff --git a/models/classifier_metrics.pkl b/models/classifier_metrics.pkl
diff --git a/models/train_classifier.py b/models/train_classifier.py
@@ -15,7 +15,7 @@
 from sklearn.pipeline import Pipeline, FeatureUnion
 from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
 from sklearn.svm import SVC
-from sklearn.linear_model import LogisticRegression
+from sklearn.linear_model import LogisticRegression, SGDClassifier
 from sklearn.metrics import roc_auc_score
 from sklearn.metrics import make_scorer
 from sklearn.multioutput import MultiOutputClassifier
@@ -24,7 +24,8 @@
 from sklearn.metrics import classification_report
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.externals import joblib
-
+import nltk
+nltk.download(['punkt', 'wordnet','stopwords'])
 ## define some custom stopwords
 #full stopwords from nltk
 stopwords_a= ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
@@ -118,7 +119,7 @@ def tokenize(text):
     text: str that will be tokenized
 
     returns new_tokens (list of extracted tokens)
-    '''  
+    '''
 
     #remove punctuation
     text = re.sub(r"[^a-zA-Z0-9]", " ", text.lower())
@@ -159,21 +160,40 @@ def transform(self, X):
 
 def build_model():
     '''
-    define pipeline and/or gridsearch object for feature extraction and trainig classifier 
+    define pipeline and/or gridsearch object for feature extraction and training classifier
     returns pipeline or gridsearch object
-    '''  
+    '''
     pipeline = Pipeline([
-        ('vect', CountVectorizer(tokenizer=tokenize)),
-        ('tfidf', TfidfTransformer()),
-        ('clf', MultiOutputClassifier(AdaBoostClassifier()))
+        ('features', FeatureUnion([
+            ('tfidf_pipeline', Pipeline([
+                ('vect', CountVectorizer(tokenizer=tokenize)),
+                ('tfidf', TfidfTransformer())
+            ])),
+            ('starting_verb', StartingVerbExtractor()),
+        ])),
+        ('clf', MultiOutputClassifier(SGDClassifier()))
     ])
-    #parameters = {'vect__max_df': (0.33, 0.66),
-    #              'vect__ngram_range': [(1, 1),(1, 3)],
-    #               'vect__stop_words': [stopwords_a, stopwords_b, stopwords_d]}
-    #cv = GridSearchCV(pipeline, param_grid = parameters, cv=3, n_jobs=1,
-    #                  verbose = 2, scoring = make_scorer(roc_auc_score))
-    #return cv
-    return pipeline
+
+#parameters = {'features__tfidf_pipeline__vect__max_df': (0.6, 0.8, 1),
+#              'features__tfidf_pipeline__vect__ngram_range': ((1,1),(1, 2)),
+#              'features__tfidf_pipeline__vect__stop_words': (stopwords_a,stopwords_b),
+#              'features__tfidf_pipeline__vect__max_features': (None, 10000),
+#              'clf__estimator__max_iter': (50,),
+#              'clf__estimator__alpha': (0.00001,),
+#              'clf__estimator__penalty': ('elasticnet','l2')}
+
+    parameters = {'features__tfidf_pipeline__vect__max_df': (0.6,),
+              'features__tfidf_pipeline__vect__ngram_range': ((1, 2),),
+              'features__tfidf_pipeline__vect__stop_words': (stopwords_a,),
+              'features__tfidf_pipeline__vect__max_features': (None,),
+              'clf__estimator__max_iter': (50,),
+              'clf__estimator__alpha': (0.00001,),
+              'clf__estimator__penalty': ('elasticnet',)}
+    cv = GridSearchCV(pipeline, param_grid = parameters, cv=5, n_jobs=1,
+                      verbose = 2, scoring = make_scorer(roc_auc_score))
+
+    return cv
+    #return pipeline
 
 def evaluate_model(model, X_test, Y_test, category_names):
     '''
@@ -186,7 +206,7 @@ def get_metrics (y_test, y_pred):
         y_test: dataframe with true labels (binary)
         y_pred: numpy array with predicted labels (y_pred = XXXX.predict(X_test) from an sklearn estimator)
 
-        returns: dataframe with accuracy, precision, f1, recall, tp, tn, fp, fn, roc_auc 
+        returns: dataframe with accuracy, precision, f1, recall, tp, tn, fp, fn, roc_auc