Skip to content

Commit

Permalink
Merge pull request #1 from UHBristolDataScience/frontiers-review
Browse files Browse the repository at this point in the history
Frontiers review
  • Loading branch information
rustyBilges authored Jan 31, 2021
2 parents e6d18e5 + b2339e7 commit 19cdb86
Show file tree
Hide file tree
Showing 9 changed files with 2,538 additions and 3,076 deletions.
1,940 changes: 0 additions & 1,940 deletions Figures 1 and 2.bak.ipynb

This file was deleted.

25 changes: 14 additions & 11 deletions cap_helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,20 @@
import numpy as np
import re

DATA_DIR = '../data/'

"""
Notes on time conversion:
Where days or months are missing from the date it defaults to the 1st day/month. So times before DOD will be overestimated. One option would be to use years rather than months.
Probably missing some patterns below, but this should cover most cases (notably 21 Jan 2020 will leave a hanging 21). There is one typo written '22/112007' which fails silently (just replaces the date with a space). And other dates in format 21-01-2020 which fail.
"""


def load_data(data_dir):
    """Load the committee-reviews spreadsheet from *data_dir*.

    Parameters
    ----------
    data_dir : str
        Path (including trailing separator) to the directory holding the
        data files.

    Returns
    -------
    pandas.DataFrame
        The '_20191028_committee_reviews_nlp' sheet of the source workbook.
    """
    # The openpyxl engine is required to read .xlsx files on newer pandas.
    return pd.read_excel(data_dir + '20191028_committee_reviews_nlp_code.xlsx',
                         sheet_name='_20191028_committee_reviews_nlp',
                         engine='openpyxl')


def concatenate_feature_columns(df, columns=None):
def concatenate_feature_columns(df, columns=None, remove_nl=True):

if columns is None:
# create column with concatenation of all columns for any case,
Expand All @@ -29,13 +27,18 @@ def concatenate_feature_columns(df, columns=None):
cols = columns

df['combined'] = df[cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
df['combined'] = df['combined'].replace({'nan': ''}, regex=True)

if remove_nl:
df['combined'] = df['combined'].replace({'_x000d_': ''}, regex=True)

return df


def add_dates(df):
def add_dates(df, data_dir):

deaths = pd.read_csv(DATA_DIR + '20200423_extra_dod_dodx.txt')
deaths = deaths.merge(pd.read_csv(DATA_DIR + '20200429_cap_id_lookup.txt'), on='cp1random_id_5_char')
deaths = pd.read_csv(data_dir + '20200423_extra_dod_dodx.txt')
deaths = deaths.merge(pd.read_csv(data_dir + '20200429_cap_id_lookup.txt'), on='cp1random_id_5_char')
df = df.merge(deaths, on='cp1id')

df['cnr19datedeath'] = pd.to_datetime(df['cnr19datedeath'], dayfirst=True)
Expand All @@ -45,9 +48,9 @@ def add_dates(df):
return df


def add_reviewer_ids(df, data_dir):
    """Attach reviewer identifiers to *df* by merging on 'cp1id'.

    Parameters
    ----------
    df : pandas.DataFrame
        Review data containing a 'cp1id' column.
    data_dir : str
        Path (including trailing separator) to the directory holding the
        data files.

    Returns
    -------
    pandas.DataFrame
        *df* inner-merged with the reviewer lookup table; rows whose
        'cp1id' has no match in the lookup are dropped (pandas merge
        default is an inner join).
    """
    reviewers = pd.read_csv(data_dir + '20191119_committee_reviews_nlp_code_update.csv')
    df = df.merge(reviewers, on='cp1id')

    return df
Expand Down
805 changes: 805 additions & 0 deletions classifier_performance_comparison.ipynb

Large diffs are not rendered by default.

121 changes: 120 additions & 1 deletion explainability.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,26 @@
import pandas as pd
import numpy as np
import shap
import fatf
import fatf.transparency.predictions.surrogate_explainers as fatf_surrogates
import fatf.vis.lime as fatf_vis_lime
from treeinterpreter import treeinterpreter as ti
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


RANDOM_STATE = 42
TEST_SIZE = 0.2
fatf.setup_random_seed(RANDOM_STATE)

def get_rf_feature_importances(grid_search_rf):
    """Rank random-forest feature importances from a fitted grid search.

    Builds a frame with one row per vectoriser feature, ordered by the
    absolute importance reported by the forest, and adds a 0-based
    'rank_rf' column reflecting that order.
    """
    estimator = grid_search_rf.best_estimator_
    table = pd.DataFrame({
        'feature': estimator['vect'].get_feature_names(),
        'contribution': estimator['clf'].feature_importances_,
    })
    table['magnitude'] = table['contribution'].abs()
    table = table.sort_values('magnitude', ascending=False)
    table['rank_rf'] = range(len(table))

    return table

Expand Down Expand Up @@ -54,7 +64,8 @@ def get_ti_feature_contributions_for_instance_i(i, contributions, grid_search_rf
result['magnitude'] = np.absolute(contributions[i, :, 0])

result = result.loc[~result.feature.apply(lambda x: any(char.isdigit() for char in x))]
result.sort_values(by='contribution', inplace=True)
result.sort_values(by='magnitude', inplace=True, ascending=False)
result['rank_ti'] = range(len(result))

return result

Expand All @@ -69,5 +80,113 @@ def get_ti_feature_contributions_average(contributions, grid_search_rf):

result = result.reset_index().rename(columns={'index': 'feature'})
result['magnitude'] = np.absolute(result['contribution'])
result = result.loc[~result.feature.apply(lambda x: any(char.isdigit() for char in x))]
result.sort_values(by='magnitude', inplace=True, ascending=False)
result['rank_ti'] = range(len(result))

return result

def get_lime_explanation_instance(grid_search_clf, data, index_to_explain, ns=500):
    """Explain one document's 'pc' prediction with a LIME surrogate.

    Parameters
    ----------
    grid_search_clf : fitted grid search whose best estimator is a
        'vect' -> 'tfidf' -> 'clf' pipeline.
    data : iterable of str
        Raw documents; re-vectorised here to build the LIME background set.
    index_to_explain : int
        Row of the vectorised matrix to explain.
    ns : int, optional
        Number of LIME perturbation samples (default 500).

    Returns
    -------
    pandas.DataFrame
        Columns 'feature', 'contribution', 'magnitude' and a 0-based
        'rank_lime' ordered by absolute contribution.
    """
    X = grid_search_clf.best_estimator_['vect'].fit_transform(data).toarray()
    X = grid_search_clf.best_estimator_['tfidf'].fit_transform(X).toarray()

    # Hoist the feature list: it was previously fetched twice.
    feature_names = grid_search_clf.best_estimator_['vect'].get_feature_names()

    lime = fatf_surrogates.TabularBlimeyLime(
        X,
        grid_search_clf.best_estimator_['clf'],
        feature_names=feature_names,
        class_names=['np', 'pc']
    )

    lime_explanation = lime.explain_instance(
        X[index_to_explain, :], samples_number=ns
    )

    result = pd.DataFrame()
    result['feature'] = feature_names
    # NOTE(review): assumes the explanation dict preserves the vectoriser's
    # feature order -- confirm against fatf's TabularBlimeyLime output.
    result['contribution'] = list(lime_explanation['pc'].values())
    result['magnitude'] = result['contribution'].abs()
    result.sort_values('magnitude', ascending=False, inplace=True)
    result['rank_lime'] = range(len(result))

    return result

def get_lime_explanation_average(grid_search_clf, data, ns=500):
    """Average LIME 'pc' contributions over every document in *data*.

    Runs a fresh LIME explanation for each row of the vectorised corpus and
    averages the per-feature contributions.

    Warning: this evaluates LIME once per document, so cost grows as
    len(data) * ns model calls; it can be slow on large corpora.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature', 'contribution' (mean over documents),
        'magnitude' and a 0-based 'rank_lime' ordered by magnitude.
    """
    X = grid_search_clf.best_estimator_['vect'].fit_transform(data).toarray()
    X = grid_search_clf.best_estimator_['tfidf'].fit_transform(X).toarray()

    # Hoist the feature list: it was previously fetched twice.
    feature_names = grid_search_clf.best_estimator_['vect'].get_feature_names()

    lime = fatf_surrogates.TabularBlimeyLime(
        X,
        grid_search_clf.best_estimator_['clf'],
        feature_names=feature_names,
        class_names=['np', 'pc']
    )

    average_contribution = np.zeros(len(feature_names))

    for i in range(len(X)):
        lime_explanation = lime.explain_instance(
            X[i, :], samples_number=ns
        )
        # NOTE(review): assumes the explanation dict preserves the
        # vectoriser's feature order -- confirm against fatf's output.
        average_contribution += np.fromiter(
            lime_explanation['pc'].values(), dtype=float,
            count=len(feature_names)
        )

    result = pd.DataFrame()
    result['feature'] = feature_names
    result['contribution'] = average_contribution / len(X)
    result['magnitude'] = result['contribution'].abs()
    result.sort_values('magnitude', ascending=False, inplace=True)
    result['rank_lime'] = range(len(result))

    return result

def get_shap_value_average(grid_search_clf, data):
    """Average SHAP values for the 'pc' class over the whole corpus.

    Vectorises *data* with the pipeline's 'vect' and 'tfidf' steps, computes
    SHAP values for the fitted classifier, and returns per-feature mean
    contributions.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature', 'contribution', 'magnitude' and a 0-based
        'rank_shap' ordered by magnitude.
    """
    X = grid_search_clf.best_estimator_['vect'].fit_transform(data).toarray()
    X = grid_search_clf.best_estimator_['tfidf'].fit_transform(X).toarray()

    features = grid_search_clf.best_estimator_['vect'].get_feature_names()

    # Build the frame in one shot instead of one column per loop iteration
    # (the loop created a fragmented frame and was slow for many features).
    X_df = pd.DataFrame(X, columns=features)

    explainer = shap.Explainer(grid_search_clf.best_estimator_['clf'])
    shap_values = explainer(X_df)

    result = pd.DataFrame()
    result['feature'] = features
    # shap_values.values is (samples, features, classes); index 1 == 'pc'.
    result['contribution'] = shap_values.values[:, :, 1].mean(axis=0)
    result['magnitude'] = result['contribution'].abs()
    result.sort_values('magnitude', ascending=False, inplace=True)
    result['rank_shap'] = range(len(result))

    return result

def get_shap_values(grid_search_clf, documents):
    """Compute SHAP values on the training split of *documents*.

    Vectorises the documents, reproduces the train/test split used elsewhere
    in this module (TEST_SIZE / RANDOM_STATE constants), and returns the raw
    SHAP explanation object for the training rows only.
    """
    X = grid_search_clf.best_estimator_['vect'].fit_transform(documents).toarray()
    X = grid_search_clf.best_estimator_['tfidf'].fit_transform(X).toarray()

    features = grid_search_clf.best_estimator_['vect'].get_feature_names()

    # Only the training rows are explained; the test split is discarded
    # (the original bound it to an unused local).
    X_train_L, _ = train_test_split(
        X, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

    # Build the frame in one shot instead of one column per loop iteration.
    X_train_df = pd.DataFrame(X_train_L, columns=features)

    explainer = shap.Explainer(grid_search_clf.best_estimator_['clf'])
    shap_values = explainer(X_train_df)

    return shap_values

Loading

0 comments on commit 19cdb86

Please sign in to comment.