# visuals.py

###########################################
# Suppress matplotlib user warnings
# Necessary for newer version of matplotlib
import warnings
warnings.filterwarnings("ignore", category = UserWarning, module = "matplotlib")
#
# Display inline matplotlib plots with IPython
from IPython import get_ipython
# get_ipython() returns None when no IPython shell is running (for example when
# this module is imported from a plain script), so only request the inline
# backend when there actually is a shell to ask.
_ipython_shell = get_ipython()
if _ipython_shell is not None:
    _ipython_shell.run_line_magic('matplotlib', 'inline')
###########################################

import matplotlib.pyplot as pl
import matplotlib.patches as mpatches
import numpy as np
import pandas as pd
from time import time
from sklearn.metrics import f1_score, accuracy_score


def distribution(data, transformed = False, features = None):
    """
    Visualization code for displaying skewed distributions of features

    Draws one 25-bin histogram per feature on a grid that is five panels wide.

    inputs:
      - data: container indexed by feature name; data[feature] is handed
        straight to the histogram (presumably a pandas DataFrame - confirm
        against the caller)
      - transformed: when True the figure title announces log-transformed
        distributions, otherwise skewed ones; the data itself is not
        modified here
      - features: optional iterable of feature names to plot; when omitted,
        the thirteen names hard-coded below are used
    """

    if features is None:
        features = ['loan_amnt','emp_length','annual_inc','dti','delinq_2yrs',
                    'earliest_cr_line','inq_last_6mths','open_acc','pub_rec',
                    'revol_bal','revol_util','total_acc','collections_12_mths_ex_med']
    features = list(features)

    # Five panels per row; enough rows to hold every feature
    # (the default thirteen features give the original 3 x 5 grid)
    n_cols = 5
    n_rows = max(1, (len(features) + n_cols - 1) // n_cols)

    # Create figure
    fig = pl.figure(figsize = (11,5))

    # Skewed feature plotting
    for i, feature in enumerate(features):
        ax = fig.add_subplot(n_rows, n_cols, i+1)
        ax.hist(data[feature], bins = 25, color = '#00A0A0')
        ax.set_title("'%s'"%(feature), fontsize = 14)
        ax.set_xlabel("Value")
        ax.set_ylabel("Num. Records")

    # Plot aesthetics
    # NOTE(review): the titles mention "Census Data" although the default
    # feature names look loan-related - confirm the intended wording.
    if transformed:
        fig.suptitle("Log-transformed Distributions of Continuous Census Data Features", fontsize = 16, y = 1.03)
    else:
        fig.suptitle("Skewed Distributions of Continuous Census Data Features", fontsize = 16, y = 1.03)

    fig.tight_layout()
    fig.show()


def evaluate(results, mse, r2):
    """
    Visualization code to display results of various learners.

    Draws a 2 x 3 grid of grouped bar charts. The top row shows training
    time, MSE on the training subset and R^2 on the training subset; the
    bottom row shows prediction time, MSE on the testing set and R^2 on the
    testing set. Each learner gets one colour and one bar per training set
    size (tick labels "1%", "10%", "100%").

    inputs:
      - results: a dictionary keyed by learner name, where
        results[learner][i] (i = 0, 1, 2) is a dictionary holding the keys
        'train_time', 'mse_train', 'R2_train', 'pred_time', 'mse_test' and
        'R2_test'. At most three learners are supported, one per colour.
      - mse: The MSE of the naive predictor, drawn as a dashed line on both
        MSE panels
      - r2: The R^2 of the naive predictor, drawn as a dashed line on both
        R^2 panels
    """

    # Create figure
    fig, ax = pl.subplots(2, 3, figsize = (11,7))

    # Constants
    bar_width = 0.3
    colors = ['#A00000','#00A0A0','#00A000']
    # Listed row by row: the first three metrics fill the top row of panels
    metrics = ['train_time', 'mse_train', 'R2_train', 'pred_time', 'mse_test', 'R2_test']

    # Super loop to plot six panels of data
    for k, learner in enumerate(results.keys()):
        for j, metric in enumerate(metrics):
            # j // 3 is the row and j % 3 the column, so each metric lands on
            # the panel whose title and y-label describe it
            panel = ax[j // 3, j % 3]
            for i in range(3):
                panel.bar(i + k*bar_width, results[learner][i][metric], width = bar_width, color = colors[k])

    # The x-axis is identical on every panel, so configure it once per panel
    for panel in ax.flat:
        panel.set_xticks([0.45, 1.45, 2.45])
        panel.set_xticklabels(["1%", "10%", "100%"])
        panel.set_xlabel("Training Set Size")
        panel.set_xlim((-0.1, 3.0))

    # Add unique y-labels (same label for both rows of a column)
    for row in range(2):
        ax[row, 0].set_ylabel("Time (in seconds)")
        ax[row, 1].set_ylabel("MSE")
        ax[row, 2].set_ylabel("R^2")

    # Add titles
    ax[0, 0].set_title("Model Training")
    ax[0, 1].set_title("MSE on Training Subset")
    ax[0, 2].set_title("R^2 on Training Subset")
    ax[1, 0].set_title("Model Predicting")
    ax[1, 1].set_title("MSE on Testing Set")
    ax[1, 2].set_title("R^2 on Testing Set")

    for row in range(2):
        # Add horizontal lines for naive predictors. xmin/xmax of axhline are
        # axes fractions, so the defaults already span the whole panel.
        ax[row, 1].axhline(y = mse, linewidth = 1, color = 'k', linestyle = 'dashed')
        ax[row, 2].axhline(y = r2, linewidth = 1, color = 'k', linestyle = 'dashed')

        # Set y-limits for score panels
        # NOTE(review): the MSE panels share the (-0.5, 1) range of the R^2
        # panels, so MSE values above 1 are cut off - confirm this is intended.
        ax[row, 1].set_ylim((-.5, 1))
        ax[row, 2].set_ylim((-.5, 1))

    # Create patches for the legend
    patches = [mpatches.Patch(color = colors[i], label = learner)
               for i, learner in enumerate(results.keys())]
    pl.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \
               loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'x-large')

    # Aesthetics
    pl.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 16, y = 1.10)
    pl.tight_layout()
    pl.show()
    
def classevaluate(results, ROCAUC, prec):
    """
    Visualization code to display results of various classification learners.

    Draws a 2 x 3 grid of grouped bar charts. The top row shows training
    time, ROC AUC on the training subset and precision on the training
    subset; the bottom row shows prediction time, ROC AUC on the testing set
    and precision on the testing set. Each learner gets one colour and one
    bar per training set size (tick labels "1%", "10%", "100%").

    inputs:
      - results: a dictionary keyed by learner name, where
        results[learner][i] (i = 0, 1, 2) is a dictionary holding the keys
        'train_time', 'ROC_AUC_train', 'precision_train', 'pred_time',
        'ROC_AUC_test' and 'precision_test'. At most three learners are
        supported, one per colour.
      - ROCAUC: The ROC AUC of the naive predictor, drawn as a dashed line on
        both ROC AUC panels
      - prec: The precision of the naive predictor, drawn as a dashed line on
        both precision panels
    """

    # Create figure
    fig, ax = pl.subplots(2, 3, figsize = (11,7))

    # Constants
    bar_width = 0.3
    colors = ['#A00000','#00A0A0','#00A000']
    # Listed row by row: the first three metrics fill the top row of panels
    metrics = ['train_time', 'ROC_AUC_train', 'precision_train', 'pred_time', 'ROC_AUC_test', 'precision_test']

    # Super loop to plot six panels of data
    for k, learner in enumerate(results.keys()):
        for j, metric in enumerate(metrics):
            # j // 3 is the row and j % 3 the column, so each metric lands on
            # the panel whose title and y-label describe it
            panel = ax[j // 3, j % 3]
            for i in range(3):
                panel.bar(i + k*bar_width, results[learner][i][metric], width = bar_width, color = colors[k])

    # The x-axis is identical on every panel, so configure it once per panel
    for panel in ax.flat:
        panel.set_xticks([0.45, 1.45, 2.45])
        panel.set_xticklabels(["1%", "10%", "100%"])
        panel.set_xlabel("Training Set Size")
        panel.set_xlim((-0.1, 3.0))

    # Add unique y-labels (same label for both rows of a column)
    for row in range(2):
        ax[row, 0].set_ylabel("Time (in seconds)")
        ax[row, 1].set_ylabel("ROC AUC")
        ax[row, 2].set_ylabel("Precision")

    # Add titles
    ax[0, 0].set_title("Model Training")
    ax[0, 1].set_title("ROC AUC on Training Subset")
    ax[0, 2].set_title("Precision on Training Subset")
    ax[1, 0].set_title("Model Predicting")
    ax[1, 1].set_title("ROC AUC on Testing Set")
    ax[1, 2].set_title("Precision on Testing Set")

    for row in range(2):
        # Add horizontal lines for naive predictors. xmin/xmax of axhline are
        # axes fractions, so the defaults already span the whole panel.
        ax[row, 1].axhline(y = ROCAUC, linewidth = 1, color = 'k', linestyle = 'dashed')
        ax[row, 2].axhline(y = prec, linewidth = 1, color = 'k', linestyle = 'dashed')

        # Set y-limits for score panels
        ax[row, 1].set_ylim((0, 1))
        ax[row, 2].set_ylim((0, 1))

    # Create patches for the legend
    patches = [mpatches.Patch(color = colors[i], label = learner)
               for i, learner in enumerate(results.keys())]
    pl.legend(handles = patches, bbox_to_anchor = (-.80, 2.53), \
               loc = 'upper center', borderaxespad = 0., ncol = 3, fontsize = 'x-large')

    # Aesthetics
    pl.suptitle("Performance Metrics for Three Supervised Learning Models", fontsize = 16, y = 1.10)
    pl.tight_layout()
    pl.show()

# def feature_plot(importances, X_train, y_train):    
    
    # Display the five most important features
#     indices = np.argsort(importances)[::-1]
#     columns = X_train.columns.values[indices[:5]]
#     values = importances[indices][:5]
# 
#     # Create the plot
#     fig = pl.figure(figsize = (9,5))
#     pl.title("Normalized Weights for First Five Most Predictive Features", fontsize = 16)
#     pl.bar(np.arange(5), values, width = 0.6, align="center", color = '#00A000', \
#           label = "Feature Weight")
#     pl.bar(np.arange(5) - 0.3, np.cumsum(values), width = 0.2, align = "center", color = '#00A0A0', \
#           label = "Cumulative Feature Weight")
#     pl.xticks(np.arange(5), columns)
#     pl.xlim((-0.5, 4.5))
#     pl.ylabel("Weight", fontsize = 12)
#     pl.xlabel("Feature", fontsize = 12)
#     
#     pl.legend(loc = 'upper center')
#     pl.tight_layout()
#     pl.show()  

def feature_plot(importances, X_train, y_train):
    """
    Visualization code to display the most predictive features.

    For the features with the largest importances (ten of them, or all of
    them when fewer than ten are given) this draws a wide bar with the
    individual weight and, next to it, a narrow bar with the running
    cumulative weight.

    inputs:
      - importances: array-like of feature weights; position i is looked up
        in X_train.columns, so it is expected to follow the column order of
        X_train
      - X_train: training features; only X_train.columns is read, to label
        the bars
      - y_train: not used by the plot; kept so existing callers keep working
    """

    # Positional indexing below needs an ndarray, whatever array-like was given
    importances = np.asarray(importances)

    # Never ask for more bars than there are features
    n_top = min(10, len(importances))

    # Display the most important features, largest weight first
    indices = np.argsort(importances)[::-1][:n_top]
    columns = X_train.columns.values[indices]
    values = importances[indices]
    positions = np.arange(n_top)

    # Keep the original wording for the usual ten-feature case
    count_label = "Ten" if n_top == 10 else str(n_top)

    # Create the plot
    pl.figure(figsize = (10,5))
    pl.title("Normalized Weights for First %s Most Predictive Features" % count_label, fontsize = 16)
    pl.bar(positions, values, width = 0.6, align = "center", color = '#00A000', \
          label = "Feature Weight")
    pl.bar(positions - 0.3, np.cumsum(values), width = 0.2, align = "center", color = '#00A0A0', \
          label = "Cumulative Feature Weight")
    pl.xticks(positions, columns)
    pl.xlim((-0.5, n_top - 0.5))
    pl.ylabel("Weight", fontsize = 12)
    pl.xlabel("Feature", fontsize = 12)

    pl.legend(loc = 'upper center')
    pl.tight_layout()
    pl.show()