-
Notifications
You must be signed in to change notification settings - Fork 25
/
ds_custom_tools.py
106 lines (85 loc) · 4.16 KB
/
ds_custom_tools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
"""
Seaborn CorrPlot() Mimic Function
"""
# Setup with Mask Type Arguments
mask_types = ["numerical", "categorical"]
def _value_to_category(value):
""" Helper function to convert numerical values between -1.0 and 1.0 to discretized categories. """
_categorical_types, _value_ranges_test = ["- (S)", "- (M)", "- (W)", "0", "+ (W)", "+ (M)", "+ (S)"], [-1.0, -0.7, -0.4, -0.1, 0.1, 0.4, 0.7, 1.0]
# Iterates through value ranges and checks if value falls within specific range
for index in range(len(_value_ranges_test) - 1):
if float(value) >= _value_ranges_test[index] and float(value) < _value_ranges_test[index + 1]:
return _categorical_types[index]
def corrplot_(df=None, mask_type="numerical", figsize=(14, 14), fontsize=8, cpalette=(10, 220)):
""" Global function that produces customized correlation plot reducing redundancy. """
if df is None:
raise ReferenceError("\nDataFrame not found.")
corr_data = df.corr()
# Creates whitespace mask over upper right triangle section for repeated features
upper_triangle_mask = np.zeros_like(corr_data, dtype=np.bool)
upper_triangle_mask[np.triu_indices_from(upper_triangle_mask)] = True
# Generates MatPlotLib subplot objects
fig, ax = plt.subplots(figsize=figsize)
# Calculates relative maximum from correlational data
vmax = np.abs(corr_data.values[~upper_triangle_mask]).max()
# Creates correlational heatmap with simple color intensity relative to distribution
cmap = sns.diverging_palette(cpalette[0], cpalette[1], as_cmap=True)
sns.heatmap(corr_data, mask=upper_triangle_mask, cmap=cmap, vmin=-vmax, vmax=vmax, square=True, linecolor="lightgray", linewidths=1, ax=ax)
# Overlays feature names and corr. data values over whitespace mask
for iterator in range(len(corr_data)):
ax.text(iterator+0.5, iterator+0.5, corr_data.columns[iterator], ha="center", va="center", rotation=45)
for jterator in range(iterator+1, len(corr_data)):
value = "{:.3f}".format(corr_data.values[iterator, jterator])
# Switch-case for numerical whitespace mask
if mask_type == "numerical":
ax.text(jterator+0.5, (iterator+0.5), value, ha="center", va="center")
# Switch-case for categorical whitespace mask
if mask_type == "categorical":
ax.text(jterator+0.5, (iterator+0.5), _value_to_category(value), ha="center", va="center", fontsize=fontsize)
ax.axis("off")
"""
Mask Function for Easily Chaining Pandas Filtering Commands
"""
def filterby(df, key, value, operator="="):
""" Function that filters Pandas DataFrame by index (key) and operational parameter (value). Supports method chaining. """
operator_table = {
"=": df[df[key] == value],
"<": df[df[key] > value],
">": df[df[key] < value],
"<=": df[df[key] <= value],
">=": df[df[key] >= value],
"!=": df[df[key] != value]
}
return operator_table[operator]
pd.DataFrame.filterby = filterby
"""
Custom Confusion Matrix Function for Better Visualization
"""
def cmat_(cm, labels=None):
ax = plt.subplot()
sns.heatmap(cm, annot=True, fmt="g", ax=ax)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Labels")
ax.set_ylabel("True Labels")
if labels:
ax.xaxis.set_ticklabels(labels)
ax.yaxis.set_ticklabels(labels)
plt.show()
"""
Display Class to Preview DataFrame Concatenation and Joining
"""
class Display_Preview(object):
""" Display HTML Representative Preview of Multiple Objects/Data Structures."""
template = """<div style="float: left; padding: 10px;">
<p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
</div>"""
def __init__(self, *args):
self.args = args
def _repr_html_(self):
return '\n'.join(self.template.format(arg, eval(arg)._repr_html_()) for arg in self.args)
def __repr__(self):
return '\n\n'.join(arg + '\n' + repr(eval(arg)) for arg in self.args)