Skip to content

Commit

Permalink
add support for datetime features (#122)
Browse files Browse the repository at this point in the history
  • Loading branch information
pplonski committed Jul 14, 2020
1 parent c44df82 commit c6b925e
Show file tree
Hide file tree
Showing 8 changed files with 235 additions and 8 deletions.
107 changes: 107 additions & 0 deletions supervised/preprocessing/datetime_transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import numpy as np
import pandas as pd
import datetime
import json

class DateTimeTransformer(object):
    """Replace one datetime column with numeric calendar features.

    ``fit`` inspects a datetime column and records which derived features
    (year, month, day, weekday, day of year, hour, and whole days elapsed
    since the minimum datetime seen during ``fit``) take more than one
    distinct value. ``transform`` writes those features as new columns and
    drops the original datetime column.
    """

    # (transform key, suffix appended to the source column name), listed in
    # the order in which the feature columns are created. Every key except
    # "days_diff" is also the name of the pandas ``.dt`` accessor attribute.
    _FEATURES = (
        ("year", "_Year"),
        ("month", "_Month"),
        ("day", "_Day"),
        ("weekday", "_WeekDay"),
        ("dayofyear", "_DayOfYear"),
        ("hour", "_Hour"),
        ("days_diff", "_Days_Diff_To_Min"),
    )

    def __init__(self):
        # names of the feature columns selected by fit()
        self._new_columns = []
        # name of the datetime column being replaced
        self._old_column = None
        # minimum datetime observed in fit(); reference point for "days_diff"
        self._min_datetime = None
        # keys (see _FEATURES) of the features selected by fit()
        self._transforms = []

    def _feature_values(self, X, key):
        """Return the values of feature ``key`` computed from the source column of ``X``."""
        series = X[self._old_column]
        if key == "days_diff":
            return (series - self._min_datetime).dt.days
        return getattr(series.dt, key)

    def fit(self, X, column):
        """Select the features of ``X[column]`` that have more than one unique value.

        Args:
            X: DataFrame holding the column; it is not modified.
            column: name of a column that supports the pandas ``.dt`` accessor.
        """
        self._old_column = column
        self._min_datetime = np.min(X[column])
        # Start from a clean slate so that calling fit() again does not
        # append the same transforms / column names a second time.
        self._transforms = []
        self._new_columns = []

        for key, suffix in self._FEATURES:
            # A feature with a single unique value carries no information.
            if len(np.unique(self._feature_values(X, key))) > 1:
                self._transforms.append(key)
                self._new_columns.append(column + suffix)

    def transform(self, X):
        """Add the selected feature columns to ``X`` and drop the source column.

        ``X`` is modified in place and the same object is returned.
        """
        column = self._old_column

        for key, suffix in self._FEATURES:
            if key in self._transforms:
                X[column + suffix] = self._feature_values(X, key)

        X.drop(column, axis=1, inplace=True)
        return X

    def to_json(self):
        """Return the fitted state as a dict of JSON-serializable values."""
        min_datetime = self._min_datetime
        data_json = {
            "new_columns": list(self._new_columns),
            "old_column": self._old_column,
            # Keep None as None (JSON null) instead of the string "None",
            # which cannot be parsed back into a datetime.
            "min_datetime": None if min_datetime is None else str(min_datetime),
            "transforms": list(self._transforms),
        }
        return data_json

    def from_json(self, data_json):
        """Restore the state produced by ``to_json``.

        Missing keys fall back to the same values ``__init__`` sets.
        """
        self._new_columns = list(data_json.get("new_columns") or [])
        self._old_column = data_json.get("old_column", None)
        d = data_json.get("min_datetime", None)
        # "None" is what earlier to_json() output contained for an unfitted
        # transformer; treat it the same as a missing value.
        if d is None or d == "None":
            self._min_datetime = None
        else:
            self._min_datetime = pd.to_datetime(d)
        self._transforms = list(data_json.get("transforms") or [])
43 changes: 43 additions & 0 deletions supervised/preprocessing/preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from supervised.preprocessing.scale import Scale
from supervised.preprocessing.label_encoder import LabelEncoder
from supervised.preprocessing.label_binarizer import LabelBinarizer
from supervised.preprocessing.datetime_transformer import DateTimeTransformer
from supervised.preprocessing.exclude_missing_target import ExcludeRowsMissingTarget
from supervised.algorithms.registry import (
BINARY_CLASSIFICATION,
Expand Down Expand Up @@ -42,6 +43,7 @@ def __init__(
self._categorical = []
self._scale = []
self._remove_columns = []
self._datetime_transforms = []

def _exclude_missing_targets(self, X=None, y=None):
# check if there are missing values in target column
Expand Down Expand Up @@ -148,6 +150,23 @@ def fit_and_transform(self, X_train, y_train):
X_train = convert.transform(X_train)
self._categorical += [convert]

# datetime transform
cols_to_process = list(
filter(
lambda k: "datetime_transform" in columns_preprocessing[k],
columns_preprocessing,
)
)

new_datetime_columns = []
for col in cols_to_process:

t = DateTimeTransformer()
t.fit(X_train, col)
X_train = t.transform(X_train)
self._datetime_transforms += [t]
new_datetime_columns += t._new_columns

# SCALE
for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
cols_to_process = list(
Expand All @@ -156,6 +175,9 @@ def fit_and_transform(self, X_train, y_train):
columns_preprocessing,
)
)
if len(cols_to_process) and len(new_datetime_columns) and scale_method == Scale.SCALE_NORMAL:
cols_to_process += new_datetime_columns

if len(cols_to_process):
scale = Scale(cols_to_process)
scale.fit(X_train)
Expand Down Expand Up @@ -244,6 +266,11 @@ def transform(self, X_validation, y_validation):
for convert in self._categorical:
if X_validation is not None and convert is not None:
X_validation = convert.transform(X_validation)

for dtt in self._datetime_transforms:
if X_validation is not None and dtt is not None:
X_validation = dtt.transform(X_validation)

for scale in self._scale:
if X_validation is not None and scale is not None:
X_validation = scale.transform(X_validation)
Expand Down Expand Up @@ -372,6 +399,14 @@ def to_json(self):
cats += [cat.to_json()]
if cats:
preprocessing_params["categorical"] = cats

if self._datetime_transforms is not None and len(self._datetime_transforms):
dtts = []
for dtt in self._datetime_transforms:
dtts += [dtt.to_json()]
if dtts:
preprocessing_params["datetime_transforms"] = dtts

if self._scale is not None and len(self._scale):
scs = [sc.to_json() for sc in self._scale if sc.to_json()]
if scs:
Expand Down Expand Up @@ -403,6 +438,14 @@ def from_json(self, data_json):
cat = PreprocessingCategorical()
cat.from_json(cat_data)
self._categorical += [cat]

if "datetime_transforms" in data_json:
self._datetime_transforms = []
for dtt_params in data_json["datetime_transforms"]:
dtt = DateTimeTransformer()
dtt.from_json(dtt_params)
self._datetime_transforms += [dtt]

if "scale" in data_json:
self._scale = []
for scale_data in data_json["scale"]:
Expand Down
17 changes: 17 additions & 0 deletions supervised/preprocessing/preprocessing_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,17 +11,21 @@ class PreprocessingMissingValues(object):
FILL_NA_MIN = "na_fill_min_1"
FILL_NA_MEAN = "na_fill_mean"
FILL_NA_MEDIAN = "na_fill_median"
FILL_DATETIME = "na_fill_datetime"

NA_EXCLUDE = "na_exclude"
MISSING_VALUE = "_missing_value_"
REMOVE_COLUMN = "remove_column"



def __init__(self, columns=None, na_fill_method=FILL_NA_MEDIAN):
    """Set up the missing-value filler.

    Args:
        columns: column names stored on the instance; ``None`` (the default)
            means an empty list.
        na_fill_method: one of the ``FILL_NA_*`` constants of this class.
    """
    # A `[]` default would be a single list object shared by every instance
    # created without `columns`; build a fresh list per instance instead.
    self._columns = [] if columns is None else columns
    # fill method
    self._na_fill_method = na_fill_method
    # fill parameters stored as a dict, feature -> fill value
    self._na_fill_params = {}
    # names of datetime-typed columns that received a fill value
    self._datetime_columns = []

def fit(self, X):
X = self._fit_na_fill(X)
Expand All @@ -31,6 +35,8 @@ def _fit_na_fill(self, X):
if np.sum(pd.isnull(X[column]) == True) == 0:
continue
self._na_fill_params[column] = self._get_fill_value(X[column])
if PreprocessingUtils.get_type(X[column]) == PreprocessingUtils.DATETIME:
self._datetime_columns += [column]

def _get_fill_value(self, x):
# categorical type
Expand All @@ -40,6 +46,10 @@ def _get_fill_value(self, x):
PreprocessingMissingValues.MISSING_VALUE
) # add new categorical value
return PreprocessingUtils.get_most_frequent(x)

if PreprocessingUtils.get_type(x) == PreprocessingUtils.DATETIME:
return PreprocessingUtils.get_most_frequent(x)

# numerical type
if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
return PreprocessingUtils.get_min(x) - 1.0
Expand Down Expand Up @@ -71,12 +81,19 @@ def to_json(self):
params = {
"fill_method": self._na_fill_method,
"fill_params": self._na_fill_params,
"datetime_columns": list(self._datetime_columns)
}
for col in self._datetime_columns:
params["fill_params"][col] = str(params["fill_params"][col])
return params

def from_json(self, params):
    """Load fill settings from ``params`` (a dict, or ``None`` to clear them).

    Fill values of the columns listed under ``"datetime_columns"`` are
    converted with ``pd.to_datetime``; the conversion is written back into
    the ``"fill_params"`` dict taken from ``params``.
    """
    if params is None:
        # nothing stored: clear the fill settings
        self._na_fill_method, self._na_fill_params = None, None
        self._datetime_columns = []
        return

    self._na_fill_method = params.get("fill_method", None)
    fill_values = params.get("fill_params", {})
    self._na_fill_params = fill_values
    self._datetime_columns = params.get("datetime_columns", [])
    for dt_column in self._datetime_columns:
        fill_values[dt_column] = pd.to_datetime(fill_values[dt_column])
9 changes: 9 additions & 0 deletions supervised/preprocessing/preprocessing_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class PreprocessingUtils(object):
CATEGORICAL = "categorical"
CONTINOUS = "continous"
DISCRETE = "discrete"
DATETIME = "datetime"

@staticmethod
def get_type(x):
Expand All @@ -26,13 +27,21 @@ def get_type(x):
data_type = PreprocessingUtils.CONTINOUS
elif col_type.startswith("int"):
data_type = PreprocessingUtils.DISCRETE
elif col_type.startswith("datetime"):
data_type = PreprocessingUtils.DATETIME

return data_type

@staticmethod
def is_categorical(x_org):
    """Return True if ``get_type`` classifies the non-null values of ``x_org`` as CATEGORICAL."""
    non_missing = x_org[pd.notnull(x_org)]
    detected_type = PreprocessingUtils.get_type(non_missing)
    return detected_type == PreprocessingUtils.CATEGORICAL

@staticmethod
def is_datetime(x_org):
    """Return True if ``get_type`` classifies the non-null values of ``x_org`` as DATETIME."""
    non_missing = x_org[pd.notnull(x_org)]
    detected_type = PreprocessingUtils.get_type(non_missing)
    return detected_type == PreprocessingUtils.DATETIME

@staticmethod
def is_0_1(x_org):
x = x_org[~pd.isnull(x_org)]
Expand Down
2 changes: 2 additions & 0 deletions supervised/tuner/data_info.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ def compute(X, y, machinelearning_task):
if PreprocessingUtils.is_categorical(X[col]):
columns_info[col] += ["categorical"]
columns_info[col] += [EncodingSelector.get(X, y, col)]
elif PreprocessingUtils.is_datetime(X[col]):
columns_info[col] += ["datetime_transform"]
else:
# numeric type, check if scale needed
if PreprocessingUtils.is_scale_needed(X[col]):
Expand Down
14 changes: 6 additions & 8 deletions supervised/tuner/preprocessing_tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,22 +45,20 @@ def get(required_preprocessing, data_info, machinelearning_task):
# convert to categorical only for categorical types
convert_to_integer_will_be_applied = False
if (
"convert_categorical"
in required_preprocessing # the algorithm needs converted categoricals
and "categorical" in preprocessing_needed # the feature is categorical
"convert_categorical" in required_preprocessing # the algorithm needs converted categoricals
and "categorical" in preprocessing_needed # the feature is categorical
):
if PreprocessingCategorical.CONVERT_ONE_HOT in preprocessing_needed:
preprocessing_to_apply += [PreprocessingCategorical.CONVERT_ONE_HOT]
else:
preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
convert_to_integer_will_be_applied = True # maybe scale needed

if "datetime_transform" in preprocessing_needed:
preprocessing_to_apply += ["datetime_transform"]

if "scale" in required_preprocessing:
if convert_to_integer_will_be_applied:
preprocessing_to_apply += [Scale.SCALE_NORMAL]
# elif PreprocessingUtils.is_log_scale_needed(X[col]):
# preprocessing_to_apply += [Scale.SCALE_LOG_AND_NORMAL]
elif "scale" in preprocessing_needed:
if convert_to_integer_will_be_applied or "scale" in preprocessing_needed:
preprocessing_to_apply += [Scale.SCALE_NORMAL]

# remember which preprocessing we need to apply
Expand Down
29 changes: 29 additions & 0 deletions tests/tests_preprocessing/test_datetime_transformer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import unittest
import tempfile
import json
import numpy as np
import pandas as pd

from supervised.preprocessing.datetime_transformer import DateTimeTransformer

class DateTimeTransformerTest(unittest.TestCase):
    def test_transformer(self):
        """Fit/transform a datetime column, then repeat through a JSON round trip."""
        d = {"col1": ["2020/06/01", "2020/06/02", "2020/06/03", "2021/06/01", "2022/06/01"]}
        df = pd.DataFrame(data=d)
        df["col1"] = pd.to_datetime(df["col1"])
        # transform() works in place, so keep an untouched copy for the second run
        df_org = df.copy()

        transf = DateTimeTransformer()
        transf.fit(df, "col1")
        df = transf.transform(df)

        # assertEqual / assertIn report the offending values on failure,
        # unlike assertTrue on a pre-evaluated boolean.
        self.assertEqual(df.shape[0], 5)
        self.assertNotIn("col1", df.columns)
        self.assertIn("col1_Year", df.columns)

        # The saved state must survive real JSON serialization.
        saved = json.loads(json.dumps(transf.to_json()))
        transf2 = DateTimeTransformer()
        transf2.from_json(saved)
        df2 = transf2.transform(df_org)
        self.assertNotIn("col1", df2.columns)
        self.assertIn("col1_Year", df2.columns)
        # The restored transformer must create the same columns as the original.
        self.assertEqual(list(df2.columns), list(df.columns))
22 changes: 22 additions & 0 deletions tests/tests_preprocessing/test_encoding_selector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
import unittest
import tempfile
import numpy as np
import pandas as pd
from supervised.preprocessing.encoding_selector import EncodingSelector
from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical


class CategoricalIntegersTest(unittest.TestCase):
    def test_selector(self):
        """Check the encoding EncodingSelector.get picks for two small string columns."""
        frame = pd.DataFrame({"col1": ["a", "a", "c"], "col2": ["a", "b", "c"]})

        col1_encoding = EncodingSelector.get(frame, None, "col1")
        self.assertEqual(col1_encoding, PreprocessingCategorical.CONVERT_INTEGER)

        col2_encoding = EncodingSelector.get(frame, None, "col2")
        self.assertEqual(col2_encoding, PreprocessingCategorical.CONVERT_ONE_HOT)

0 comments on commit c6b925e

Please sign in to comment.