add support for datetime features (#122)

mljar · Jul 14, 2020 · c6b925e · c6b925e
1 parent c44df82
commit c6b925e
Show file tree

Hide file tree

Showing 8 changed files with 235 additions and 8 deletions.
diff --git a/supervised/preprocessing/datetime_transformer.py b/supervised/preprocessing/datetime_transformer.py
@@ -0,0 +1,107 @@
+import numpy as np
+import pandas as pd
+import datetime
+import json
+
+class DateTimeTransformer(object):
+    """Expand one datetime column into numeric calendar features.
+
+    ``fit`` inspects a single datetime column and remembers which derived
+    features take more than one distinct value in it. ``transform`` adds
+    those features as new columns and drops the original datetime column.
+    The fitted state can be saved with ``to_json`` and restored with
+    ``from_json``.
+    """
+
+    # (transform name, suffix of the generated column), in output order.
+    # Every name except "days_diff" is also the name of the ``Series.dt``
+    # accessor attribute that computes the feature.
+    _SUFFIXES = (
+        ("year", "_Year"),
+        ("month", "_Month"),
+        ("day", "_Day"),
+        ("weekday", "_WeekDay"),
+        ("dayofyear", "_DayOfYear"),
+        ("hour", "_Hour"),
+        ("days_diff", "_Days_Diff_To_Min"),
+    )
+
+    def __init__(self):
+        self._new_columns = []
+        self._old_column = None
+        self._min_datetime = None
+        self._transforms = []
+
+    def _feature(self, X, name):
+        """Return the values of feature ``name`` for the fitted column of ``X``."""
+        values = X[self._old_column]
+        if name == "days_diff":
+            # whole days elapsed since the earliest datetime seen in fit
+            return (values - self._min_datetime).dt.days
+        return getattr(values.dt, name)
+
+    def fit(self, X, column):
+        """Select the features of ``X[column]`` that have more than one value.
+
+        Args:
+            X: data frame holding the datetime column.
+            column: name of the datetime column to analyse.
+        """
+        self._old_column = column
+        self._min_datetime = np.min(X[column])
+        # Start from a clean state so that fitting again does not register
+        # the same transforms and columns twice.
+        self._transforms = []
+        self._new_columns = []
+
+        for name, suffix in self._SUFFIXES:
+            # a feature with a single distinct value is not kept
+            if len(np.unique(self._feature(X, name))) > 1:
+                self._transforms.append(name)
+                self._new_columns.append(column + suffix)
+
+    def transform(self, X):
+        """Add the selected feature columns to ``X`` and drop the source column.
+
+        ``X`` is modified in place and also returned.
+        """
+        column = self._old_column
+
+        for name, suffix in self._SUFFIXES:
+            if name in self._transforms:
+                X[column + suffix] = self._feature(X, name)
+
+        X.drop(column, axis=1, inplace=True)
+        return X
+
+    def to_json(self):
+        """Return the fitted state as a dict of JSON-serializable values."""
+        min_datetime = self._min_datetime
+        data_json = {
+            "new_columns": list(self._new_columns),
+            "old_column": self._old_column,
+            # keep a missing minimum as null instead of the string "None"
+            "min_datetime": None if min_datetime is None else str(min_datetime),
+            "transforms": list(self._transforms),
+        }
+        return data_json
+
+    def from_json(self, data_json):
+        """Restore the state produced by ``to_json``."""
+        # Copy the lists so this object never shares them with ``data_json``.
+        self._new_columns = list(data_json.get("new_columns") or [])
+        self._old_column = data_json.get("old_column", None)
+        d = data_json.get("min_datetime", None)
+        # "None" is what str(None) yields; accept it as a missing minimum.
+        self._min_datetime = None if d is None or d == "None" else pd.to_datetime(d)
+        self._transforms = list(data_json.get("transforms") or [])
diff --git a/supervised/preprocessing/preprocessing.py b/supervised/preprocessing/preprocessing.py
@@ -9,6 +9,7 @@
 from supervised.preprocessing.scale import Scale
 from supervised.preprocessing.label_encoder import LabelEncoder
 from supervised.preprocessing.label_binarizer import LabelBinarizer
+from supervised.preprocessing.datetime_transformer import DateTimeTransformer
 from supervised.preprocessing.exclude_missing_target import ExcludeRowsMissingTarget
 from supervised.algorithms.registry import (
     BINARY_CLASSIFICATION,
@@ -42,6 +43,7 @@ def __init__(
         self._categorical = []
         self._scale = []
         self._remove_columns = []
+        self._datetime_transforms = []
 
     def _exclude_missing_targets(self, X=None, y=None):
         # check if there are missing values in target column
@@ -148,6 +150,23 @@ def fit_and_transform(self, X_train, y_train):
             X_train = convert.transform(X_train)
             self._categorical += [convert]
 
+        # datetime transform
+        cols_to_process = list(
+                filter(
+                    lambda k: "datetime_transform" in columns_preprocessing[k],
+                    columns_preprocessing,
+                )
+            )
+
+        new_datetime_columns = []
+        for col in cols_to_process:
+
+            t = DateTimeTransformer()
+            t.fit(X_train, col)
+            X_train = t.transform(X_train)
+            self._datetime_transforms += [t]
+            new_datetime_columns += t._new_columns
+
         # SCALE
         for scale_method in [Scale.SCALE_NORMAL, Scale.SCALE_LOG_AND_NORMAL]:
             cols_to_process = list(
@@ -156,6 +175,9 @@ def fit_and_transform(self, X_train, y_train):
                     columns_preprocessing,
                 )
             )
+            if len(cols_to_process) and len(new_datetime_columns) and scale_method == Scale.SCALE_NORMAL:
+                cols_to_process += new_datetime_columns
+
             if len(cols_to_process):
                 scale = Scale(cols_to_process)
                 scale.fit(X_train)
@@ -244,6 +266,11 @@ def transform(self, X_validation, y_validation):
         for convert in self._categorical:
             if X_validation is not None and convert is not None:
                 X_validation = convert.transform(X_validation)
+
+        for dtt in self._datetime_transforms:
+            if X_validation is not None and dtt is not None:
+                X_validation = dtt.transform(X_validation)
+
         for scale in self._scale:
             if X_validation is not None and scale is not None:
                 X_validation = scale.transform(X_validation)
@@ -372,6 +399,14 @@ def to_json(self):
                     cats += [cat.to_json()]
             if cats:
                 preprocessing_params["categorical"] = cats
+
+        if self._datetime_transforms is not None and len(self._datetime_transforms):
+            dtts = []
+            for dtt in self._datetime_transforms:
+                dtts += [dtt.to_json()]
+            if dtts:
+                preprocessing_params["datetime_transforms"] = dtts
+
         if self._scale is not None and len(self._scale):
             scs = [sc.to_json() for sc in self._scale if sc.to_json()]
             if scs:
@@ -403,6 +438,14 @@ def from_json(self, data_json):
                 cat = PreprocessingCategorical()
                 cat.from_json(cat_data)
                 self._categorical += [cat]
+
+        if "datetime_transforms" in data_json:
+            self._datetime_transforms = []
+            for dtt_params in data_json["datetime_transforms"]:
+                dtt = DateTimeTransformer()
+                dtt.from_json(dtt_params)
+                self._datetime_transforms += [dtt]
+
         if "scale" in data_json:
             self._scale = []
             for scale_data in data_json["scale"]:

diff --git a/supervised/preprocessing/preprocessing_missing.py b/supervised/preprocessing/preprocessing_missing.py
@@ -11,17 +11,21 @@ class PreprocessingMissingValues(object):
     FILL_NA_MIN = "na_fill_min_1"
     FILL_NA_MEAN = "na_fill_mean"
     FILL_NA_MEDIAN = "na_fill_median"
+    FILL_DATETIME = "na_fill_datetime"
 
     NA_EXCLUDE = "na_exclude"
     MISSING_VALUE = "_missing_value_"
     REMOVE_COLUMN = "remove_column"
 
+
+
     def __init__(self, columns=[], na_fill_method=FILL_NA_MEDIAN):
         self._columns = columns
         # fill method
         self._na_fill_method = na_fill_method
         # fill parameters stored as a dict, feature -> fill value
         self._na_fill_params = {}
+        self._datetime_columns = []
 
     def fit(self, X):
         X = self._fit_na_fill(X)
@@ -31,6 +35,8 @@ def _fit_na_fill(self, X):
             if np.sum(pd.isnull(X[column]) == True) == 0:
                 continue
             self._na_fill_params[column] = self._get_fill_value(X[column])
+            if PreprocessingUtils.get_type(X[column]) == PreprocessingUtils.DATETIME:
+                self._datetime_columns += [column]
 
     def _get_fill_value(self, x):
         # categorical type
@@ -40,6 +46,10 @@ def _get_fill_value(self, x):
                     PreprocessingMissingValues.MISSING_VALUE
                 )  # add new categorical value
             return PreprocessingUtils.get_most_frequent(x)
+
+        if PreprocessingUtils.get_type(x) == PreprocessingUtils.DATETIME:
+            return PreprocessingUtils.get_most_frequent(x)
+
         # numerical type
         if self._na_fill_method == PreprocessingMissingValues.FILL_NA_MIN:
             return PreprocessingUtils.get_min(x) - 1.0
@@ -71,12 +81,19 @@ def to_json(self):
         params = {
             "fill_method": self._na_fill_method,
             "fill_params": self._na_fill_params,
+            "datetime_columns": list(self._datetime_columns)
         }
+        if self._datetime_columns:
+            # "fill_params" above is the very dict stored on the instance.
+            # Stringify the datetime fill values on a copy, otherwise the
+            # fitted values themselves would be overwritten with strings.
+            params["fill_params"] = dict(self._na_fill_params)
+            for col in self._datetime_columns:
+                params["fill_params"][col] = str(params["fill_params"][col])
         return params
 
     def from_json(self, params):
         if params is not None:
             self._na_fill_method = params.get("fill_method", None)
             self._na_fill_params = params.get("fill_params", {})
+            self._datetime_columns = params.get("datetime_columns", [])
+            for col in self._datetime_columns:
+                self._na_fill_params[col] = pd.to_datetime(self._na_fill_params[col])
         else:
             self._na_fill_method, self._na_fill_params = None, None
+            self._datetime_columns = []
diff --git a/supervised/preprocessing/preprocessing_utils.py b/supervised/preprocessing/preprocessing_utils.py
@@ -12,6 +12,7 @@ class PreprocessingUtils(object):
     CATEGORICAL = "categorical"
     CONTINOUS = "continous"
     DISCRETE = "discrete"
+    DATETIME = "datetime"
 
     @staticmethod
     def get_type(x):
@@ -26,13 +27,21 @@ def get_type(x):
             data_type = PreprocessingUtils.CONTINOUS
         elif col_type.startswith("int"):
             data_type = PreprocessingUtils.DISCRETE
+        elif col_type.startswith("datetime"):
+            data_type = PreprocessingUtils.DATETIME
+
         return data_type
 
     @staticmethod
     def is_categorical(x_org):
         x = x_org[~pd.isnull(x_org)]
         return PreprocessingUtils.get_type(x) == PreprocessingUtils.CATEGORICAL
 
+    @staticmethod
+    def is_datetime(x_org):
+        """Tell whether the non-missing values of ``x_org`` are typed as datetime."""
+        present = x_org[pd.notnull(x_org)]
+        detected = PreprocessingUtils.get_type(present)
+        return detected == PreprocessingUtils.DATETIME
+
     @staticmethod
     def is_0_1(x_org):
         x = x_org[~pd.isnull(x_org)]

diff --git a/supervised/tuner/data_info.py b/supervised/tuner/data_info.py
@@ -37,6 +37,8 @@ def compute(X, y, machinelearning_task):
             if PreprocessingUtils.is_categorical(X[col]):
                 columns_info[col] += ["categorical"]
                 columns_info[col] += [EncodingSelector.get(X, y, col)]
+            elif PreprocessingUtils.is_datetime(X[col]):
+                columns_info[col] += ["datetime_transform"]
             else:
                 # numeric type, check if scale needed
                 if PreprocessingUtils.is_scale_needed(X[col]):

diff --git a/supervised/tuner/preprocessing_tuner.py b/supervised/tuner/preprocessing_tuner.py
@@ -45,22 +45,20 @@ def get(required_preprocessing, data_info, machinelearning_task):
             # convert to categorical only for categorical types
             convert_to_integer_will_be_applied = False
             if (
-                "convert_categorical"
-                in required_preprocessing  # the algorithm needs converted categoricals
-                and "categorical" in preprocessing_needed  # the feature is categorical
+                "convert_categorical" in required_preprocessing  # the algorithm needs converted categoricals
+                and "categorical" in preprocessing_needed        # the feature is categorical
             ):
                 if PreprocessingCategorical.CONVERT_ONE_HOT in preprocessing_needed:
                     preprocessing_to_apply += [PreprocessingCategorical.CONVERT_ONE_HOT]
                 else:
                     preprocessing_to_apply += [PreprocessingCategorical.CONVERT_INTEGER]
                     convert_to_integer_will_be_applied = True  # maybe scale needed
 
+            if "datetime_transform" in preprocessing_needed:
+                preprocessing_to_apply += ["datetime_transform"]
+
             if "scale" in required_preprocessing:
-                if convert_to_integer_will_be_applied:
-                    preprocessing_to_apply += [Scale.SCALE_NORMAL]
-                # elif PreprocessingUtils.is_log_scale_needed(X[col]):
-                #    preprocessing_to_apply += [Scale.SCALE_LOG_AND_NORMAL]
-                elif "scale" in preprocessing_needed:
+                if convert_to_integer_will_be_applied or "scale" in preprocessing_needed:
                     preprocessing_to_apply += [Scale.SCALE_NORMAL]
 
             # remember which preprocessing we need to apply

diff --git a/tests/tests_preprocessing/test_datetime_transformer.py b/tests/tests_preprocessing/test_datetime_transformer.py
@@ -0,0 +1,29 @@
+import unittest
+import tempfile
+import json
+import numpy as np
+import pandas as pd
+
+from supervised.preprocessing.datetime_transformer import DateTimeTransformer
+
+class DateTimeTransformerTest(unittest.TestCase):
+    def test_transformer(self):
+        """Check a fitted transformer and a copy restored from its JSON.
+
+        Both must drop the datetime column and add the year feature.
+        """
+        dates = ["2020/06/01", "2020/06/02", "2020/06/03", "2021/06/01", "2022/06/01"]
+        frame = pd.DataFrame(data={"col1": dates})
+        frame["col1"] = pd.to_datetime(frame["col1"])
+        # keep an untransformed copy for the restored transformer
+        untouched = frame.copy()
+
+        fitted = DateTimeTransformer()
+        fitted.fit(frame, "col1")
+        frame = fitted.transform(frame)
+
+        self.assertEqual(frame.shape[0], 5)
+        self.assertNotIn("col1", frame.columns)
+        self.assertIn("col1_Year", frame.columns)
+
+        restored = DateTimeTransformer()
+        restored.from_json(fitted.to_json())
+        frame2 = restored.transform(untouched)
+        self.assertNotIn("col1", frame2.columns)
+        self.assertIn("col1_Year", frame2.columns)
diff --git a/tests/tests_preprocessing/test_encoding_selector.py b/tests/tests_preprocessing/test_encoding_selector.py
@@ -0,0 +1,22 @@
+import unittest
+import tempfile
+import numpy as np
+import pandas as pd
+from supervised.preprocessing.encoding_selector import EncodingSelector
+from supervised.preprocessing.preprocessing_categorical import PreprocessingCategorical
+
+
+class CategoricalIntegersTest(unittest.TestCase):
+    def test_selector(self):
+        """Check the encoding chosen for two small string columns.
+
+        ``col1`` repeats a value and is expected to get integer encoding;
+        ``col2`` holds three distinct values and is expected to get one-hot.
+        """
+        frame = pd.DataFrame(data={"col1": ["a", "a", "c"], "col2": ["a", "b", "c"]})
+
+        expected = (
+            ("col1", PreprocessingCategorical.CONVERT_INTEGER),
+            ("col2", PreprocessingCategorical.CONVERT_ONE_HOT),
+        )
+        for column, encoding in expected:
+            self.assertEqual(EncodingSelector.get(frame, None, column), encoding)