Merge pull request #929 from mindsdb/staging
Release 22.7.2.0
paxcema authored Jul 11, 2022
2 parents eee7234 + 75fa8cf commit 8fa4521
Showing 33 changed files with 1,304 additions and 560 deletions.
2 changes: 1 addition & 1 deletion lightwood/__about__.py
@@ -1,6 +1,6 @@
__title__ = 'lightwood'
__package_name__ = 'lightwood'
__version__ = '22.6.1.2'
__version__ = '22.7.2.0'
__description__ = "Lightwood is a toolkit for automatic machine learning model building"
__email__ = "[email protected]"
__author__ = 'MindsDB Inc'
1 change: 1 addition & 0 deletions lightwood/analysis/explain.py
@@ -37,6 +37,7 @@ def explain(data: pd.DataFrame,
# Setup base insights
# ------------------------- #
data = data.reset_index(drop=True)
predictions = predictions.reset_index(drop=True)

row_insights = pd.DataFrame()
global_insights = {}
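The added reset_index call matters because pandas aligns on index labels, not positions, when assigning across frames: if predictions comes back with a fresh 0..n-1 index while data kept a filtered index, later assignments can silently fill NaNs. A minimal sketch of the failure mode and the fix, using only toy frames (not lightwood's own objects):

import pandas as pd

# data kept a filtered index; predictions came back with a fresh 0..n-1 index
data = pd.DataFrame({'x': [10, 20, 30]}, index=[4, 7, 9])
predictions = pd.DataFrame({'prediction': [1.0, 2.0, 3.0]})

row_insights = pd.DataFrame(index=data.index)
row_insights['prediction'] = predictions['prediction']  # label alignment -> all NaN

# resetting both indices makes rows pair up positionally, as intended
data = data.reset_index(drop=True)
predictions = predictions.reset_index(drop=True)
row_insights = pd.DataFrame(index=data.index)
row_insights['prediction'] = predictions['prediction']  # [1.0, 2.0, 3.0]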
7 changes: 4 additions & 3 deletions lightwood/analysis/helpers/conf_stats.py
@@ -1,3 +1,4 @@
from copy import deepcopy
from typing import Dict
from types import SimpleNamespace

@@ -55,10 +56,10 @@ def _get_stats(self, confs, preds, data, target, task_type='categorical'):
mce: maximum value in `bins`.
global_score: 1.0 minus absolute difference between accuracy and confidence over the entire validation set.
"""

confs = deepcopy(confs).reset_index(drop=True)
sorted_preds = deepcopy(preds).reset_index(drop=True)
sorted_inp = deepcopy(data).reset_index(drop=True)
sorted_val = confs.sort_values(by='confidence', kind='stable')
sorted_preds = preds.reindex(sorted_val.index)
sorted_inp = data.reindex(sorted_val.index)
sorted_inp['__mdb_confidence'] = sorted_val['confidence']

if task_type == 'categorical':
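This hunk swaps between two ways of lining up confidences, predictions, and inputs before binning. The sort-based variant orders validation rows by confidence with a stable sort (kind='stable', so ties keep their original order and the calibration bins stay deterministic) and carries the other frames along via reindex. A standalone sketch of that pattern, with toy data:

import pandas as pd

confs = pd.DataFrame({'confidence': [0.9, 0.2, 0.5]})
preds = pd.Series(['a', 'b', 'c'])
data = pd.DataFrame({'feat': [1, 2, 3]})

# stable sort by confidence, then reorder preds/data with the sorted index
sorted_val = confs.sort_values(by='confidence', kind='stable')
sorted_preds = preds.reindex(sorted_val.index)  # ['b', 'c', 'a']
sorted_inp = data.reindex(sorted_val.index)
sorted_inp['__mdb_confidence'] = sorted_val['confidence']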
33 changes: 25 additions & 8 deletions lightwood/analysis/nc/calibrate.py
@@ -90,8 +90,11 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
icp = icp_class(nc, cal_size=self.validation_size)

output['icp']['__default'] = icp
icp_df = deepcopy(ns.data)

# setup prediction cache to avoid additional .predict() calls
pred_is_list = isinstance(ns.normal_predictions['prediction'], list) and \
isinstance(ns.normal_predictions['prediction'][0], list)
if ns.is_classification:
if ns.predictor.supports_proba:
icp.nc_function.model.prediction_cache = ns.normal_predictions[all_cat_cols].values
@@ -105,7 +108,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
predicted_classes = pd.get_dummies(preds).values # inflate to one-hot enc
icp.nc_function.model.prediction_cache = predicted_classes

elif ns.is_multi_ts:
elif ns.is_multi_ts or pred_is_list:
# we fit ICPs for time series confidence bounds only at t+1 forecast
icp.nc_function.model.prediction_cache = np.array([p[0] for p in ns.normal_predictions['prediction']])
else:
@@ -116,6 +119,9 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:

# fit additional ICPs in time series tasks with grouped columns
if ns.tss.is_timeseries and ns.tss.group_by:
# generate a multiindex
midx = pd.MultiIndex.from_frame(icp_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by[0]}']])
icp_df.index = midx

# create an ICP for each possible group
group_info = ns.data[ns.tss.group_by].to_dict('list')
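The MultiIndex built from the group-by columns plus the original order-by column lets the calibrator address each series' rows by label; the same index is applied again further down when per-group predictions are attached. A hedged sketch of pd.MultiIndex.from_frame, with invented column names:

import pandas as pd

df = pd.DataFrame({
    'store': ['a', 'a', 'b', 'b'],           # hypothetical group-by column
    '__mdb_original_order': [1, 2, 1, 2],    # hypothetical order-by column
    'sales': [10, 12, 7, 9],
})

# index rows by (group, original time order), mirroring midx above
df.index = pd.MultiIndex.from_frame(df[['store', '__mdb_original_order']])

group_a = df.loc['a']  # all rows belonging to group 'a'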
@@ -127,7 +133,6 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
output['icp'][tuple(combination)] = deepcopy(icp)

# calibrate ICP
icp_df = deepcopy(ns.data)
icp_df, y = clean_df(icp_df, ns.target, ns.is_classification, output.get('label_encoders', None))
output['icp']['__default'].index = icp_df.columns
output['icp']['__default'].calibrate(icp_df.values, y)
@@ -137,11 +142,11 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
icp_df, icp, ns.dtype_dict[ns.target],
output, positive_domain=self.positive_domain, significance=self.fixed_significance)
if not ns.is_classification:
result_df = pd.DataFrame(index=ns.data.index, columns=['confidence', 'lower', 'upper'], dtype=float)
result_df = pd.DataFrame(index=icp_df.index, columns=['confidence', 'lower', 'upper'], dtype=float)
result_df.loc[icp_df.index, 'lower'] = ranges[:, 0]
result_df.loc[icp_df.index, 'upper'] = ranges[:, 1]
else:
result_df = pd.DataFrame(index=ns.data.index, columns=['confidence'], dtype=float)
result_df = pd.DataFrame(index=icp_df.index, columns=['confidence'], dtype=float)

result_df.loc[icp_df.index, 'confidence'] = conf

@@ -152,10 +157,12 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:

# add all predictions to DF
icps_df = deepcopy(ns.data)
if ns.is_multi_ts:
icps_df[f'__predicted_{ns.target}'] = [p[0] for p in ns.normal_predictions['prediction']]
midx = pd.MultiIndex.from_frame(icps_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by[0]}']])
icps_df.index = midx
if ns.is_multi_ts or pred_is_list:
icps_df[f'__predicted_{ns.target}'] = np.array([p[0] for p in ns.normal_predictions['prediction']])
else:
icps_df[f'__predicted_{ns.target}'] = ns.normal_predictions['prediction']
icps_df[f'__predicted_{ns.target}'] = np.array(ns.normal_predictions['prediction'])

for group in icps['__mdb_groups']:
icp_df = icps_df
@@ -207,6 +214,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
# consolidate all groups here
output['icp']['__mdb_active'] = True

result_df.index = ns.data.index
output['result_df'] = result_df

info = {**info, **output}
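Note the bookkeeping around result_df: it is assembled on icp_df's index (which may be the grouped MultiIndex), and only at the very end is ns.data's original index swapped back in. Since the rows are never reordered, the positional relabeling is safe. In isolation, with assumed toy shapes:

import pandas as pd

data = pd.DataFrame({'store': ['a', 'a', 'b'], 't': [1, 2, 1], 'y': [1.0, 2.0, 3.0]})

# working frame indexed by (group, time) while confidences are computed
icp_df = data.set_index(['store', 't'])

result_df = pd.DataFrame(index=icp_df.index, columns=['confidence'], dtype=float)
result_df.loc[icp_df.index, 'confidence'] = [0.9, 0.8, 0.7]

# rows were never reordered, so relabeling restores the caller-facing index
result_df.index = data.index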
@@ -216,12 +224,21 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object],
**kwargs) -> Tuple[pd.DataFrame, Dict[str, object]]:
ns = SimpleNamespace(**kwargs)

if 'confidence' in ns.predictions.columns:
# bypass calibrator if model already outputs confidence
row_insights['prediction'] = ns.predictions['prediction']
row_insights['confidence'] = ns.predictions['confidence']
if 'upper' in ns.predictions.columns and 'lower' in ns.predictions.columns:
row_insights['upper'] = ns.predictions['upper']
row_insights['lower'] = ns.predictions['lower']
return row_insights, global_insights

if ns.analysis['icp']['__mdb_active']:
icp_X = deepcopy(ns.data)

# replace observed data w/predictions
preds = ns.predictions['prediction']
if ns.tss.is_timeseries and ns.tss.horizon > 1:
if ns.tss.is_timeseries and (ns.tss.horizon > 1 or isinstance(preds[0], list)):
preds = [p[0] for p in preds]

for col in [f'timestep_{i}' for i in range(1, ns.tss.horizon)]:
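The new guard at the top of explain() short-circuits the whole calibrator when the mixer already emits calibrated outputs: a confidence column is passed through as-is, along with upper/lower bounds when both are present. Extracted into a standalone helper (a sketch with hypothetical frames; the real code operates on ns.predictions in place):

import pandas as pd

def passthrough_confidence(row_insights: pd.DataFrame,
                           predictions: pd.DataFrame) -> pd.DataFrame:
    # copy model-native confidence (and bounds, if both exist) straight through
    if 'confidence' in predictions.columns:
        row_insights['prediction'] = predictions['prediction']
        row_insights['confidence'] = predictions['confidence']
        if {'upper', 'lower'} <= set(predictions.columns):
            row_insights['upper'] = predictions['upper']
            row_insights['lower'] = predictions['lower']
    return row_insights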
100 changes: 67 additions & 33 deletions lightwood/api/json_ai.py
@@ -220,27 +220,46 @@ def generate_json_ai(
):
is_target_predicting_encoder = True

submodels = []
if is_target_predicting_encoder:
submodels = [
{
"module": "Unit",
"args": {
"target_encoder": "$encoders[self.target]",
"stop_after": "$problem_definition.seconds_per_mixer",
},
}
]
submodels.extend(
[
{
"module": "Unit",
"args": {
"target_encoder": "$encoders[self.target]",
"stop_after": "$problem_definition.seconds_per_mixer",
},
}
]
)
else:
submodels = [
{
"module": "Neural",
"args": {
"fit_on_dev": True,
"stop_after": "$problem_definition.seconds_per_mixer",
"search_hyperparameters": True,
},
}
]
if not tss.is_timeseries:
submodels.extend(
[
{
"module": "Neural",
"args": {
"fit_on_dev": True,
"stop_after": "$problem_definition.seconds_per_mixer",
"search_hyperparameters": True,
},
}
]
)
else:
submodels.extend(
[
{
"module": "NeuralTs",
"args": {
"fit_on_dev": True,
"stop_after": "$problem_definition.seconds_per_mixer",
"search_hyperparameters": True,
},
}
]
)
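Condensed, the branch above selects the base mixer by problem shape: targets predicted directly by their encoder get a Unit mixer, plain tabular targets a Neural mixer, and time series a NeuralTs mixer. A compact restatement (names only; the JSON args are omitted):

def pick_base_mixer(is_target_predicting_encoder: bool, is_timeseries: bool) -> str:
    # mirrors the submodel selection above
    if is_target_predicting_encoder:
        return 'Unit'
    return 'NeuralTs' if is_timeseries else 'Neural'

assert pick_base_mixer(False, True) == 'NeuralTs'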

if (not tss.is_timeseries or tss.horizon == 1) and dtype_dict[target] not in (dtype.num_array, dtype.cat_array):
submodels.extend(
@@ -268,7 +287,8 @@ def generate_json_ai(
"args": {
"fit_on_dev": True,
"stop_after": "$problem_definition.seconds_per_mixer",
"horizon": "$problem_definition.timeseries_settings.horizon",
"ts_analysis": "$ts_analysis",
"tss": "$problem_definition.timeseries_settings",
},
}
]
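Each submodel entry is plain JSON-AI: a module name plus an args dict whose '$'-prefixed strings are resolved against predictor attributes at code-generation time; in this hunk the single horizon argument gives way to the full timeseries settings (tss) and the shared $ts_analysis handle. An illustrative entry under those assumptions (module name hypothetical; remaining defaults are back-filled later by _add_implicit_values):

submodel = {
    'module': 'SomeTsMixer',  # hypothetical mixer name
    'args': {
        'stop_after': '$problem_definition.seconds_per_mixer',
        'ts_analysis': '$ts_analysis',
        'tss': '$problem_definition.timeseries_settings',
    },
}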
@@ -494,29 +514,30 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
is_ts = tss.is_timeseries

# Add implicit arguments
# @TODO: Consider removing once we have a proper editor in studio
mixers = json_ai.model['args']['submodels']
for i in range(len(mixers)):
if mixers[i]["module"] == "Unit":
pass

elif mixers[i]["module"] == "Neural":
elif mixers[i]["module"] in ("Neural", "NeuralTs"):
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
"target_encoder", "$encoders[self.target]"
)
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
"dtype_dict", "$dtype_dict"
)
mixers[i]["args"]["timeseries_settings"] = mixers[i]["args"].get(
"timeseries_settings", "$problem_definition.timeseries_settings"
)
mixers[i]["args"]["net"] = mixers[i]["args"].get(
"net",
'"DefaultNet"'
if not tss.is_timeseries or not tss.use_previous_target
else '"ArNet"',
)
if mixers[i]["module"] == "NeuralTs":
mixers[i]["args"]["timeseries_settings"] = mixers[i]["args"].get(
"timeseries_settings", "$problem_definition.timeseries_settings"
)
mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get("ts_analysis", "$ts_analysis")

elif mixers[i]["module"] == "LightGBM":
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
@@ -551,8 +572,17 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
"target_encoder", "$encoders[self.target]"
)
if "horizon" not in mixers[i]["args"]:
mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
mixers[i]["args"]["tss"] = mixers[i]["args"].get("tss", "$problem_definition.timeseries_settings")
mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get("ts_analysis", "$ts_analysis")
mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", "True")

elif mixers[i]["module"] == "NHitsMixer":
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
"ts_analysis", "$ts_analysis"
)
problem_definition.fit_on_all = False # takes too long otherwise

elif mixers[i]["module"] in ("SkTime", "ProphetMixer"):
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
@@ -666,6 +696,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
"dtype_dict": "$dtype_dict",
"target": "$target",
"mode": "$mode",
"ts_analysis": "$ts_analysis"
},
},
"timeseries_analyzer": {
@@ -807,11 +838,6 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
# Time-series blocks
{ts_transform_code}
"""
if ts_analyze_code is not None:
clean_body += f"""
if self.mode != 'predict':
{align(ts_analyze_code,1)}
"""

clean_body += '\nreturn data'
@@ -835,12 +861,19 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
# Prepare features Body
# ----------------- #

prepare_body = f"""
prepare_body = """
self.mode = 'train'
if self.statistical_analysis is None:
raise Exception("Please run analyze_data first")
"""
if ts_analyze_code is not None:
prepare_body += f"""
if self.mode != 'predict':
{align(ts_analyze_code, 1)}
"""

prepare_body += f"""
# Column to encoder mapping
self.encoders = {inline_dict(encoder_dict)}
@@ -1133,6 +1166,7 @@ def __init__(self):
# Initial stats analysis
self.statistical_analysis = None
self.ts_analysis = None
self.runtime_log = dict()
@timed
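Throughout _add_implicit_values the back-filling idiom is the same: args.get(key, default), so user-supplied values always win and only missing keys receive '$'-references. A small sketch of the idiom (the defaults shown mirror the NeuralTs ones added in this diff):

def fill_defaults(args: dict, defaults: dict) -> dict:
    # back-fill missing mixer args without overriding user choices
    for key, value in defaults.items():
        args[key] = args.get(key, value)
    return args

args = fill_defaults(
    {'stop_after': 100},  # user-specified, kept as-is
    {
        'target': '$target',
        'dtype_dict': '$dtype_dict',
        'timeseries_settings': '$problem_definition.timeseries_settings',
        'ts_analysis': '$ts_analysis',
    },
)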
2 changes: 2 additions & 0 deletions lightwood/api/types.py
@@ -73,6 +73,7 @@ class StatisticalAnalysis:
:param bias:
:param avg_words_per_sentence:
:param positive_domain:
:param ts_stats:
""" # noqa

nr_rows: int
@@ -87,6 +88,7 @@ class StatisticalAnalysis:
bias: object
avg_words_per_sentence: object
positive_domain: bool
ts_stats: dict


@dataclass_json
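StatisticalAnalysis is a dataclass, so the new ts_stats field only needs a value at construction time; downstream blocks can then read per-series statistics from it. A toy stand-in (not the real class, and the dict keys are hypothetical):

from dataclasses import dataclass, field

@dataclass
class MiniStats:
    # stand-in showing the new ts_stats slot
    nr_rows: int
    positive_domain: bool
    ts_stats: dict = field(default_factory=dict)

stats = MiniStats(nr_rows=1000, positive_domain=False,
                  ts_stats={'periods': [7]})  # hypothetical contents
print(stats.ts_stats.get('periods'))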
4 changes: 3 additions & 1 deletion lightwood/data/cleaner.py
@@ -147,13 +147,15 @@ def _standardize_datetime(element: object) -> Optional[float]:
"""
Parses an expected date-time element. Intakes an element that can in theory be anything.
"""
if element is None or pd.isna(element):
return 0.0 # correct? TODO: Remove if the TS encoder can handle `None`
try:
date = parse_dt(str(element))
except Exception:
try:
date = datetime.datetime.utcfromtimestamp(element)
except Exception:
return None
return 0.0

return date.timestamp()

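Two behavior changes land in _standardize_datetime: missing values (None or NaN) short-circuit to 0.0 before any parsing is attempted, and an unparseable element now also maps to 0.0 instead of None, so time-series encoders always receive a float. A self-contained rendering of the resulting logic:

import datetime
from typing import Optional

import pandas as pd
from dateutil.parser import parse as parse_dt

def standardize_datetime(element: object) -> Optional[float]:
    # coerce anything date-like to a POSIX timestamp; fall back to 0.0
    if element is None or pd.isna(element):
        return 0.0
    try:
        date = parse_dt(str(element))
    except Exception:
        try:
            date = datetime.datetime.utcfromtimestamp(element)
        except Exception:
            return 0.0
    return date.timestamp()

assert standardize_datetime(None) == 0.0
assert standardize_datetime('not a date') == 0.0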
13 changes: 2 additions & 11 deletions lightwood/data/encoded_ds.py
@@ -140,13 +140,15 @@ class ConcatedEncodedDs(EncodedDs):
"""
`ConcatedEncodedDs` abstracts over multiple encoded datasources (`EncodedDs`) as if they were a single entity.
""" # noqa
# TODO: We should probably delete this abstraction, it's not really useful and it adds complexity/overhead
def __init__(self, encoded_ds_arr: List[EncodedDs]) -> None:
# @TODO: missing super() call here?
self.encoded_ds_arr = encoded_ds_arr
self.encoded_ds_lenghts = [len(x) for x in self.encoded_ds_arr]
self.encoders = self.encoded_ds_arr[0].encoders
self.encoder_spans = self.encoded_ds_arr[0].encoder_spans
self.target = self.encoded_ds_arr[0].target
self.data_frame = pd.concat([x.data_frame for x in self.encoded_ds_arr])

def __len__(self):
"""
@@ -166,17 +168,6 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
idx -= length
raise StopIteration()

@property
def data_frame(self) -> pd.DataFrame:
"""
Property that concatenates all underlying `EncodedDs`'s dataframes and returns them.
Note: be careful to not modify a `ConcatedEncodedDs`, as you can see in the source, it will not have an effect.
:return: Dataframe with all original data.
""" # noqa
return pd.concat([x.data_frame for x in self.encoded_ds_arr])

def get_column_original_data(self, column_name: str) -> pd.Series:
"""
See `lightwood.data.encoded_ds.EncodedDs.get_column_original_data()`.
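The data_frame property, which re-ran pd.concat on every access, becomes a plain attribute computed once in __init__. Repeated reads get cheaper, and the old docstring's caveat still holds: mutating the concatenated copy never writes back to the underlying EncodedDs frames. A sketch contrasting the two shapes (toy classes, not lightwood's):

import pandas as pd

class EagerConcat:
    # concatenate once at construction (the new behavior)
    def __init__(self, frames):
        self.frames = frames
        self.data_frame = pd.concat(frames)

class LazyConcat:
    # re-concatenate on every access (the old property-based behavior)
    def __init__(self, frames):
        self.frames = frames

    @property
    def data_frame(self) -> pd.DataFrame:
        return pd.concat(self.frames)

frames = [pd.DataFrame({'a': [1]}), pd.DataFrame({'a': [2]})]
eager = EagerConcat(frames)
eager.data_frame['a'] = 99          # edits the cached copy only
assert frames[0].loc[0, 'a'] == 1   # source frames are untouched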