Merge pull request #929 from mindsdb/staging
Release 22.7.2.0
paxcema authored Jul 11, 2022
2 parents eee7234 + 75fa8cf commit 8fa4521
Showing 33 changed files with 1,304 additions and 560 deletions.
2 changes: 1 addition & 1 deletion lightwood/__about__.py
@@ -1,6 +1,6 @@
__title__ = 'lightwood'
__package_name__ = 'lightwood'
__version__ = '22.6.1.2'
__version__ = '22.7.2.0'
__description__ = "Lightwood is a toolkit for automatic machine learning model building"
__email__ = "[email protected]"
__author__ = 'MindsDB Inc'
1 change: 1 addition & 0 deletions lightwood/analysis/explain.py
@@ -37,6 +37,7 @@ def explain(data: pd.DataFrame,
# Setup base insights
# ------------------------- #
data = data.reset_index(drop=True)
predictions = predictions.reset_index(drop=True)

row_insights = pd.DataFrame()
global_insights = {}
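The added reset_index call matters because pandas aligns on index labels, not positions, when assigning across frames: if predictions comes back with a fresh 0..n-1 index while data kept a filtered index, later assignments can silently fill NaNs. A minimal sketch of the failure mode and the fix, using only toy frames (not lightwood's own objects):

import pandas as pd

# data kept a filtered index; predictions came back with a fresh 0..n-1 index
data = pd.DataFrame({'x': [10, 20, 30]}, index=[4, 7, 9])
predictions = pd.DataFrame({'prediction': [1.0, 2.0, 3.0]})

row_insights = pd.DataFrame(index=data.index)
row_insights['prediction'] = predictions['prediction']  # label alignment -> all NaN

# resetting both indices makes rows pair up positionally, as intended
data = data.reset_index(drop=True)
predictions = predictions.reset_index(drop=True)
row_insights = pd.DataFrame(index=data.index)
row_insights['prediction'] = predictions['prediction']  # [1.0, 2.0, 3.0]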
7 changes: 4 additions & 3 deletions lightwood/analysis/helpers/conf_stats.py
@@ -1,3 +1,4 @@
from copy import deepcopy
from typing import Dict
from types import SimpleNamespace

@@ -55,10 +56,10 @@ def _get_stats(self, confs, preds, data, target, task_type='categorical'):
mce: maximum value in `bins`.
global_score: 1.0 minus absolute difference between accuracy and confidence over the entire validation set.
"""

confs = deepcopy(confs).reset_index(drop=True)
sorted_preds = deepcopy(preds).reset_index(drop=True)
sorted_inp = deepcopy(data).reset_index(drop=True)
sorted_val = confs.sort_values(by='confidence', kind='stable')
sorted_preds = preds.reindex(sorted_val.index)
sorted_inp = data.reindex(sorted_val.index)
sorted_inp['__mdb_confidence'] = sorted_val['confidence']

if task_type == 'categorical':
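This hunk swaps between two ways of lining up confidences, predictions, and inputs before binning. The sort-based variant orders validation rows by confidence with a stable sort (kind='stable', so ties keep their original order and the calibration bins stay deterministic) and carries the other frames along via reindex. A standalone sketch of that pattern, with toy data:

import pandas as pd

confs = pd.DataFrame({'confidence': [0.9, 0.2, 0.5]})
preds = pd.Series(['a', 'b', 'c'])
data = pd.DataFrame({'feat': [1, 2, 3]})

# stable sort by confidence, then reorder preds/data with the sorted index
sorted_val = confs.sort_values(by='confidence', kind='stable')
sorted_preds = preds.reindex(sorted_val.index)  # ['b', 'c', 'a']
sorted_inp = data.reindex(sorted_val.index)
sorted_inp['__mdb_confidence'] = sorted_val['confidence']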
33 changes: 25 additions & 8 deletions lightwood/analysis/nc/calibrate.py
@@ -90,8 +90,11 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
icp = icp_class(nc, cal_size=self.validation_size)

output['icp']['__default'] = icp
icp_df = deepcopy(ns.data)

# setup prediction cache to avoid additional .predict() calls
pred_is_list = isinstance(ns.normal_predictions['prediction'], list) and \
isinstance(ns.normal_predictions['prediction'][0], list)
if ns.is_classification:
if ns.predictor.supports_proba:
icp.nc_function.model.prediction_cache = ns.normal_predictions[all_cat_cols].values
@@ -105,7 +108,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
predicted_classes = pd.get_dummies(preds).values # inflate to one-hot enc
icp.nc_function.model.prediction_cache = predicted_classes

elif ns.is_multi_ts:
elif ns.is_multi_ts or pred_is_list:
# we fit ICPs for time series confidence bounds only at t+1 forecast
icp.nc_function.model.prediction_cache = np.array([p[0] for p in ns.normal_predictions['prediction']])
else:
@@ -116,6 +119,9 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:

# fit additional ICPs in time series tasks with grouped columns
if ns.tss.is_timeseries and ns.tss.group_by:
# generate a multiindex
midx = pd.MultiIndex.from_frame(icp_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by[0]}']])
icp_df.index = midx

# create an ICP for each possible group
group_info = ns.data[ns.tss.group_by].to_dict('list')
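The MultiIndex built from the group-by columns plus the original order-by column lets the calibrator address each series' rows by label; the same index is applied again further down when per-group predictions are attached. A hedged sketch of pd.MultiIndex.from_frame, with invented column names:

import pandas as pd

df = pd.DataFrame({
    'store': ['a', 'a', 'b', 'b'],           # hypothetical group-by column
    '__mdb_original_order': [1, 2, 1, 2],    # hypothetical order-by column
    'sales': [10, 12, 7, 9],
})

# index rows by (group, original time order), mirroring midx above
df.index = pd.MultiIndex.from_frame(df[['store', '__mdb_original_order']])

group_a = df.loc['a']  # all rows belonging to group 'a'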
@@ -127,7 +133,6 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
output['icp'][tuple(combination)] = deepcopy(icp)

# calibrate ICP
icp_df = deepcopy(ns.data)
icp_df, y = clean_df(icp_df, ns.target, ns.is_classification, output.get('label_encoders', None))
output['icp']['__default'].index = icp_df.columns
output['icp']['__default'].calibrate(icp_df.values, y)
@@ -137,11 +142,11 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
icp_df, icp, ns.dtype_dict[ns.target],
output, positive_domain=self.positive_domain, significance=self.fixed_significance)
if not ns.is_classification:
result_df = pd.DataFrame(index=ns.data.index, columns=['confidence', 'lower', 'upper'], dtype=float)
result_df = pd.DataFrame(index=icp_df.index, columns=['confidence', 'lower', 'upper'], dtype=float)
result_df.loc[icp_df.index, 'lower'] = ranges[:, 0]
result_df.loc[icp_df.index, 'upper'] = ranges[:, 1]
else:
result_df = pd.DataFrame(index=ns.data.index, columns=['confidence'], dtype=float)
result_df = pd.DataFrame(index=icp_df.index, columns=['confidence'], dtype=float)

result_df.loc[icp_df.index, 'confidence'] = conf

@@ -152,10 +157,12 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:

# add all predictions to DF
icps_df = deepcopy(ns.data)
if ns.is_multi_ts:
icps_df[f'__predicted_{ns.target}'] = [p[0] for p in ns.normal_predictions['prediction']]
midx = pd.MultiIndex.from_frame(icps_df[[*ns.tss.group_by, f'__mdb_original_{ns.tss.order_by[0]}']])
icps_df.index = midx
if ns.is_multi_ts or pred_is_list:
icps_df[f'__predicted_{ns.target}'] = np.array([p[0] for p in ns.normal_predictions['prediction']])
else:
icps_df[f'__predicted_{ns.target}'] = ns.normal_predictions['prediction']
icps_df[f'__predicted_{ns.target}'] = np.array(ns.normal_predictions['prediction'])

for group in icps['__mdb_groups']:
icp_df = icps_df
@@ -207,6 +214,7 @@ def analyze(self, info: Dict[str, object], **kwargs) -> Dict[str, object]:
# consolidate all groups here
output['icp']['__mdb_active'] = True

result_df.index = ns.data.index
output['result_df'] = result_df

info = {**info, **output}
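Note the bookkeeping around result_df: it is assembled on icp_df's index (which may be the grouped MultiIndex), and only at the very end is ns.data's original index swapped back in. Since the rows are never reordered, the positional relabeling is safe. In isolation, with assumed toy shapes:

import pandas as pd

data = pd.DataFrame({'store': ['a', 'a', 'b'], 't': [1, 2, 1], 'y': [1.0, 2.0, 3.0]})

# working frame indexed by (group, time) while confidences are computed
icp_df = data.set_index(['store', 't'])

result_df = pd.DataFrame(index=icp_df.index, columns=['confidence'], dtype=float)
result_df.loc[icp_df.index, 'confidence'] = [0.9, 0.8, 0.7]

# rows were never reordered, so relabeling restores the caller-facing index
result_df.index = data.index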
@@ -216,12 +224,21 @@ def explain(self, row_insights: pd.DataFrame, global_insights: Dict[str, object],
**kwargs) -> Tuple[pd.DataFrame, Dict[str, object]]:
ns = SimpleNamespace(**kwargs)

if 'confidence' in ns.predictions.columns:
# bypass calibrator if model already outputs confidence
row_insights['prediction'] = ns.predictions['prediction']
row_insights['confidence'] = ns.predictions['confidence']
if 'upper' in ns.predictions.columns and 'lower' in ns.predictions.columns:
row_insights['upper'] = ns.predictions['upper']
row_insights['lower'] = ns.predictions['lower']
return row_insights, global_insights

if ns.analysis['icp']['__mdb_active']:
icp_X = deepcopy(ns.data)

# replace observed data w/predictions
preds = ns.predictions['prediction']
if ns.tss.is_timeseries and ns.tss.horizon > 1:
if ns.tss.is_timeseries and (ns.tss.horizon > 1 or isinstance(preds[0], list)):
preds = [p[0] for p in preds]

for col in [f'timestep_{i}' for i in range(1, ns.tss.horizon)]:
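The new guard at the top of explain() short-circuits the whole calibrator when the mixer already emits calibrated outputs: a confidence column is passed through as-is, along with upper/lower bounds when both are present. Extracted into a standalone helper (a sketch with hypothetical frames; the real code operates on ns.predictions in place):

import pandas as pd

def passthrough_confidence(row_insights: pd.DataFrame,
                           predictions: pd.DataFrame) -> pd.DataFrame:
    # copy model-native confidence (and bounds, if both exist) straight through
    if 'confidence' in predictions.columns:
        row_insights['prediction'] = predictions['prediction']
        row_insights['confidence'] = predictions['confidence']
        if {'upper', 'lower'} <= set(predictions.columns):
            row_insights['upper'] = predictions['upper']
            row_insights['lower'] = predictions['lower']
    return row_insights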
100 changes: 67 additions & 33 deletions lightwood/api/json_ai.py
@@ -220,27 +220,46 @@ def generate_json_ai(
):
is_target_predicting_encoder = True

submodels = []
if is_target_predicting_encoder:
submodels = [
{
"module": "Unit",
"args": {
"target_encoder": "$encoders[self.target]",
"stop_after": "$problem_definition.seconds_per_mixer",
},
}
]
submodels.extend(
[
{
"module": "Unit",
"args": {
"target_encoder": "$encoders[self.target]",
"stop_after": "$problem_definition.seconds_per_mixer",
},
}
]
)
else:
submodels = [
{
"module": "Neural",
"args": {
"fit_on_dev": True,
"stop_after": "$problem_definition.seconds_per_mixer",
"search_hyperparameters": True,
},
}
]
if not tss.is_timeseries:
submodels.extend(
[
{
"module": "Neural",
"args": {
"fit_on_dev": True,
"stop_after": "$problem_definition.seconds_per_mixer",
"search_hyperparameters": True,
},
}
]
)
else:
submodels.extend(
[
{
"module": "NeuralTs",
"args": {
"fit_on_dev": True,
"stop_after": "$problem_definition.seconds_per_mixer",
"search_hyperparameters": True,
},
}
]
)
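Condensed, the branch above selects the base mixer by problem shape: targets predicted directly by their encoder get a Unit mixer, plain tabular targets a Neural mixer, and time series a NeuralTs mixer. A compact restatement (names only; the JSON args are omitted):

def pick_base_mixer(is_target_predicting_encoder: bool, is_timeseries: bool) -> str:
    # mirrors the submodel selection above
    if is_target_predicting_encoder:
        return 'Unit'
    return 'NeuralTs' if is_timeseries else 'Neural'

assert pick_base_mixer(False, True) == 'NeuralTs'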

if (not tss.is_timeseries or tss.horizon == 1) and dtype_dict[target] not in (dtype.num_array, dtype.cat_array):
submodels.extend(
@@ -268,7 +287,8 @@ def generate_json_ai(
"args": {
"fit_on_dev": True,
"stop_after": "$problem_definition.seconds_per_mixer",
"horizon": "$problem_definition.timeseries_settings.horizon",
"ts_analysis": "$ts_analysis",
"tss": "$problem_definition.timeseries_settings",
},
}
]
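Each submodel entry is plain JSON-AI: a module name plus an args dict whose '$'-prefixed strings are resolved against predictor attributes at code-generation time; in this hunk the single horizon argument gives way to the full timeseries settings (tss) and the shared $ts_analysis handle. An illustrative entry under those assumptions (module name hypothetical; remaining defaults are back-filled later by _add_implicit_values):

submodel = {
    'module': 'SomeTsMixer',  # hypothetical mixer name
    'args': {
        'stop_after': '$problem_definition.seconds_per_mixer',
        'ts_analysis': '$ts_analysis',
        'tss': '$problem_definition.timeseries_settings',
    },
}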
@@ -494,29 +514,30 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
is_ts = tss.is_timeseries

# Add implicit arguments
# @TODO: Consider removing once we have a proper editor in studio
mixers = json_ai.model['args']['submodels']
for i in range(len(mixers)):
if mixers[i]["module"] == "Unit":
pass

elif mixers[i]["module"] == "Neural":
elif mixers[i]["module"] in ("Neural", "NeuralTs"):
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
"target_encoder", "$encoders[self.target]"
)
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
mixers[i]["args"]["dtype_dict"] = mixers[i]["args"].get(
"dtype_dict", "$dtype_dict"
)
mixers[i]["args"]["timeseries_settings"] = mixers[i]["args"].get(
"timeseries_settings", "$problem_definition.timeseries_settings"
)
mixers[i]["args"]["net"] = mixers[i]["args"].get(
"net",
'"DefaultNet"'
if not tss.is_timeseries or not tss.use_previous_target
else '"ArNet"',
)
if mixers[i]["module"] == "NeuralTs":
mixers[i]["args"]["timeseries_settings"] = mixers[i]["args"].get(
"timeseries_settings", "$problem_definition.timeseries_settings"
)
mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get("ts_analysis", "$ts_analysis")

elif mixers[i]["module"] == "LightGBM":
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
@@ -551,8 +572,17 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
mixers[i]["args"]["target_encoder"] = mixers[i]["args"].get(
"target_encoder", "$encoders[self.target]"
)
if "horizon" not in mixers[i]["args"]:
mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
mixers[i]["args"]["tss"] = mixers[i]["args"].get("tss", "$problem_definition.timeseries_settings")
mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get("ts_analysis", "$ts_analysis")
mixers[i]["args"]["fit_on_dev"] = mixers[i]["args"].get("fit_on_dev", "True")

elif mixers[i]["module"] == "NHitsMixer":
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
mixers[i]["args"]["horizon"] = "$problem_definition.timeseries_settings.horizon"
mixers[i]["args"]["ts_analysis"] = mixers[i]["args"].get(
"ts_analysis", "$ts_analysis"
)
problem_definition.fit_on_all = False # takes too long otherwise

elif mixers[i]["module"] in ("SkTime", "ProphetMixer"):
mixers[i]["args"]["target"] = mixers[i]["args"].get("target", "$target")
@@ -666,6 +696,7 @@ def _add_implicit_values(json_ai: JsonAI) -> JsonAI:
"dtype_dict": "$dtype_dict",
"target": "$target",
"mode": "$mode",
"ts_analysis": "$ts_analysis"
},
},
"timeseries_analyzer": {
@@ -807,11 +838,6 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
# Time-series blocks
{ts_transform_code}
"""
if ts_analyze_code is not None:
clean_body += f"""
if self.mode != 'predict':
{align(ts_analyze_code,1)}
"""

clean_body += '\nreturn data'
@@ -835,12 +861,19 @@ def code_from_json_ai(json_ai: JsonAI) -> str:
# Prepare features Body
# ----------------- #

prepare_body = f"""
prepare_body = """
self.mode = 'train'
if self.statistical_analysis is None:
raise Exception("Please run analyze_data first")
"""
if ts_analyze_code is not None:
prepare_body += f"""
if self.mode != 'predict':
{align(ts_analyze_code, 1)}
"""

prepare_body += f"""
# Column to encoder mapping
self.encoders = {inline_dict(encoder_dict)}
@@ -1133,6 +1166,7 @@ def __init__(self):
# Initial stats analysis
self.statistical_analysis = None
self.ts_analysis = None
self.runtime_log = dict()
@timed
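Throughout _add_implicit_values the back-filling idiom is the same: args.get(key, default), so user-supplied values always win and only missing keys receive '$'-references. A small sketch of the idiom (the defaults shown mirror the NeuralTs ones added in this diff):

def fill_defaults(args: dict, defaults: dict) -> dict:
    # back-fill missing mixer args without overriding user choices
    for key, value in defaults.items():
        args[key] = args.get(key, value)
    return args

args = fill_defaults(
    {'stop_after': 100},  # user-specified, kept as-is
    {
        'target': '$target',
        'dtype_dict': '$dtype_dict',
        'timeseries_settings': '$problem_definition.timeseries_settings',
        'ts_analysis': '$ts_analysis',
    },
)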
2 changes: 2 additions & 0 deletions lightwood/api/types.py
@@ -73,6 +73,7 @@ class StatisticalAnalysis:
:param bias:
:param avg_words_per_sentence:
:param positive_domain:
:param ts_stats:
""" # noqa

nr_rows: int
@@ -87,6 +88,7 @@ class StatisticalAnalysis:
bias: object
avg_words_per_sentence: object
positive_domain: bool
ts_stats: dict


@dataclass_json
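StatisticalAnalysis is a dataclass, so the new ts_stats field only needs a value at construction time; downstream blocks can then read per-series statistics from it. A toy stand-in (not the real class, and the dict keys are hypothetical):

from dataclasses import dataclass, field

@dataclass
class MiniStats:
    # stand-in showing the new ts_stats slot
    nr_rows: int
    positive_domain: bool
    ts_stats: dict = field(default_factory=dict)

stats = MiniStats(nr_rows=1000, positive_domain=False,
                  ts_stats={'periods': [7]})  # hypothetical contents
print(stats.ts_stats.get('periods'))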
4 changes: 3 additions & 1 deletion lightwood/data/cleaner.py
@@ -147,13 +147,15 @@ def _standardize_datetime(element: object) -> Optional[float]:
"""
Parses an expected date-time element. Intakes an element that can in theory be anything.
"""
if element is None or pd.isna(element):
return 0.0 # correct? TODO: Remove if the TS encoder can handle `None`
try:
date = parse_dt(str(element))
except Exception:
try:
date = datetime.datetime.utcfromtimestamp(element)
except Exception:
return None
return 0.0

return date.timestamp()

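Two behavior changes land in _standardize_datetime: missing values (None or NaN) short-circuit to 0.0 before any parsing is attempted, and an unparseable element now also maps to 0.0 instead of None, so time-series encoders always receive a float. A self-contained rendering of the resulting logic:

import datetime
from typing import Optional

import pandas as pd
from dateutil.parser import parse as parse_dt

def standardize_datetime(element: object) -> Optional[float]:
    # coerce anything date-like to a POSIX timestamp; fall back to 0.0
    if element is None or pd.isna(element):
        return 0.0
    try:
        date = parse_dt(str(element))
    except Exception:
        try:
            date = datetime.datetime.utcfromtimestamp(element)
        except Exception:
            return 0.0
    return date.timestamp()

assert standardize_datetime(None) == 0.0
assert standardize_datetime('not a date') == 0.0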
13 changes: 2 additions & 11 deletions lightwood/data/encoded_ds.py
@@ -140,13 +140,15 @@ class ConcatedEncodedDs(EncodedDs):
"""
`ConcatedEncodedDs` abstracts over multiple encoded datasources (`EncodedDs`) as if they were a single entity.
""" # noqa
# TODO: We should probably delete this abstraction, it's not really useful and it adds complexity/overhead
def __init__(self, encoded_ds_arr: List[EncodedDs]) -> None:
# @TODO: missing super() call here?
self.encoded_ds_arr = encoded_ds_arr
self.encoded_ds_lenghts = [len(x) for x in self.encoded_ds_arr]
self.encoders = self.encoded_ds_arr[0].encoders
self.encoder_spans = self.encoded_ds_arr[0].encoder_spans
self.target = self.encoded_ds_arr[0].target
self.data_frame = pd.concat([x.data_frame for x in self.encoded_ds_arr])

def __len__(self):
"""
@@ -166,17 +168,6 @@ def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
idx -= length
raise StopIteration()

@property
def data_frame(self) -> pd.DataFrame:
"""
Property that concatenates all underlying `EncodedDs`'s dataframes and returns them.
Note: be careful to not modify a `ConcatedEncodedDs`, as you can see in the source, it will not have an effect.
:return: Dataframe with all original data.
""" # noqa
return pd.concat([x.data_frame for x in self.encoded_ds_arr])

def get_column_original_data(self, column_name: str) -> pd.Series:
"""
See `lightwood.data.encoded_ds.EncodedDs.get_column_original_data()`.
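The data_frame property, which re-ran pd.concat on every access, becomes a plain attribute computed once in __init__. Repeated reads get cheaper, and the old docstring's caveat still holds: mutating the concatenated copy never writes back to the underlying EncodedDs frames. A sketch contrasting the two shapes (toy classes, not lightwood's):

import pandas as pd

class EagerConcat:
    # concatenate once at construction (the new behavior)
    def __init__(self, frames):
        self.frames = frames
        self.data_frame = pd.concat(frames)

class LazyConcat:
    # re-concatenate on every access (the old property-based behavior)
    def __init__(self, frames):
        self.frames = frames

    @property
    def data_frame(self) -> pd.DataFrame:
        return pd.concat(self.frames)

frames = [pd.DataFrame({'a': [1]}), pd.DataFrame({'a': [2]})]
eager = EagerConcat(frames)
eager.data_frame['a'] = 99          # edits the cached copy only
assert frames[0].loc[0, 'a'] == 1   # source frames are untouched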