diff --git a/mlops/unit_2_training/custom/dashboard_data_source.py b/mlops/unit_2_training/custom/dashboard_data_source.py
index 79cdb3dda..d7cead9f7 100644
--- a/mlops/unit_2_training/custom/dashboard_data_source.py
+++ b/mlops/unit_2_training/custom/dashboard_data_source.py
@@ -10,12 +10,12 @@
 
 @custom
 def source(
+    model: Booster,
     settings: Tuple[
         Dict[str, Union[bool, float, int, str]],
         csr_matrix,
         Series,
     ],
-    model: Booster,
     **kwargs,
 ) -> Tuple[Booster, csr_matrix, csr_matrix]:
     X_train, y_train, _ = settings
diff --git a/mlops/unit_2_training/data_exporters/xgboost.py b/mlops/unit_2_training/data_exporters/xgboost.py
index 456729f18..9ecc9501a 100644
--- a/mlops/unit_2_training/data_exporters/xgboost.py
+++ b/mlops/unit_2_training/data_exporters/xgboost.py
@@ -12,6 +12,7 @@
 
 @data_exporter
 def train(
+    training_set: Dict[str, Union[Series, csr_matrix]],
     settings: Tuple[
         Dict[str, Union[bool, float, int, str]],
         csr_matrix,
@@ -31,5 +32,6 @@ def train(
         hyperparameters,
         verbose_eval=kwargs.get('verbose_eval', 100),
     )
-
-    return model
+
+    # DictVectorizer for online inference.
+    return model, training_set['build'][6]
diff --git a/mlops/unit_4_triggering/pipelines/xgboost_training/metadata.yaml b/mlops/unit_4_triggering/pipelines/xgboost_training/metadata.yaml
index 90ddf8fe9..eac434ca5 100755
--- a/mlops/unit_4_triggering/pipelines/xgboost_training/metadata.yaml
+++ b/mlops/unit_4_triggering/pipelines/xgboost_training/metadata.yaml
@@ -52,14 +52,14 @@ blocks:
   language: python
   name: XGBoost
   retry_config: null
-  status: executed
+  status: updated
   timeout: null
   type: data_exporter
   upstream_blocks:
   - training_set
   - hyperparameter_tuning/xgboost
   uuid: xgboost
-- all_upstream_blocks_executed: true
+- all_upstream_blocks_executed: false
   color: pink
   configuration:
     file_path: custom/dashboard_data_source.py
@@ -72,7 +72,7 @@ blocks:
   language: python
   name: Dashboard data source
   retry_config: null
-  status: failed
+  status: executed
   timeout: null
   type: custom
   upstream_blocks:
@@ -106,7 +106,5 @@ variables:
   early_stopping_rounds: 1
   max_depth: 1
   max_evaluations: 1
-  verbose_eval: 10000
-  verbosity: 0
-variables_dir: /home/src/mage_data/unit_4_triggering
+variables_dir: /root/.mage_data/unit_4_triggering
 widgets: []
diff --git a/mlops/utils/logging.py b/mlops/utils/logging.py
index adb2ca735..532f35d01 100644
--- a/mlops/utils/logging.py
+++ b/mlops/utils/logging.py
@@ -98,47 +98,48 @@ def track_experiment(
 
     dataset_inputs = []
 
-    for dataset_name, dataset, tags in [
-        ('dataset', training_set, dict(context='training')),
-        (
-            'targets',
-            training_targets.to_numpy() if training_targets is not None else None,
-            dict(context='training'),
-        ),
-        ('dataset', validation_set, dict(context='validation')),
-        (
-            'targets',
-            validation_targets.to_numpy() if validation_targets is not None else None,
-            dict(context='validation'),
-        ),
-        ('predictions', predictions, dict(context='training')),
-    ]:
-        if dataset is None:
-            continue
-
-        dataset_from = None
-        if isinstance(dataset, pd.DataFrame):
-            dataset_from = from_pandas
-        elif isinstance(dataset, np.ndarray):
-            dataset_from = from_numpy
-
-        if dataset_from:
-            ds = dataset_from(dataset, name=dataset_name)._to_mlflow_entity()
-            ds_input = DatasetInput(ds, tags=[InputTag(k, v) for k, v in tags.items()])
-            dataset_inputs.append(ds_input)
-
-        if verbosity:
-            context = tags['context']
-            if dataset_from:
-                print(f'Logged input for {context} {dataset_name}.')
-            else:
-                print(
-                    f'Unable to log input for {context} {dataset_name}, '
-                    f'{type(dataset)} not registered.'
-                )
-
-    if len(dataset_inputs) >= 1:
-        client.log_inputs(run_id, dataset_inputs)
+    # This increases memory too much.
+    # for dataset_name, dataset, tags in [
+    #     ('dataset', training_set, dict(context='training')),
+    #     (
+    #         'targets',
+    #         training_targets.to_numpy() if training_targets is not None else None,
+    #         dict(context='training'),
+    #     ),
+    #     ('dataset', validation_set, dict(context='validation')),
+    #     (
+    #         'targets',
+    #         validation_targets.to_numpy() if validation_targets is not None else None,
+    #         dict(context='validation'),
+    #     ),
+    #     ('predictions', predictions, dict(context='training')),
+    # ]:
+    #     if dataset is None:
+    #         continue
+
+    #     dataset_from = None
+    #     if isinstance(dataset, pd.DataFrame):
+    #         dataset_from = from_pandas
+    #     elif isinstance(dataset, np.ndarray):
+    #         dataset_from = from_numpy
+
+    #     if dataset_from:
+    #         ds = dataset_from(dataset, name=dataset_name)._to_mlflow_entity()
+    #         ds_input = DatasetInput(ds, tags=[InputTag(k, v) for k, v in tags.items()])
+    #         dataset_inputs.append(ds_input)
+
+    #     if verbosity:
+    #         context = tags['context']
+    #         if dataset_from:
+    #             print(f'Logged input for {context} {dataset_name}.')
+    #         else:
+    #             print(
+    #                 f'Unable to log input for {context} {dataset_name}, '
+    #                 f'{type(dataset)} not registered.'
+    #             )
+
+    # if len(dataset_inputs) >= 1:
+    #     client.log_inputs(run_id, dataset_inputs)
 
     if model:
         log_model = None
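For context on the new return value in `data_exporters/xgboost.py`: the exporter now hands back the training set's fitted `DictVectorizer` alongside the `Booster`, so an online service can rebuild the exact training-time feature layout. A minimal sketch of that consumption path, assuming standard scikit-learn and XGBoost APIs (the `predict` helper and its feature dicts are illustrative, not part of this patch):

```python
# Hypothetical consumer of the (model, vectorizer) pair returned by train().
from typing import Dict, List, Union

from sklearn.feature_extraction import DictVectorizer
from xgboost import Booster, DMatrix


def predict(
    model: Booster,
    vectorizer: DictVectorizer,
    features: List[Dict[str, Union[str, float]]],
) -> List[float]:
    # transform() maps raw feature dicts onto the same sparse columns
    # the model saw during training.
    X = vectorizer.transform(features)
    # Booster.predict expects a DMatrix, not a raw scipy matrix.
    return model.predict(DMatrix(X)).tolist()
```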