Project import generated by Copybara. (#112)
GitOrigin-RevId: 09f3289f4581ba7d81e145e9593ffcda9233f4bf

Co-authored-by: Snowflake Authors <[email protected]>
sfc-gh-anavalos and Snowflake Authors authored Jul 29, 2024
1 parent 3cbf8f1 commit 123693a
Showing 187 changed files with 8,781 additions and 4,167 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/jira_issue.yml
@@ -40,8 +40,9 @@ jobs:
summary: ${{ github.event.issue.title }}
description: |
${{ github.event.issue.body }} \\ \\ _Created from GitHub Action_ for ${{ github.event.issue.html_url }}
# Assign triage-ml-platform-dl and set "Data Platform: ML Engineering" component.
fields: '{"customfield_11401":{"id":"14538"}, "assignee":{"id":"639020ab3c26ca7fa0d6eb3f"},"components":[{"id":"16520"}]}'
# Assign triage-ml-platform-dl and set "ML Platform" component (19112).
# See https://snowflakecomputing.atlassian.net/rest/api/2/project/SNOW/components for component information.
fields: '{"customfield_11401":{"id":"14538"}, "assignee":{"id":"639020ab3c26ca7fa0d6eb3f"},"components":[{"id":"19112"}]}'

- name: Update GitHub Issue
uses: ./jira/gajira-issue-update
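
For reference, here is a hedged sketch (not part of this diff) of how the numeric component IDs referenced in the comments above can be looked up from the Jira components endpoint that the comment links to; the credential environment variables are illustrative assumptions.

```python
# Illustrative only: list the SNOW project's Jira components so the numeric id
# (e.g. 19112 for "ML Platform") can be pasted into the workflow's fields JSON.
# JIRA_EMAIL and JIRA_API_TOKEN are assumed environment variables.
import os

import requests

resp = requests.get(
    "https://snowflakecomputing.atlassian.net/rest/api/2/project/SNOW/components",
    auth=(os.environ["JIRA_EMAIL"], os.environ["JIRA_API_TOKEN"]),
    timeout=30,
)
resp.raise_for_status()
for component in resp.json():
    print(component["id"], component["name"])
```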
48 changes: 47 additions & 1 deletion CHANGELOG.md
@@ -1,6 +1,52 @@
# Release History

## 1.5.4
## 1.6.0

### Bug Fixes

- Modeling: `SimpleImputer` can impute integer columns with integer values.
- Registry: Fix an issue when a pandas DataFrame whose index does not start from 0 is provided as the input to
  `ModelVersion.run` (see the sketch after this list).
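
A minimal sketch of the case covered by the `ModelVersion.run` fix above, assuming a model already logged to the registry; the model and column names are illustrative.

```python
# Illustrative only: pass a pandas DataFrame whose index does not start at 0
# to ModelVersion.run (the case fixed in this release).
import pandas as pd
from snowflake.ml.registry import Registry

reg = Registry(session=session)               # an existing Snowpark session (assumed)
mv = reg.get_model("MY_MODEL").version("V1")  # a previously logged model (assumed)

df = pd.DataFrame({"FEATURE_1": [1.0, 2.0, 3.0], "FEATURE_2": [0.1, 0.2, 0.3]})
df = df.iloc[1:]          # index now starts at 1 rather than 0
predictions = mv.run(df)
```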

### New Features

- Feature Store: Add overloads so APIs accept either a feature view object or a name/version pair (see the sketch after
  this list). Impacted APIs include `read_feature_view()`, `refresh_feature_view()`, `get_refresh_history()`,
  `resume_feature_view()`, `suspend_feature_view()`, and `delete_feature_view()`.
- Feature Store: Add docstring inline examples for all public APIs.
- Feature Store: Add new utility class `ExampleHelper` to help load source data and simplify public notebooks.
- Registry: Option to `enable_explainability` when registering XGBoost models as a pre-PuPr feature (see the sketch
  after this list).
- Feature Store: add new API `update_entity()`.
- Registry: Option to `enable_explainability` when registering Catboost models as a pre-PuPr feature.
- Feature Store: Add a new argument `warehouse` to the `FeatureView` constructor to override the default warehouse. Also
  add a new column `warehouse` to the output of `list_feature_views()`.
- Registry: Add support for logging a model from a model version.
- Modeling: Distributed hyperparameter optimization now announces its GA refresh version. The latest memory-efficient
  version no longer has the 10 GB training-dataset limitation. To turn it off, run:

  ```python
  from snowflake.ml.modeling._internal.snowpark_implementations import (
      distributed_hpo_trainer,
  )

  distributed_hpo_trainer.ENABLE_EFFICIENT_MEMORY_USAGE = False
  ```
- Registry: Option to `enable_explainability` when registering LightGBM models as a pre-PuPr feature.
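
A hedged sketch of the Feature Store changes above, covering the object-or-name overloads and the new `warehouse` argument; database, schema, and warehouse names are illustrative, and exact signatures may differ slightly.

```python
from snowflake.ml.feature_store import CreationMode, FeatureStore, FeatureView

fs = FeatureStore(
    session=session,                      # an existing Snowpark session (assumed)
    database="MY_DB",
    name="MY_FS_SCHEMA",
    default_warehouse="FS_WH",
    creation_mode=CreationMode.CREATE_IF_NOT_EXIST,
)

# Overloaded APIs now accept either the object or a name/version pair.
fv = fs.get_feature_view("MY_FV", "V1")
df = fs.read_feature_view(fv)             # by object
df = fs.read_feature_view("MY_FV", "V1")  # by name/version
fs.refresh_feature_view("MY_FV", "V1")

# FeatureView can now override the store's default warehouse.
fv2 = FeatureView(
    name="MY_FV2",
    entities=[my_entity],                 # an Entity defined elsewhere (assumed)
    feature_df=features_df,               # a Snowpark DataFrame (assumed)
    refresh_freq="1d",
    warehouse="FEATURE_REFRESH_WH",
)
```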
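And a sketch of registering an XGBoost model with the pre-PuPr explainability option; passing the flag through `log_model`'s `options` dict is an assumption based on the entry above, not confirmed by this diff.

```python
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from snowflake.ml.registry import Registry

X, y = load_breast_cancer(return_X_y=True, as_frame=True)
model = xgb.XGBClassifier(n_estimators=10).fit(X, y)

reg = Registry(session=session)  # an existing Snowpark session (assumed)
mv = reg.log_model(
    model,
    model_name="XGB_EXPLAIN_DEMO",
    version_name="V1",
    sample_input_data=X.head(10),
    options={"enable_explainability": True},  # assumed plumbing for the new option
)
```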

### Behavior Changes

- Feature Store: change some positional parameters to keyword arguments in the following APIs (see the sketch after
  this list):
- Entity(): desc.
- FeatureView(): timestamp_col, refresh_freq, desc.
- FeatureStore(): creation_mode.
- update_entity(): desc.
- register_feature_view(): block, overwrite.
- list_feature_views(): entity_name, feature_view_name.
- get_refresh_history(): verbose.
- retrieve_feature_values(): spine_timestamp_col, exclude_columns, include_feature_view_timestamp_col.
- generate_training_set(): save_as, spine_timestamp_col, spine_label_cols, exclude_columns,
include_feature_view_timestamp_col.
- generate_dataset(): version, spine_timestamp_col, spine_label_cols, exclude_columns,
include_feature_view_timestamp_col, desc, output_type.
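
A brief sketch of the keyword-argument change; the calls below assume a feature store `fs` and a Snowpark DataFrame `features_df` created elsewhere, and are illustrative only.

```python
from snowflake.ml.feature_store import Entity, FeatureView

# Arguments such as desc, timestamp_col, refresh_freq, block, and overwrite
# must now be passed by keyword rather than positionally.
customer = Entity(name="CUSTOMER", join_keys=["CUSTOMER_ID"], desc="Customer entity")

fv = FeatureView(
    name="CUSTOMER_FEATURES",
    entities=[customer],
    feature_df=features_df,        # a Snowpark DataFrame (assumed)
    timestamp_col="EVENT_TS",      # keyword-only after this change
    refresh_freq="1d",             # keyword-only after this change
    desc="Customer aggregates",    # keyword-only after this change
)

fs.register_feature_view(fv, version="1", block=True, overwrite=False)
```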

## 1.5.4 (2024-07-11)

### Bug Fixes

2 changes: 1 addition & 1 deletion ci/conda_recipe/meta.yaml
@@ -17,7 +17,7 @@ build:
noarch: python
package:
name: snowflake-ml-python
version: 1.5.4
version: 1.6.0
requirements:
build:
- python
1 change: 0 additions & 1 deletion ci/targets/quarantine/prod3.txt
@@ -2,4 +2,3 @@
//tests/integ/snowflake/ml/registry:model_registry_snowservice_integ_test
//tests/integ/snowflake/ml/model:spcs_llm_model_integ_test
//tests/integ/snowflake/ml/extra_tests:xgboost_external_memory_training_test
//tests/integ/snowflake/ml/lineage:lineage_integ_test
4 changes: 2 additions & 2 deletions codegen/build_file_autogen.py
@@ -14,7 +14,7 @@
from absl import app

from codegen import sklearn_wrapper_autogen as swa
from snowflake.ml.snowpark_pandas import imports
from snowflake.ml._internal.snowpark_pandas import imports


@dataclass(frozen=True)
@@ -188,7 +188,7 @@ def get_snowpark_pandas_test_build_file_content(module: imports.ModuleInfo, modu
return (
'load("//codegen:codegen_rules.bzl", "autogen_snowpark_pandas_tests")\n'
f'load("//{module_root_dir}:estimators_info.bzl", "snowpark_pandas_estimator_info_list")\n'
'package(default_visibility = ["//snowflake/ml/snowpark_pandas"])\n'
'package(default_visibility = ["//snowflake/ml/_internal/snowpark_pandas"])\n'
"\nautogen_snowpark_pandas_tests(\n"
f' module = "{module.module_name}",\n'
f' module_root_dir = "{module_root_dir}",\n'
2 changes: 1 addition & 1 deletion codegen/codegen_rules.bzl
@@ -178,7 +178,7 @@ def autogen_snowpark_pandas_tests(module, module_root_dir, snowpark_pandas_estim
name = "{}_snowpark_pandas_test".format(e.normalized_class_name),
srcs = [":generate_test_snowpark_pandas_{}".format(e.normalized_class_name)],
deps = [
"//snowflake/ml/snowpark_pandas:snowpark_pandas_lib",
"//snowflake/ml/_internal/snowpark_pandas:snowpark_pandas_lib",
"//snowflake/ml/utils:connection_params",
],
compatible_with_snowpark = False,
21 changes: 21 additions & 0 deletions codegen/sklearn_wrapper_generator.py
@@ -205,6 +205,18 @@ def _is_data_module_obj(class_object: Tuple[str, type]) -> bool:
"""
return class_object[1].__module__ == "sklearn.preprocessing._data"

@staticmethod
def _is_preprocessing_module_obj(class_object: Tuple[str, type]) -> bool:
"""Check if the given class belongs to the SKLearn preprocessing module.
Args:
class_object: Meta class object which needs to be checked.
Returns:
True if the class belongs to `sklearn.preprocessing` module, otherwise False.
"""
return class_object[1].__module__.startswith("sklearn.preprocessing")

@staticmethod
def _is_cross_decomposition_module_obj(class_object: Tuple[str, type]) -> bool:
"""Check if the given class belongs to the SKLearn cross_decomposition module.
@@ -675,6 +687,7 @@ def _populate_flags(self) -> None:
self._is_cross_decomposition_module_obj = WrapperGeneratorFactory._is_cross_decomposition_module_obj(
self.class_object
)
self._is_preprocessing_module_obj = WrapperGeneratorFactory._is_preprocessing_module_obj(self.class_object)
self._is_regressor = WrapperGeneratorFactory._is_regressor_obj(self.class_object)
self._is_classifier = WrapperGeneratorFactory._is_classifier_obj(self.class_object)
self._is_meta_estimator = WrapperGeneratorFactory._is_meta_estimator_obj(self.class_object)
@@ -1014,6 +1027,14 @@ def generate(self) -> "SklearnWrapperGenerator":
if "random_state" in self.original_init_signature.parameters.keys():
self.test_estimator_input_args_list.append("random_state=0")

# Our preprocessing classes don't support sparse features
if "sparse" in self.original_init_signature.parameters.keys() and self._is_preprocessing_module_obj:
self.test_estimator_input_args_list.append("sparse=False")

# For the case of KBinsDiscretizer, we need to set encode to ordinal
# if "encode" in self.original_init_signature.parameters.keys() and self._is_preprocessing_module_obj:
# self.test_estimator_input_args_list.append("encode='ordinal'")

if (
"max_iter" in self.original_init_signature.parameters.keys()
and not self._is_hist_gradient_boosting_regressor
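A small illustration (not part of the diff) of the `__module__`-prefix test that `_is_preprocessing_module_obj` relies on; the concrete classes are just examples.

```python
# sklearn preprocessing classes live in private submodules such as
# sklearn.preprocessing._data, so a startswith() check on __module__ covers
# the whole preprocessing package.
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

for cls in (MinMaxScaler, OneHotEncoder, LinearRegression):
    is_preprocessing = cls.__module__.startswith("sklearn.preprocessing")
    print(f"{cls.__name__}: {cls.__module__} -> {is_preprocessing}")

# Expected output:
# MinMaxScaler: sklearn.preprocessing._data -> True
# OneHotEncoder: sklearn.preprocessing._encoders -> True
# LinearRegression: sklearn.linear_model._base -> False
```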
86 changes: 51 additions & 35 deletions codegen/snowpark_pandas_autogen_test_template.py_template
@@ -16,10 +16,10 @@ import pytest
from typing import Any, Dict, List, Optional, Tuple, Union
from absl.testing.absltest import TestCase, main
{transform.test_snowpark_pandas_imports}
# from snowflake.ml.beta import snowpark_pandas
# from snowflake.ml import snowpark_pandas
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session
# from snowflake.snowpark.modin import pandas as SnowparkPandas
# from snowflake.snowpark.modin import pandas as snowpark_pandas

_INFERENCE = "INFERENCE"
_EXPECTED = "EXPECTED"
@@ -35,7 +35,7 @@ class DatasetType(enum.Enum):
class {transform.test_class_name}(TestCase):
def setUp(self) -> None:
"""Creates Snowpark and Snowflake environments for testing."""
self._session = Session.builder.configs(SnowflakeLoginOptions("sfc")).create()
self._session = Session.builder.configs(SnowflakeLoginOptions()).create()

def tearDown(self) -> None:
self._session.close()
@@ -114,12 +114,12 @@ class {transform.test_class_name}(TestCase):
# inference_methods.remove("transform") # underlying estimators have no method 'transform'
# if Sk{transform.original_class_name}.__name__ == "LocalOutlierFactor" and not reg.novelty:
# inference_methods.remove("predict")

# for m in inference_methods:
# if callable(getattr(reg, m, None)):
# res = getattr(reg, m)(dataset)
# TODO(hayu): Remove the output manipulation as the results should be exactly the same as sklearn.
# if isinstance(res, SnowparkPandas.DataFrame) or isinstance(res, pd.DataFrame):
# # TODO(hayu): Remove the output manipulation as the results should be exactly the same as sklearn.
# if isinstance(res, snowpark_pandas.DataFrame) or isinstance(res, pd.DataFrame):
# arr = res.to_numpy()
# elif isinstance(res, list):
# arr = np.array(res)
@@ -128,14 +128,14 @@
# if arr.ndim == 2 and arr.shape[1] == 1:
# arr = arr.flatten()
# if len(arr.shape) == 3:
# VotingClassifier will return results of shape (n_classifiers, n_samples, n_classes)
# when voting = "soft" and flatten_transform = False. We can't handle unflatten transforms,
# so we ignore flatten_transform flag and flatten the results. We need flatten sklearn results
# also to compare with snowflake results.
# # VotingClassifier will return results of shape (n_classifiers, n_samples, n_classes)
# # when voting = "soft" and flatten_transform = False. We can't handle unflatten transforms,
# # so we ignore flatten_transform flag and flatten the results. We need flatten sklearn results
# # also to compare with snowflake results.
# arr = np.hstack(arr) # type: ignore[arg-type]
# elif len(arr.shape) == 1:
# Sometimes sklearn returns results as 1D array of shape (n_samples,), but snowflake always returns
# response as 2D array of shape (n_samples, 1). Flatten the snowflake response to compare results.
# # Sometimes sklearn returns results as 1D array of shape (n_samples,), but snowflake always returns
# # response as 2D array of shape (n_samples, 1). Flatten the snowflake response to compare results.
# arr = arr.flatten()
# output[_INFERENCE].append(arr)

@@ -152,7 +152,7 @@
# for m in expected_methods:
# if callable(getattr(reg, m, None)):
# res = getattr(reg, m)(dataset)
# if isinstance(res, SnowparkPandas.DataFrame) or isinstance(res, pd.DataFrame):
# if isinstance(res, snowpark_pandas.DataFrame) or isinstance(res, pd.DataFrame):
# arr = res.to_numpy()
# elif isinstance(res, list):
# arr = np.array(res)
@@ -161,8 +161,8 @@
# if arr.ndim == 2 and arr.shape[1] == 1:
# arr = arr.flatten()
# if isinstance(arr, list):
# In case of multioutput estimators predict_proba, decision_function, etc., returns a list of
# ndarrays as output. We need to concatenate them to compare with snowflake output.
# # In case of multioutput estimators predict_proba, decision_function, etc., returns a list of
# # ndarrays as output. We need to concatenate them to compare with snowflake output.
# arr = np.concatenate(arr, axis=1)
# elif len(arr.shape) == 1:
# # Sometimes sklearn returns results as 1D array of shape (n_samples,), but snowflake always returns
@@ -189,14 +189,18 @@

# reg = Sk{transform.original_class_name}({transform.test_estimator_input_args})

# # Special handle for label encoder: sklearn label encoder fit method only accept fit(y),
# # but our SnowML API would treat it as fit(X)
# _is_label_encoder = reg.__class__.__name__ == "LabelEncoder"

# input_df_pandas, input_cols, label_col = self._get_test_dataset(
# sklearn_obj=reg,
# add_sample_weight_col=use_weighted_dataset
# )
# input_df_snowpark_pandas = SnowparkPandas.DataFrame(input_df_pandas)
# input_df_snowpandas = snow_pd.DataFrame(input_df_pandas)

# pd_X, pd_y = input_df_pandas[input_cols], input_df_pandas[label_col].squeeze()
# snow_X, snow_y = input_df_snowpark_pandas[input_cols], input_df_snowpark_pandas[label_col].squeeze()
# snow_X, snow_y = input_df_snowpandas[input_cols], input_df_snowpandas[label_col].squeeze()
# pd_args = {{
# 'X': pd_X,
# 'y': pd_y,
@@ -205,21 +209,23 @@
# 'X': snow_X,
# 'y': snow_y,
# }}
# if use_weighted_dataset:

# # SnowML preprocessing class currently doesn't support sample weight
# if use_weighted_dataset and not {transform._is_preprocessing_module_obj}:
# pd_args['sample_weight'] = input_df_pandas["SAMPLE_WEIGHT"].squeeze()
# snow_args['sample_weight'] = input_df_snowpark_pandas["SAMPLE_WEIGHT"].squeeze()
# snow_args['sample_weight'] = input_df_snowpandas["SAMPLE_WEIGHT"].squeeze()

# pd_score_args = snow_score_args = None
# if callable(getattr(reg, "score", None)):
# pd_score_args = copy.deepcopy(pd_args)
# snow_score_args = copy.deepcopy(snow_args)
# score_argspec = inspect.getfullargspec(reg.score)
# Some classes that has sample_weight argument in fit() but not in score().
# # Some classes that has sample_weight argument in fit() but not in score().
# if use_weighted_dataset and 'sample_weight' not in score_argspec.args:
# del pd_score_args['sample_weight']
# del snow_score_args['sample_weight']

# Some classes have different arg name in score: X -> X_test
# # Some classes have different arg name in score: X -> X_test
# if "X_test" in score_argspec.args:
# pd_score_args['X_test'] = pd_score_args.pop('X')
# snow_score_args['X_test'] = snow_score_args.pop('X')
@@ -229,24 +235,34 @@
# pd_args['Y'] = pd_args.pop('y')
# snow_args['Y'] = snow_args.pop('y')

# pandas
# pd_output = self._compute_output(reg, pd_args, input_df_pandas[input_cols], pd_score_args)
# # pandas
# if _is_label_encoder:
# pd_output = self._compute_output(reg, {{'y': input_df_pandas[label_col]}}, input_df_pandas[label_col], None)
# else:
# pd_output = self._compute_output(reg, pd_args, input_df_pandas[input_cols], pd_score_args)

# snowpark_pandas
# # snowpandas
# snowpark_pandas.init()

# # Integrate with native distributed preprocessing methods
# snow_reg = Sk{transform.original_class_name}({transform.test_estimator_input_args})
# args = snow_args if training == DatasetType.SNOWPARK_PANDAS else pd_args
# dataset, score_args = (
# (input_df_snowpark_pandas[input_cols], snow_score_args) if inference == DatasetType.SNOWPARK_PANDAS
# (input_df_snowpandas[input_cols], snow_score_args) if inference == DatasetType.SNOWPARK_PANDAS
# else (input_df_pandas[input_cols], pd_score_args)
# )
# snow_output = self._compute_output(snow_reg, args, dataset, score_args)
# if _is_label_encoder:
# if training == DatasetType.SNOWPARK_PANDAS:
# snow_output = self._compute_output(reg, {{'X': input_df_snowpandas[label_col]}}, input_df_snowpandas[label_col], None)
# else:
# snow_output = self._compute_output(reg, {{'y': input_df_pandas[label_col]}}, input_df_pandas[label_col], None)
# else:
# snow_output = self._compute_output(snow_reg, args, dataset, score_args)

# for pd_arr, snow_arr in zip(pd_output[_INFERENCE], snow_output[_INFERENCE]):
# snow_arr = snow_arr.astype(pd_arr.dtype) # type: ignore[union-attr]
# TODO(snandamuri): HistGradientBoostingRegressor is returning different results in different envs.
# Needs further debugging.
# # TODO(snandamuri): HistGradientBoostingRegressor is returning different results in different envs.
# # Needs further debugging.
# if {transform._is_hist_gradient_boosting_regressor}:
# num_diffs = (~np.isclose(snow_arr, pd_arr)).sum()
# num_example = pd_arr.shape[0]
@@ -282,13 +298,13 @@ class {transform.test_class_name}(TestCase):
# use_weighted_dataset=False
# )

def _is_weighted_dataset_supported(self, klass: type) -> bool:
is_weighted_dataset_supported = False
for m in inspect.getmembers(klass):
if inspect.isfunction(m[1]) and m[0] == "fit":
argspec = inspect.getfullargspec(m[1])
is_weighted_dataset_supported = True if "sample_weight" in argspec.args else False
return is_weighted_dataset_supported
# def _is_weighted_dataset_supported(self, klass: type) -> bool:
# is_weighted_dataset_supported = False
# for m in inspect.getmembers(klass):
# if inspect.isfunction(m[1]) and m[0] == "fit":
# argspec = inspect.getfullargspec(m[1])
# is_weighted_dataset_supported = True if "sample_weight" in argspec.args else False
# return is_weighted_dataset_supported

# def test_weighted_datasets_snow_snow(self) -> None:
# if self._is_weighted_dataset_supported(Sk{transform.original_class_name}):
20 changes: 20 additions & 0 deletions snowflake/cortex/BUILD.bazel
@@ -29,6 +29,25 @@ py_library(
srcs = ["_sse_client.py"],
)

py_library(
name = "classify_text",
srcs = ["_classify_text.py"],
deps = [
":util",
"//snowflake/ml/_internal:telemetry",
],
)

py_test(
name = "classify_text_test",
srcs = ["classify_text_test.py"],
deps = [
":classify_text",
":test_util",
"//snowflake/ml/utils:connection_params",
],
)

py_library(
name = "complete",
srcs = ["_complete.py"],
@@ -140,6 +159,7 @@ py_library(
"__init__.py",
],
deps = [
":classify_text",
":complete",
":extract_answer",
":sentiment",
2 changes: 2 additions & 0 deletions snowflake/cortex/__init__.py
@@ -1,10 +1,12 @@
from snowflake.cortex._classify_text import ClassifyText
from snowflake.cortex._complete import Complete, CompleteOptions
from snowflake.cortex._extract_answer import ExtractAnswer
from snowflake.cortex._sentiment import Sentiment
from snowflake.cortex._summarize import Summarize
from snowflake.cortex._translate import Translate

__all__ = [
"ClassifyText",
"Complete",
"CompleteOptions",
"ExtractAnswer",
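
A hedged usage sketch for the newly exported `ClassifyText`; the argument list (input text, candidate categories, optional session) is assumed from Cortex's other single-call helpers and is not confirmed by this diff.

```python
# Illustrative only: classify a piece of text into one of the given categories.
from snowflake.cortex import ClassifyText
from snowflake.ml.utils.connection_params import SnowflakeLoginOptions
from snowflake.snowpark import Session

session = Session.builder.configs(SnowflakeLoginOptions()).create()
label = ClassifyText(
    "One day I will see the world",   # text to classify
    ["travel", "cooking"],            # candidate categories (assumed parameter)
    session=session,
)
print(label)
```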