diff --git a/.github/workflows/create-more-space/action.yml b/.github/workflows/create-more-space/action.yml new file mode 100644 index 000000000..805d00107 --- /dev/null +++ b/.github/workflows/create-more-space/action.yml @@ -0,0 +1,14 @@ +# We are running out space on the disk, solution described here: +# https://github.com/actions/runner-images/issues/2840#issuecomment-790492173 + +name: "Create more disk space" +description: "Removing some folders to create more disk space" +runs: + using: composite + steps: + - shell: bash + run: | + sudo rm -rf /usr/share/dotnet + sudo rm -rf /opt/ghc + sudo rm -rf "/usr/local/share/boost" + sudo rm -rf "$AGENT_TOOLSDIRECTORY" diff --git a/.github/workflows/run-unittests-default_setup.yml b/.github/workflows/run-unittests-default_setup.yml index da6ba5c80..f0021e7a5 100644 --- a/.github/workflows/run-unittests-default_setup.yml +++ b/.github/workflows/run-unittests-default_setup.yml @@ -1,18 +1,13 @@ -name: tests/unitary/default_setup/** +name: "[Py3.8][Py3.9][Py3.10] tests/unitary/default_setup/**" on: workflow_dispatch: pull_request: - branches: - - main - - "release/**" - - develop paths: - "ads/**" - pyproject.toml - "**requirements.txt" - - .github/workflows/run-unittests.yml - - .github/workflows/run-unittests-default_setup.yml + - ".github/workflows/run-unittests*.yml" # Cancel in progress workflows on pull_requests. # https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-a-fallback-value @@ -41,37 +36,16 @@ jobs: steps: - uses: actions/checkout@v3 - # Caching python libraries installed with pip - # https://github.com/actions/cache/blob/main/examples.md#python---pip - - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/test-requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip- - uses: actions/setup-python@v4 with: python-version: ${{ matrix.python-version }} + cache: "pip" + cache-dependency-path: | + pyproject.toml + "**requirements.txt" - - name: "Test config setup" - shell: bash - env: - HOME_RUNNER_DIR: /home/runner - run: | - set -x # print commands that are executed - mkdir -p "$HOME_RUNNER_DIR"/.oci - openssl genrsa -out $HOME_RUNNER_DIR/.oci/oci_ads_user.pem 2048 - cat <> "$HOME_RUNNER_DIR/.oci/config" - [DEFAULT] - user=ocid1.user.oc1..xxx - fingerprint=00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00 - tenancy=ocid1.tenancy.oc1..xxx - region=test_region - key_file=$HOME_RUNNER_DIR/.oci/oci_ads_user.pem - EOT - ls -lha "$HOME_RUNNER_DIR"/.oci - echo "Test config file:" - cat $HOME_RUNNER_DIR/.oci/config + - uses: ./.github/workflows/set-dummy-conf + name: "Test config setup" - name: "Run default_setup tests folder ONLY with minimum ADS dependencies" timeout-minutes: 15 diff --git a/.github/workflows/run-unittests.yml b/.github/workflows/run-unittests-py38-cov-report.yml similarity index 76% rename from .github/workflows/run-unittests.yml rename to .github/workflows/run-unittests-py38-cov-report.yml index c5ca97368..3493ac624 100644 --- a/.github/workflows/run-unittests.yml +++ b/.github/workflows/run-unittests-py38-cov-report.yml @@ -1,18 +1,13 @@ -name: tests/unitary/** +name: "[Py3.8][COV REPORT] tests/unitary/**" on: workflow_dispatch: pull_request: - branches: - - main - - "release/**" - - develop paths: - "ads/**" - pyproject.toml - "**requirements.txt" - - .github/workflows/run-unittests.yml - - .github/workflows/run-unittests-default_setup.yml + - ".github/workflows/run-unittests*.yml" # Cancel in progress workflows on pull_requests. 
# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-a-fallback-value @@ -30,14 +25,13 @@ env: jobs: test: - name: python ${{ matrix.python-version }}, ${{ matrix.name }} + name: python 3.8, ${{ matrix.name }} runs-on: ubuntu-latest timeout-minutes: 90 strategy: fail-fast: false matrix: - python-version: ["3.8", "3.9", "3.10"] test-path: ["tests/unitary/with_extras tests/unitary/default_setup", "tests/unitary/with_extras/model"] include: - test-path: "tests/unitary/with_extras tests/unitary/default_setup" @@ -45,43 +39,23 @@ jobs: name: "unitary" - test-path: "tests/unitary/with_extras/model" name: "model" - - python-version: "3.8" - cov-reports: --cov=ads --cov-report=xml --cov-report=html steps: - uses: actions/checkout@v3 - # Caching python libraries installed with pip - # https://github.com/actions/cache/blob/main/examples.md#python---pip - - uses: actions/cache@v3 - with: - path: ~/.cache/pip - key: ${{ runner.os }}-pip-${{ hashFiles('**/dev-requirements.txt') }} - restore-keys: | - ${{ runner.os }}-pip- + - uses: ./.github/workflows/create-more-space + name: "Create more disk space" + - uses: actions/setup-python@v4 with: - python-version: ${{ matrix.python-version }} + python-version: "3.8" + cache: "pip" + cache-dependency-path: | + pyproject.toml + "**requirements.txt" - - name: "Test config setup" - shell: bash - env: - HOME_RUNNER_DIR: /home/runner - run: | - set -x # print commands that are executed - mkdir -p "$HOME_RUNNER_DIR"/.oci - openssl genrsa -out $HOME_RUNNER_DIR/.oci/oci_ads_user.pem 2048 - cat <> "$HOME_RUNNER_DIR/.oci/config" - [DEFAULT] - user=ocid1.user.oc1..xxx - fingerprint=00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00 - tenancy=ocid1.tenancy.oc1..xxx - region=test_region - key_file=$HOME_RUNNER_DIR/.oci/oci_ads_user.pem - EOT - ls -lha "$HOME_RUNNER_DIR"/.oci - echo "Test config file:" - cat $HOME_RUNNER_DIR/.oci/config + - uses: ./.github/workflows/set-dummy-conf + name: "Test config setup" - name: "Test env setup" timeout-minutes: 20 @@ -104,22 +78,19 @@ jobs: set -x # print commands that are executed # Setup project and tests folder for cov reports to not be overwritten by another parallel step - if [[ ! 
-z "${{ matrix.cov-reports }}" ]]; then - mkdir -p cov-${{ matrix.name }} - cd cov-${{ matrix.name }} - ln -s ../tests tests - ln -s ../ads ads - ln -s ../.coveragerc .coveragerc - fi + mkdir -p cov-${{ matrix.name }} + cd cov-${{ matrix.name }} + ln -s ../tests tests + ln -s ../ads ads + ln -s ../.coveragerc .coveragerc # Run tests python -m pytest -v -p no:warnings --durations=5 \ - -n auto --dist loadfile ${{ matrix.cov-reports }} \ + -n auto --dist loadfile --cov=ads --cov-report=xml --cov-report=html \ ${{ matrix.test-path }} ${{ matrix.ignore-path }} - name: "Save coverage files" uses: actions/upload-artifact@v3 - if: ${{ matrix.cov-reports }} with: name: cov-reports-${{ matrix.name }} path: | @@ -132,7 +103,7 @@ jobs: runs-on: ubuntu-latest continue-on-error: true needs: test - if: ${{ success() }} && ${{ github.event.issue.pull_request }} + if: ${{ success() }} && && github.event_name == 'pull_request' env: COMPARE_BRANCH: main diff --git a/.github/workflows/run-unittests-py39-py310.yml b/.github/workflows/run-unittests-py39-py310.yml new file mode 100644 index 000000000..ae505ebf2 --- /dev/null +++ b/.github/workflows/run-unittests-py39-py310.yml @@ -0,0 +1,84 @@ +name: "[Py3.9][Py3.10] - tests/unitary/**" + +on: + workflow_dispatch: + pull_request: + paths: + - "ads/**" + - pyproject.toml + - "**requirements.txt" + - ".github/workflows/run-unittests*.yml" + +# Cancel in progress workflows on pull_requests. +# https://docs.github.com/en/actions/using-jobs/using-concurrency#example-using-a-fallback-value +concurrency: + group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} + cancel-in-progress: true + +permissions: + contents: read + pull-requests: write + +# hack for https://github.com/actions/cache/issues/810#issuecomment-1222550359 +env: + SEGMENT_DOWNLOAD_TIMEOUT_MINS: 5 + +jobs: + test: + name: python ${{ matrix.python-version }}, ${{ matrix.name }} + runs-on: ubuntu-latest + timeout-minutes: 90 + + strategy: + fail-fast: false + matrix: + python-version: ["3.9", "3.10"] + test-path: ["tests/unitary/with_extras tests/unitary/default_setup", "tests/unitary/with_extras/model"] + include: + - test-path: "tests/unitary/with_extras tests/unitary/default_setup" + ignore-path: "--ignore tests/unitary/with_extras/model --ignore tests/unitary/with_extras/feature_store" + name: "unitary" + - test-path: "tests/unitary/with_extras/model" + name: "model" + + steps: + - uses: actions/checkout@v3 + + - uses: ./.github/workflows/create-more-space + name: "Create more disk space" + + - uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + cache: "pip" + cache-dependency-path: | + pyproject.toml + "**requirements.txt" + + - uses: ./.github/workflows/set-dummy-conf + name: "Test config setup" + + - name: "Test env setup" + timeout-minutes: 20 + shell: bash + run: | + set -x # print commands that are executed + + sudo apt-get install libkrb5-dev graphviz + $CONDA/bin/conda init + source /home/runner/.bashrc + + pip install -r dev-requirements.txt + + - name: "Run unitary tests folder with maximum ADS dependencies" + timeout-minutes: 60 + shell: bash + env: + CONDA_PREFIX: /usr/share/miniconda + run: | + set -x # print commands that are executed + + # Run tests + python -m pytest -v -p no:warnings --durations=5 \ + -n auto --dist loadfile \ + ${{ matrix.test-path }} ${{ matrix.ignore-path }} diff --git a/.github/workflows/set-dummy-conf/action.yml b/.github/workflows/set-dummy-conf/action.yml new file mode 100644 index 000000000..559048cfb --- 
/dev/null +++ b/.github/workflows/set-dummy-conf/action.yml @@ -0,0 +1,26 @@ +# This composite action is to avoid duplicating code across test actions. +# Ref: https://docs.github.com/en/actions/creating-actions/creating-a-composite-action + +name: "Create dummy oci config" +description: "Creates dummy oci config folder in ~/.oci" +runs: + using: composite + steps: + - shell: bash + env: + HOME_RUNNER_DIR: /home/runner + run: | + set -x # print commands that are executed + mkdir -p "$HOME_RUNNER_DIR"/.oci + openssl genrsa -out $HOME_RUNNER_DIR/.oci/oci_ads_user.pem 2048 + cat <> "$HOME_RUNNER_DIR/.oci/config" + [DEFAULT] + user=ocid1.user.oc1..xxx + fingerprint=00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00 + tenancy=ocid1.tenancy.oc1..xxx + region=test_region + key_file=$HOME_RUNNER_DIR/.oci/oci_ads_user.pem + EOT + ls -lha "$HOME_RUNNER_DIR"/.oci + echo "Test config file:" + cat $HOME_RUNNER_DIR/.oci/config diff --git a/.gitleaks.toml b/.gitleaks.toml index f6dee79c4..ea1e8863d 100644 --- a/.gitleaks.toml +++ b/.gitleaks.toml @@ -13,6 +13,7 @@ useDefault = true '''example-password''', '''this-is-not-the-secret''', '''''', + '''security_token''', # NVIDIA_GPGKEY_SUM from public documentation: # https://gitlab.com/nvidia/container-images/cuda/-/blob/master/dist/10.1/centos7/base/Dockerfile '''d0664fbbdb8c32356d45de36c5984617217b2d0bef41b93ccecd326ba3b80c87''' diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e77699a3f..d71fe18c4 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -39,6 +39,7 @@ repos: rev: v8.17.0 hooks: - id: gitleaks + exclude: .github/workflows/reusable-actions/set-dummy-conf.yml # Oracle copyright checker - repo: https://github.com/oracle-samples/oci-data-science-ai-samples/ rev: cbe0136f7aaffe463b31ddf3f34b0e16b4b124ff diff --git a/ads/common/auth.py b/ads/common/auth.py index e3aec78e6..5135459a4 100644 --- a/ads/common/auth.py +++ b/ads/common/auth.py @@ -416,7 +416,7 @@ def create_signer( >>> auth = ads.auth.create_signer(auth_type="security_token", config=config) # security token authentication created based on provided config """ if signer or signer_callable: - configuration = ads.telemetry.update_oci_client_config() + configuration = ads.telemetry.update_oci_client_config(config) if signer_callable: signer = signer_callable(**signer_kwargs) signer_dict = { @@ -479,7 +479,7 @@ def default_signer(client_kwargs: Optional[Dict] = None) -> Dict: """ auth_state = AuthState() if auth_state.oci_signer or auth_state.oci_signer_callable: - configuration = ads.telemetry.update_oci_client_config() + configuration = ads.telemetry.update_oci_client_config(auth_state.oci_config) signer = auth_state.oci_signer if auth_state.oci_signer_callable: signer_kwargs = auth_state.oci_signer_kwargs or {} diff --git a/ads/common/dsc_file_system.py b/ads/common/dsc_file_system.py index 92cc5cd90..ecd3de8ca 100644 --- a/ads/common/dsc_file_system.py +++ b/ads/common/dsc_file_system.py @@ -5,6 +5,7 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import ads import oci +import os import ipaddress from dataclasses import dataclass @@ -20,6 +21,7 @@ class DSCFileSystem: dest: str = None storage_type: str = None destination_directory_name: str = None + destination_path: str = None def update_to_dsc_model(self) -> dict: """Updates arguments to dsc model. 
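The ``destination_path`` field added above is paired, in the hunk that follows, with a helper that splits the user-supplied ``dest`` string into an optional parent path and a directory name. A minimal sketch of that splitting behaviour using plain ``os.path`` (illustration only; the ``split_dest`` name is hypothetical, the real logic lives in ``DSCFileSystem.get_destination_path_and_name``):

    import os

    def split_dest(dest: str):
        # "/opc/fss" -> ("/opc", "fss") : mount lands under /opc/fss
        # "/fss"     -> ("/", "fss")    : mount lands under /fss
        # "fss"      -> (None, "fss")   : path omitted, service falls back to /mnt/fss
        trimmed = dest.rstrip("/")
        return os.path.dirname(trimmed) or None, os.path.basename(trimmed)

    assert split_dest("/opc/fss") == ("/opc", "fss")
    assert split_dest("/fss") == ("/", "fss")
    assert split_dest("fss") == (None, "fss")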
@@ -47,6 +49,29 @@ def update_from_dsc_model(cls, dsc_model) -> dict: """ pass + @staticmethod + def get_destination_path_and_name(dest: str) -> (str, str): + """Gets the destination path and destination directory name from dest. + Example: + dir - "fss" & path - "/opc" : mount happens under "/opc/fss" + dir - "fss" & path - "/" : mount happens under "/fss" + dir - "fss" & path - omitted : mount happens under "/mnt/fss" (for backward compatibility) + + Parameters + ---------- + dest: str + The dest path to which to mount the file system. + + Returns + ------- + tuple + A tuple of destination path and destination directory name. + """ + return ( + os.path.dirname(dest.rstrip("/")) or None, # when destination path is omitted, oci api requires it to be None + os.path.basename(dest.rstrip("/")) + ) + @dataclass class OCIFileStorage(DSCFileSystem): @@ -65,8 +90,10 @@ def update_to_dsc_model(self) -> dict: dict: A dictionary of arguments. """ + path, directory_name = self.get_destination_path_and_name(self.dest) arguments = { - "destinationDirectoryName" : self.dest, + "destinationDirectoryName" : directory_name, + "destinationPath" : path, "storageType" : self.storage_type } @@ -177,10 +204,14 @@ def update_from_dsc_model(cls, dsc_model) -> dict: raise ValueError( "Missing parameter `destination_directory_name` from service. Check service log to see the error." ) + + dest = dsc_model.destination_directory_name + if dsc_model.destination_path: + dest = f"{dsc_model.destination_path.rstrip('/')}/{dsc_model.destination_directory_name}" return { "src" : f"{dsc_model.mount_target_id}:{dsc_model.export_id}", - "dest" : dsc_model.destination_directory_name + "dest" : dest } @dataclass @@ -189,8 +220,10 @@ class OCIObjectStorage(DSCFileSystem): storage_type: str = OBJECT_STORAGE_TYPE def update_to_dsc_model(self) -> dict: + path, directory_name = self.get_destination_path_and_name(self.dest) arguments = { - "destinationDirectoryName" : self.dest, + "destinationDirectoryName" : directory_name, + "destinationPath" : path, "storageType" : self.storage_type } src_list = self.src.split("@") @@ -220,9 +253,13 @@ def update_from_dsc_model(cls, dsc_model) -> dict: "Missing parameter `destination_directory_name` from service. Check service log to see the error." ) + dest = dsc_model.destination_directory_name + if dsc_model.destination_path: + dest = f"{dsc_model.destination_path.rstrip('/')}/{dsc_model.destination_directory_name}" + return { "src" : f"oci://{dsc_model.bucket}@{dsc_model.namespace}/{dsc_model.prefix or ''}", - "dest" : dsc_model.destination_directory_name + "dest" : dest } diff --git a/ads/common/oci_mixin.py b/ads/common/oci_mixin.py index 2d1ed5e80..6bbcc9c58 100644 --- a/ads/common/oci_mixin.py +++ b/ads/common/oci_mixin.py @@ -41,7 +41,7 @@ class MergeStrategy(Enum): MERGE = "merge" -class OCIModelNotExists(Exception): # pragma: no cover +class OCIModelNotExists(Exception): # pragma: no cover pass @@ -383,7 +383,7 @@ class OCIModelMixin(OCISerializableMixin): """ # Regex pattern matching the module name of an OCI model. - OCI_MODEL_PATTERN = r"oci.[^.]+\.models[\..*]?" + OCI_MODEL_PATTERN = r"(oci|feature_store_client).[^.]+\.models[\..*]?" 
# Constants CONS_COMPARTMENT_ID = "compartment_id" @@ -937,11 +937,11 @@ def get_work_request_response( return work_request_response def wait_for_progress( - self, - work_request_id: str, - num_steps: int = DEFAULT_WORKFLOW_STEPS, - max_wait_time: int = DEFAULT_WAIT_TIME, - poll_interval: int = DEFAULT_POLL_INTERVAL + self, + work_request_id: str, + num_steps: int = DEFAULT_WORKFLOW_STEPS, + max_wait_time: int = DEFAULT_WAIT_TIME, + poll_interval: int = DEFAULT_POLL_INTERVAL, ): """Waits for the work request progress bar to be completed. @@ -969,10 +969,10 @@ def wait_for_progress( seconds_since = time.time() - start_time exceed_max_time = max_wait_time > 0 and seconds_since >= max_wait_time if exceed_max_time: - logger.error( - f"Max wait time ({max_wait_time} seconds) exceeded." - ) - while not exceed_max_time and (not work_request_logs or len(work_request_logs) < num_steps): + logger.error(f"Max wait time ({max_wait_time} seconds) exceeded.") + while not exceed_max_time and ( + not work_request_logs or len(work_request_logs) < num_steps + ): time.sleep(poll_interval) new_work_request_logs = [] diff --git a/ads/dataset/sampled_dataset.py b/ads/dataset/sampled_dataset.py index 665957ebd..8d53715c0 100644 --- a/ads/dataset/sampled_dataset.py +++ b/ads/dataset/sampled_dataset.py @@ -47,13 +47,13 @@ OptionalDependency, ) +NATURAL_EARTH_DATASET = "naturalearth_lowres" class PandasDataset(object): """ This class provides APIs that can work on a sampled dataset. """ - @runtime_dependency(module="geopandas", install_from=OptionalDependency.GEO) def __init__( self, sampled_df, @@ -67,9 +67,7 @@ def __init__( self.correlation = None self.feature_dist_html_dict = {} self.feature_types = metadata if metadata is not None else {} - self.world = geopandas.read_file( - geopandas.datasets.get_path("naturalearth_lowres") - ) + self.world = None self.numeric_columns = self.sampled_df.select_dtypes( utils.numeric_pandas_dtypes() @@ -562,7 +560,7 @@ def plot_gis_scatter(self, lon="longitude", lat="latitude", ax=None): ), ) world = geopandas.read_file( - geopandas.datasets.get_path("naturalearth_lowres") + geopandas.datasets.get_path(NATURAL_EARTH_DATASET) ) ax1 = world.plot(ax=ax, color="lightgrey", linewidth=0.5, edgecolor="white") gdf.plot(ax=ax1, color="blue", markersize=10) @@ -706,6 +704,12 @@ def _visualize_feature_distribution(self, html_widget): gdf = geopandas.GeoDataFrame( df, geometry=geopandas.points_from_xy(df["lon"], df["lat"]) ) + + if not self.world: + self.world = geopandas.read_file( + geopandas.datasets.get_path(NATURAL_EARTH_DATASET) + ) + self.world.plot( ax=ax, color="lightgrey", linewidth=0.5, edgecolor="white" ) diff --git a/ads/feature_store/common/enums.py b/ads/feature_store/common/enums.py index b3f55c19a..0a88318bb 100644 --- a/ads/feature_store/common/enums.py +++ b/ads/feature_store/common/enums.py @@ -49,7 +49,20 @@ class DatasetIngestionMode(Enum): SQL = "SQL" -class IngestionMode(Enum): +class IngestionType(Enum): + """ + The type of ingestion that can be performed. + + Possible values: + * STREAMING: The data is ingested in real time. + * BATCH: The data is ingested in batches. + """ + + STREAMING = "STREAMING" + BATCH = "BATCH" + + +class BatchIngestionMode(Enum): """ An enumeration that represents the supported Ingestion Mode in feature store. @@ -68,6 +81,20 @@ class IngestionMode(Enum): UPSERT = "UPSERT" +class StreamingIngestionMode(Enum): + """ + Enumeration for stream ingestion modes. 
+ + - `COMPLETE`: Represents complete stream ingestion where the entire dataset is replaced. + - `APPEND`: Represents appending new data to the existing dataset. + - `UPDATE`: Represents updating existing data in the dataset. + """ + + COMPLETE = "COMPLETE" + APPEND = "APPEND" + UPDATE = "UPDATE" + + class JoinType(Enum): """Enumeration of supported SQL join types. @@ -214,6 +241,7 @@ class TransformationMode(Enum): SQL = "sql" PANDAS = "pandas" + SPARK = "spark" class FilterOperators(Enum): diff --git a/ads/feature_store/common/utils/transformation_utils.py b/ads/feature_store/common/utils/transformation_utils.py index 7a6445ccf..9f84f080a 100644 --- a/ads/feature_store/common/utils/transformation_utils.py +++ b/ads/feature_store/common/utils/transformation_utils.py @@ -69,7 +69,10 @@ def apply_transformation( temporary_table_view, **transformation_kwargs_dict ) ) - elif transformation.transformation_mode == TransformationMode.PANDAS.value: + elif transformation.transformation_mode in [ + TransformationMode.PANDAS.value, + TransformationMode.SPARK.value, + ]: transformed_data = transformation_function_caller( dataframe, **transformation_kwargs_dict ) diff --git a/ads/feature_store/dataset.py b/ads/feature_store/dataset.py index e505a8a76..ede461039 100644 --- a/ads/feature_store/dataset.py +++ b/ads/feature_store/dataset.py @@ -10,7 +10,7 @@ from great_expectations.core import ExpectationSuite from ads import deprecated -from oci.feature_store.models import ( +from feature_store_client.feature_store.models import ( DatasetFeatureGroupCollection, DatasetFeatureGroupSummary, ) @@ -21,6 +21,7 @@ ExecutionEngine, ExpectationType, EntityType, + BatchIngestionMode, ) from ads.feature_store.common.exceptions import NotMaterializedError from ads.feature_store.common.utils.utility import ( @@ -28,7 +29,7 @@ validate_delta_format_parameters, convert_expectation_suite_to_expectation, ) -from ads.feature_store.dataset_job import DatasetJob, IngestionMode +from ads.feature_store.dataset_job import DatasetJob from ads.feature_store.execution_strategy.engine.spark_engine import SparkEngine from ads.feature_store.execution_strategy.execution_strategy_provider import ( OciExecutionStrategyProvider, @@ -720,7 +721,7 @@ def create(self, validate_sql=False, **kwargs) -> "Dataset": ---------- kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.Dataset` accepts. + Can be any attribute that `feature_store.models.Dataset` accepts. validate_sql: Boolean value indicating whether to validate sql before creating dataset @@ -779,7 +780,7 @@ def delete(self): None """ # Create DataSet Job and persist it - dataset_job = self._build_dataset_job(IngestionMode.DEFAULT) + dataset_job = self._build_dataset_job(BatchIngestionMode.DEFAULT) # Create the Job dataset_job.create() @@ -821,7 +822,7 @@ def update(self, **kwargs) -> "Dataset": ---------- kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.Dataset` accepts. + Can be any attribute that `feature_store.models.Dataset` accepts. Returns ------- @@ -874,7 +875,7 @@ def _update_from_oci_dataset_model(self, oci_dataset: OCIDataset) -> "Dataset": def materialise( self, - ingestion_mode: IngestionMode = IngestionMode.OVERWRITE, + ingestion_mode: BatchIngestionMode = BatchIngestionMode.OVERWRITE, feature_option_details: FeatureOptionDetails = None, ): """Creates a dataset job. 
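The enum changes above split the old ``IngestionMode`` into ``BatchIngestionMode`` and ``StreamingIngestionMode`` and add a ``SPARK`` transformation mode that ``transformation_utils`` routes through the same branch as ``PANDAS``. A minimal sketch of the new names as they read at a call site (illustration only, assuming the module layout shown in this diff):

    from ads.feature_store.common.enums import (
        BatchIngestionMode,
        StreamingIngestionMode,
        TransformationMode,
    )

    # Batch ingestion keeps the original modes; Dataset.materialise() now defaults
    # to BatchIngestionMode.OVERWRITE and delete() uses BatchIngestionMode.DEFAULT.
    mode = BatchIngestionMode.OVERWRITE

    # Streaming ingestion gets its own modes (complete / append / update).
    assert [m.value for m in StreamingIngestionMode] == ["COMPLETE", "APPEND", "UPDATE"]

    # Spark transformations are dispatched like pandas ones: the transformation
    # function receives the dataframe itself rather than a temporary SQL view.
    assert TransformationMode.SPARK.value == "spark"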
diff --git a/ads/feature_store/dataset_job.py b/ads/feature_store/dataset_job.py index 3fc57206d..2b8deacb1 100644 --- a/ads/feature_store/dataset_job.py +++ b/ads/feature_store/dataset_job.py @@ -5,13 +5,17 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import logging from copy import deepcopy -from typing import Dict, List, Any +from typing import Dict, List, Any, Union import pandas from ads.common import utils +from ads.feature_store.common.enums import ( + JobConfigurationType, + BatchIngestionMode, + StreamingIngestionMode, +) from ads.feature_store.feature_option_details import FeatureOptionDetails -from ads.feature_store.common.enums import IngestionMode, JobConfigurationType from ads.feature_store.service.oci_dataset_job import OCIDatasetJob from ads.jobs.builders.base import Builder @@ -113,7 +117,7 @@ def _to_oci_fs_dataset_run(self, **kwargs): kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.DatasetJob` accepts. + Can be any attribute that `feature_store.models.DatasetJob` accepts. Returns ------- @@ -225,10 +229,14 @@ def ingestion_mode(self) -> str: return self.get_spec(self.CONST_INGESTION_MODE) @ingestion_mode.setter - def ingestion_mode(self, ingestion_mode: IngestionMode) -> "DatasetJob": + def ingestion_mode( + self, ingestion_mode: Union[BatchIngestionMode, StreamingIngestionMode] + ) -> "DatasetJob": return self.with_ingestion_mode(ingestion_mode) - def with_ingestion_mode(self, ingestion_mode: IngestionMode) -> "DatasetJob": + def with_ingestion_mode( + self, ingestion_mode: Union[BatchIngestionMode, StreamingIngestionMode] + ) -> "DatasetJob": """Sets the mode of the dataset ingestion mode. Parameters @@ -337,7 +345,7 @@ def create(self, **kwargs) -> "DatasetJob": ---------- kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.DatasetJob` accepts. + Can be any attribute that `feature_store.models.DatasetJob` accepts. Returns ------- @@ -370,7 +378,7 @@ def update(self, **kwargs) -> "DatasetJob": ---------- kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.DatasetJob` accepts. + Can be any attribute that `feature_store.models.DatasetJob` accepts. Returns ------- diff --git a/ads/feature_store/docs/source/data_versioning.rst b/ads/feature_store/docs/source/data_versioning.rst new file mode 100644 index 000000000..62f585a77 --- /dev/null +++ b/ads/feature_store/docs/source/data_versioning.rst @@ -0,0 +1,33 @@ +Data Versioning +**************** + +Data versioning is a practice aimed at recording the various data commits integrated into a particular feature group and dataset. This involves tracking changes in data over time while maintaining consistent schema structures and feature definitions within a shared schema version. In the context of feature store, it's important to note that data versioning features are exclusively available for offline feature groups. + +.. image:: figures/data_versioning.png + + +As Of +====== + +You can call the ``as_of()`` method of the ``FeatureGroup`` or ``Dataset`` instance to get specified point in time and time traveled data. + +The ``.as_of()`` method takes the following optional parameter: + +- ``commit_timestamp: date-time``. Commit timestamp for feature group +- ``version_number: int``. Version number for feature group + +.. 
code-block:: python3 + + # as_of feature group + df = feature_group.as_of(version_number=1) + + +History +======= + +You can call the ``history()`` method of the ``FeatureGroup`` or ``Dataset`` instance to show history of the feature group. + +.. code-block:: python3 + + # Show history of feature group + df = feature_group.history() diff --git a/ads/feature_store/docs/source/dataset.rst b/ads/feature_store/docs/source/dataset.rst index 0fe9700af..3e6fc8ad2 100644 --- a/ads/feature_store/docs/source/dataset.rst +++ b/ads/feature_store/docs/source/dataset.rst @@ -1,349 +1,343 @@ -Dataset -******** - -A dataset is a collection of feature that are used together to either train a model or perform model inference. - -Define -====== - -In an ADS feature store module, you can either use the Python API or YAML to define a dataset. - - -With the specified way below, you can define a dataset and give it a name. -A ``Dataset`` instance will be created. - -.. tabs:: - - .. code-tab:: Python3 - :caption: Python - - from ads.feature_store.dataset import Dataset - - dataset = ( - Dataset - .with_name("") - .with_entity_id() - .with_feature_store_id("") - .with_description("") - .with_compartment_id("") - .with_dataset_ingestion_mode(DatasetIngestionMode.SQL) - .with_query('SELECT col FROM .') - ) - - .. code-tab:: Python3 - :caption: YAML - - from ads.feature_store.dataset import Dataset - - yaml_string = """ - kind: dataset - spec: - compartmentId: ocid1.compartment.. - description: - name: - featureStoreId: - type: dataset - """ - - dataset = Dataset.from_yaml(yaml_string) - - -Create -====== - -You can call the ``create()`` method of the ``Dataset`` instance to create an dataset. - -.. important:: - - This method is lazy and does not persist any metadata or feature data in the feature store on its own. - To persist the dataset and save feature data along the metadata in the feature store, call the ``materialise()`` - method with a DataFrame. - -.. code-block:: python3 - - # Create an dataset - dataset.create() - - -Load -==== - -Use the ``from_id()`` method from the ``Dataset`` class to load an existing dataset with its OCID provided. It returns a ``Dataset`` instance. - -.. code-block:: python3 - - from ads.feature_store.dataset import Dataset - - dataset = Dataset.from_id("ocid1.dataset..") - -Materialise -=========== - -You can call the ``materialise() -> DatasetJob`` method of the ``Dataset`` instance to load the data to dataset. To persist the dataset and save dataset data along the metadata in the feature store, call the ``materialise()`` - -The ``.materialise()`` method takes the following parameter: - -- ``input_dataframe: Union[DataFrame, pd.DataFrame]``. Spark dataframe or pandas dataframe. -- ``from_timestamp: str(Optional)``. From timestamp of dataset. -- ``to_timestamp: str(Optional)``. To timestamp of dataset. -- ``feature_option_details: FeatureOptionDetails(Optional)``. Feature option details for materialise operation. - - ``write_config_details: (merge_schema: bool, overwrite_schema: bool)``. Write config details for feature option details - - ``read_config_details: (version_as_of: int, timestamp_as_of: datetime)``. Read config details for feature option details - -.. code-block:: python3 - - from ads.feature_store.dataset_job import DatasetJob - - dataset_job: DatasetJob = dataset.materialise(dataframe) - -.. seealso:: - :ref:`Dataset Job` - - -Delete -====== - -Use the ``.delete()`` method on the ``Dataset`` instance to delete a dataset. 
- -A dataset can only be deleted when its associated entities are all deleted, - -.. code-block:: python3 - - dataset.delete() - -Get last dataset job -==================== -Dataset job is the execution instance of a dataset. Each dataset job will include validation results and statistics results. - -With a Dataset instance, we can get the last dataset job details using ``get_last_job()`` - -.. code-block:: python3 - - dataset_job = dataset.get_last_job() - -Save expectation entity -======================= -Feature store allows you to define expectations on data being materialized into dataset instance.With a ``Dataset`` instance, You can save the expectation details using ``with_expectation_suite()`` with parameters - -- ``expectation_suite: ExpectationSuite``. ExpectationSuit of great expectation -- ``expectation_type: ExpectationType``. Type of expectation - - ``ExpectationType.STRICT``: Fail the job if expectation not met - - ``ExpectationType.LENIENT``: Pass the job even if expectation not met - -.. note:: - - Great Expectations is a Python-based open-source library for validating, documenting, and profiling your data. It helps you to maintain data quality and improve communication about data between teams. Software developers have long known that automated testing is essential for managing complex codebases. - -.. image:: figures/validation.png - -.. code-block:: python3 - - expectation_suite = ExpectationSuite( - expectation_suite_name="expectation_suite_name" - ) - expectation_suite.add_expectation( - ExpectationConfiguration( - expectation_type="expect_column_values_to_not_be_null", - kwargs={"column": ""}, - ) - - dataset_resource = ( - Dataset() - .with_description("dataset description") - .with_compartment_id() - .with_name() - .with_entity_id(entity_id) - .with_feature_store_id(feature_store_id) - .with_query(f"SELECT * FROM `{entity_id}`.{feature_group_name}") - .with_expectation_suite( - expectation_suite=expectation_suite, - expectation_type=ExpectationType.STRICT, - ) - ) - -You can call the ``get_validation_output()`` method of the Dataset instance to fetch validation results for a specific ingestion job. -The ``get_validation_output()`` method takes the following optional parameter: - -- ``job_id: string``. Id of dataset job - -``get_validation_output().to_pandas()`` will output the validation results for each expectation as pandas dataframe - -.. image:: figures/dataset_validation_results.png - -``get_validation_output().to_summary()`` will output the overall summary of validation as pandas dataframe. - -.. image:: figures/dataset_validation_summary.png - -.. seealso:: - - :ref:`Feature Validation` - -Statistics Computation -======================== -During the materialization feature store performs computation of statistical metrics for all the features by default. This can be configured using ``StatisticsConfig`` object which can be passed at the creation of -dataset or it can be updated later as well. - -.. code-block:: python3 - - # Define statistics configuration for selected features - stats_config = StatisticsConfig().with_is_enabled(True).with_columns(["column1", "column2"]) - - -This can be used with dataset instance. - -.. 
code-block:: python3 - - from ads.feature_store.dataset import Dataset - - dataset = ( - Dataset - .with_name("") - .with_entity_id() - .with_feature_store_id("") - .with_description("") - .with_compartment_id("") - .with_dataset_ingestion_mode(DatasetIngestionMode.SQL) - .with_query('SELECT col FROM .') - .with_statistics_config(stats_config) - ) - -You can call the ``get_statistics()`` method of the dataset to fetch metrics for a specific ingestion job. - -The ``get_statistics()`` method takes the following optional parameter: - -- ``job_id: string``. Id of dataset job - -.. code-block:: python3 - - # Fetch stats results for a dataset job - df = dataset.get_statistics(job_id).to_pandas() - -.. image:: figures/dataset_statistics.png - -.. code-block:: python3 - - # Fetch and visualize stats for a dataset job - df = dataset.get_statistics(job_id).to_viz() - -.. image:: figures/dataset_statistics_viz.png - - -.. seealso:: - - :ref:`Statistics` - - -Get features -============ -You can call the ``get_features_df()`` method of the Dataset instance to fetch features in a dataset. - -.. code-block:: python3 - - # Fetch features for a dataset - df = dataset.get_features_df() - df.show() - - -Preview -======== - -.. deprecated:: 1.0.3 - Use :func:`as_of` instead. - -You can call the ``preview()`` method of the Dataset instance to preview the dataset. - -The ``.preview()`` method takes the following optional parameter: -- ``timestamp: date-time``. Commit timestamp for dataset -- ``version_number: int``. Version number for dataset -- ``row_count: int``. Defaults to 10. Total number of row to return - -.. code-block:: python3 - - # Preview dataset - df = dataset.preview(row_count=50) - df.show() - -as_of -======= - -You can call the ``as_of()`` method of the Dataset instance to get specified point in time and time traveled data. - -The ``.as_of()`` method takes the following optional parameter: - -- ``commit_timestamp: date-time``. Commit timestamp for dataset -- ``version_number: int``. Version number for dataset - -.. code-block:: python3 - - # as_of feature group - df = dataset.as_of(version_number=1) - - -Restore -======= -You can call the ``restore()`` method of the Dataset instance to restore the dataset to a particular version and timestamp. - -The ``.restore()`` method takes the following optional parameter: -- ``timestamp: date-time``. Commit timestamp for dataset -- ``version_number: int``. Version number for dataset - -.. code-block:: python3 - - # Restore dataset to a particular version and timestamp - df = feature_group.restore(version_number=2) - df.show() - - -Profile -======= -You can call the ``profile()`` method of the Dataset instance to profile the dataset. - -.. code-block:: python3 - - # Profile dataset - df = dataset.profile() - df.show() - - -History -======= -You can call the ``history()`` method of the Dataset instance to show history of the dataset. - -.. code-block:: python3 - - # Show history of dataset - df = dataset.history() - df.show() - - -Visualize Lineage -================= - -Use the ``show()`` method on the ``Dataset`` instance to visualize the lineage of the dataset. - -The ``show()`` method takes the following optional parameter: - -- ``rankdir: (str, optional)``. Defaults to ``LR``. The allowed values are ``TB`` or ``LR``. This parameter is applicable only for ``graph`` mode and it renders the direction of the graph as either top to bottom (TB) or left to right (LR). - - -.. code-block:: python3 - - dataset.show() - -Below is an example of the output. - -.. 
figure:: figures/dataset_lineage.png - :width: 400 - - -Add Model Details -================= - -You can call the ``add_models()`` method of the Dataset instance to add model ids to dataset. -The ``.add_models()`` method takes the following parameter: - -- ``model_details: ModelDetails``. ModelDetails takes ``items: List[str]`` as parameter and model ids to be passed as items. - -.. code-block:: python3 - - dataset.add_models(ModelDetails().with_items([])) +Dataset +******** + +A dataset is a collection of feature that are used together to either train a model or perform model inference. + +Define +====== + +In an ADS feature store module, you can use the Python API or a yaml file to define a dataset. + + +The following example defines a dataset and gives it a name. A ``Dataset`` instance is created. + +.. tabs:: + + .. code-tab:: Python3 + :caption: Python + + from ads.feature_store.dataset import Dataset + + dataset = ( + Dataset + .with_name("") + .with_entity_id() + .with_feature_store_id("") + .with_description("") + .with_compartment_id("") + .with_dataset_ingestion_mode(DatasetIngestionMode.SQL) + .with_query('SELECT col FROM .') + ) + + .. code-tab:: Python3 + :caption: YAML + + from ads.feature_store.dataset import Dataset + + yaml_string = """ + kind: dataset + spec: + compartmentId: ocid1.compartment.. + description: + name: + featureStoreId: + type: dataset + """ + + dataset = Dataset.from_yaml(yaml_string) + + +Create +====== + +Use the the ``create()`` method of the ``Dataset`` instance to create an dataset. + +.. important:: + + This method doesn’t persist any metadata or feature data in the feature store. To persist the dataset and save feature data including the metadata in the feature store, use the ``materialise()`` method with a dataframe. + +.. code-block:: python3 + + # Create an dataset + dataset.create() + + +Load +==== + +Use the ``from_id()`` method from the ``Dataset`` class to load an existing dataset by specifying the OCID. A ``Dataset`` instance is returned. + +.. code-block:: python3 + + from ads.feature_store.dataset import Dataset + + dataset = Dataset.from_id("") + +Materialize +=========== + +Use the the ``materialise() -> DatasetJob`` method of the ``Dataset`` instance to load the data to dataset. To persist the dataset and save dataset data, including the metadata in the feature store, use ``materialise()``. + +The ``.materialise()`` method has the following parameters: + +- ``input_dataframe: Union[DataFrame, pd.DataFrame]``. Spark dataframe or Pandas dataframe. +- ``from_timestamp: str(Optional)``. From timestamp of dataset. +- ``to_timestamp: str(Optional)``. To timestamp of dataset. +- ``feature_option_details: FeatureOptionDetails(Optional)``. Feature option details for materialize operation. + - ``write_config_details: (merge_schema: bool, overwrite_schema: bool)``. Write configuration details for feature option details. + - ``read_config_details: (version_as_of: int, timestamp_as_of: datetime)``. Read configuration details for feature option details. + +.. code-block:: python3 + + from ads.feature_store.dataset_job import DatasetJob + + dataset_job: DatasetJob = dataset.materialise(dataframe) + +.. seealso:: + :ref:`Dataset Job` + + +Delete +====== + +Use the ``.delete()`` method on the ``Dataset`` instance to delete a dataset. A dataset can only be deleted when its associated entities are all deleted, + +.. 
code-block:: python3 + + dataset.delete() + +Get Last Dataset Job +==================== +A dataset job is the processing instance of a dataset. Each dataset job includes validation and statistics results. + +With a ``Dataset`` instance, you can get the last dataset job details using ``get_last_job()``. + +.. code-block:: python3 + + dataset_job = dataset.get_last_job() + +Save Expectation Entity +======================= +Feature store allows you to define expectations on data being materialized into dataset instance. With a ``Dataset`` instance, save the expectation details using ``with_expectation_suite()`` with the following parameters: + +- ``expectation_suite: ExpectationSuite``. ``ExpectationSuite`` of the great expectation library. +- ``expectation_type: ExpectationType``. Type of expectation. + - ``ExpectationType.STRICT``: Fail the job if the expectation isn't met. + - ``ExpectationType.LENIENT``: Pass the job even if the expectation isn't met. + +.. note:: + + `Great Expectations `_ is an open source Python-based library that validates, documents, and profiles data. It automates testing, which is essential for managing complex code bases. + +.. image:: figures/validation.png + +.. code-block:: python3 + + expectation_suite = ExpectationSuite( + expectation_suite_name="expectation_suite_name" + ) + expectation_suite.add_expectation( + ExpectationConfiguration( + expectation_type="expect_column_values_to_not_be_null", + kwargs={"column": ""}, + ) + + dataset_resource = ( + Dataset() + .with_description("dataset description") + .with_compartment_id() + .with_name() + .with_entity_id(entity_id) + .with_feature_store_id(feature_store_id) + .with_query(f"SELECT * FROM `{entity_id}`.{feature_group_name}") + .with_expectation_suite( + expectation_suite=expectation_suite, + expectation_type=ExpectationType.STRICT, + ) + ) + +Use the the ``get_validation_output()`` method of the dataset instance to fetch validation results for a specific ingestion job. +The ``get_validation_output()`` method has the following optional parameter: + +- ``job_id: string``. ID of dataset job. + +``get_validation_output().to_pandas()`` Outputs the validation results for each expectation as a Pandas dataframe. + +.. image:: figures/dataset_validation_results.png + +``get_validation_output().to_summary()`` Outputs the overall summary of the validation as a Pandas dataframe. + +.. image:: figures/dataset_validation_summary.png + +.. seealso:: + + :ref:`Feature Validation` + +Statistics Computation +======================== +During the materialization, feature store performs computation of statistical metrics for all the features by default. The computation is configured using a ``StatisticsConfig`` object, which is passed at the creation of the dataset, or it can be updated later. + +.. code-block:: python3 + + # Define statistics configuration for selected features + from ads.feature_store.statistics_config import StatisticsConfig + stats_config = StatisticsConfig().with_is_enabled(True).with_columns(["column1", "column2"]) + +This is used with ``Dataset`` instance. + +.. 
code-block:: python3 + + from ads.feature_store.dataset import Dataset + + dataset = ( + Dataset + .with_name("") + .with_entity_id() + .with_feature_store_id("") + .with_description("") + .with_compartment_id("") + .with_dataset_ingestion_mode(DatasetIngestionMode.SQL) + .with_query('SELECT col FROM .') + .with_statistics_config(stats_config) + ) + +Use the the ``get_statistics()`` method of the ``Dataset`` instance to fetch metrics for a specific ingestion job. + +The ``get_statistics()`` method has the following optional parameter: + +- ``job_id: string``. ID of the dataset job. + +.. code-block:: python3 + + # Fetch stats results for a dataset job + df = dataset.get_statistics(job_id).to_pandas() + +.. image:: figures/dataset_statistics.png + +.. code-block:: python3 + + # Fetch and visualize stats for a dataset job + df = dataset.get_statistics(job_id).to_viz() + +.. image:: figures/dataset_statistics_viz.png + + +.. seealso:: + + :ref:`Statistics` + + +Get features +============ +Use the the ``get_features_df()`` method of the ``Dataset`` instance to fetch features in a dataset. + +.. code-block:: python3 + + # Fetch features for a dataset + df = dataset.get_features_df() + df.show() + + +Preview +======== + +.. deprecated:: 1.0.3 + Use :func:`as_of` instead. + +Use the the ``preview()`` method of the ``Dataset`` instance to preview the dataset. + +The ``.preview()`` method has the following optional parameters: +- ``timestamp: date-time``. Commit timestamp for the dataset. +- ``version_number: int``. Version number for the dataset. +- ``row_count: int``. Defaults to 10. Total number of rows to return. + +.. code-block:: python3 + + # Preview dataset + df = dataset.preview(row_count=50) + df.show() + +as_of +======= + +Use the the ``as_of()`` method of the ``Dataset`` instance to get a specified point in time and time traveled data. + +The ``.as_of()`` method has the following optional parameters: + +- ``commit_timestamp: date-time``. Commit timestamp for the dataset. +- ``version_number: int``. Version number for the dataset. + +.. code-block:: python3 + + # as_of feature group + df = dataset.as_of(version_number=1) + + +Restore +======= +Use the the ``restore()`` method of the ``Dataset`` instance to restore the dataset to a particular version and timestamp. + +The ``.restore()`` method has the following optional parameters: +- ``timestamp: date-time``. Commit timestamp for the dataset. +- ``version_number: int``. Version number for the dataset. + +.. code-block:: python3 + + # Restore the dataset to a particular version and timestamp + df = feature_group.restore(version_number=2) + df.show() + + +Profile +======= +Use the the ``profile()`` method of the ``Dataset`` instance to profile the dataset. + +.. code-block:: python3 + + # Profile dataset + df = dataset.profile() + df.show() + + +History +======= +Use the the ``history()`` method of the ``Dataset`` instance to show history of the dataset. + +.. code-block:: python3 + + # Show history of dataset + df = dataset.history() + df.show() + + +Visualize Lineage +================= + +Use the ``show()`` method on the ``Dataset`` instance to visualize the lineage of the dataset. + +The ``show()`` method gas the following optional parameter: + +- ``rankdir: (str, optional)``. Defaults to ``LR``. The allowed values are ``TB`` or ``LR``. This parameter is applicable only for ``graph`` mode and it renders the direction of the graph as either top to bottom (TB) or left to right (LR). + + +.. 
code-block:: python3 + + dataset.show() + +The following is an example of the output: + +.. figure:: figures/dataset_lineage.png + :width: 400 + + +Add Model Details +================= + +Use the the ``add_models()`` method of the ``Dataset`` instance to add model IDs to the dataset. +The ``.add_models()`` method has the following parameter: + +- ``model_details: ModelDetails``. Provide ``items: List[str]`` as parameter and model IDs are passed as items. + +.. code-block:: python3 + + dataset.add_models(ModelDetails().with_items([])) diff --git a/ads/feature_store/docs/source/dataset_job.rst b/ads/feature_store/docs/source/dataset_job.rst index 6a2c4968e..65d106f9e 100644 --- a/ads/feature_store/docs/source/dataset_job.rst +++ b/ads/feature_store/docs/source/dataset_job.rst @@ -1,81 +1,78 @@ -.. _Dataset Job: - -Dataset Job -*********** - -Dataset job is the execution instance of a dataset. Each dataset job will include validation results and statistics results. - -Define -====== - -In an ADS feature store module, you can either use the Python API or YAML to define a dataset job. - - -With the specified way below, you can define a dataset_job and give it a name. -A ``DatasetJob`` instance will be created. - -.. tabs:: - - .. code-tab:: Python3 - :caption: Python - - from ads.feature_store.dataset_job import DatasetJob - - dataset_job = ( - DatasetJob - .with_name("") - .with_feature_store_id("") - .with_description("") - .with_compartment_id("") - ) - - .. code-tab:: Python3 - :caption: YAML - - from ads.feature_store.dataset_job import DatasetJob - - yaml_string = """ - kind: dataset_job - spec: - compartmentId: ocid1.compartment.. - description: - name: - featureStoreId: - type: dataset_job - """ - - dataset_job = DatasetJob.from_yaml(yaml_string) - - -Create -====== - -You can call the ``create()`` method of the ``DatasetJob`` instance to create an dataset job. - -.. code-block:: python3 - - # Create an dataset_job - dataset_job.create() - - -Load -==== - -Use the ``from_id()`` method from the ``DatasetJob`` class to load an existing dataset job with its OCID provided. It returns a ``DatasetJob`` instance. - -.. code-block:: python3 - - from ads.feature_store.dataset_job import DatasetJob - - dataset_job = DatasetJob.from_id("ocid1.dataset_job..") - -Delete -====== - -Use the ``.delete()`` method on the ``DatasetJob`` instance to delete a dataset job. - -A dataset_job can only be deleted when its associated entities are all deleted, - -.. code-block:: python3 - - dataset_job.delete() +.. _Dataset Job: + +Dataset Job +*********** + +A dDataset job is the processing instance of a dataset. Each dataset job includes validation and statistics results. + +Define +====== + +In an ADS feature store module, you can use the Python API or a yaml file to define a dataset job. + + +The following example defines a dataset job and gives it a name. A ``DatasetJob`` instance is created. + +.. tabs:: + + .. code-tab:: Python3 + :caption: Python + + from ads.feature_store.dataset_job import DatasetJob + + dataset_job = ( + DatasetJob + .with_name("") + .with_feature_store_id("") + .with_description("") + .with_compartment_id("") + ) + + .. code-tab:: Python3 + :caption: YAML + + from ads.feature_store.dataset_job import DatasetJob + + yaml_string = """ + kind: dataset_job + spec: + compartmentId: ocid1.compartment.. 
+ description: + name: + featureStoreId: + type: dataset_job + """ + + dataset_job = DatasetJob.from_yaml(yaml_string) + + +Create +====== + +Use the ``create()`` method of the ``DatasetJob`` instance to create a dataset job. + +.. code-block:: python3 + + # Create an dataset_job + dataset_job.create() + + +Load +==== + +Use the ``from_id()`` method from the ``DatasetJob`` class to load an existing dataset job by specifying its OCID. A``DatasetJob`` instance is returned. + +.. code-block:: python3 + + from ads.feature_store.dataset_job import DatasetJob + + dataset_job = DatasetJob.from_id("") + +Delete +====== + +Use the ``.delete()`` method on the ``DatasetJob`` instance to delete a dataset job. A dataset_job can only be deleted when its associated entities are all deleted, + +.. code-block:: python3 + + dataset_job.delete() diff --git a/ads/feature_store/docs/source/demo.rst b/ads/feature_store/docs/source/demo.rst index 819a41062..22714c3fa 100644 --- a/ads/feature_store/docs/source/demo.rst +++ b/ads/feature_store/docs/source/demo.rst @@ -1,41 +1,41 @@ -====== -Demos -====== - -.. admonition:: Examples - :class: note - - .. list-table:: - :widths: 50 50 - :header-rows: 1 - - * - Demo - - Description - - * - `Feature store overview `__ - - | 1. More about `conda environment `__ - | 2. Feature store overview using flights data - | 3. UI exploration of feature store - - * - `Querying feature store using pandas like interface `__ - - | 1. More about `conda environment `__ - | 2. Pandas like interface for feature store - | 3. Querying for complex SQL queries - | 4. Validation of query syntax - - * - `UI exploration for feature store `__ - - | 1. More about `conda environment `__ - | 2. UI exploration of feature store, feature groups and dataset - - * - `Feature Store Stats and Validation `__ - - | 1. More about `conda environment `__ - | 2. Feature Store Statistics and Validation - - * - | 1. `Deploy feature store stack using terraform for admin users `__ - | 2. `Deploy feature store stack using terraform for non admin users `__ - | 3. `Deploy feature store stack using terraform BYODB ATP instance `__ - | 4. `Deploy feature store stack using terraform BYODB MySQL instance `__ - - | 1. Setting up feature store using terraform stack for admin user - | 2. Setting up feature store using terraform stack for non admin user - | 3. Setting up feature store using terraform stack using customer ATP instance - | 4. Setting up feature store using terraform stack MySQL instance +====== +Demos +====== + +.. admonition:: Examples + :class: note + + .. list-table:: + :widths: 50 50 + :header-rows: 1 + + * - Demo + - Description + + * - `Feature Store Overview `__. + - | 1. More about `conda environments `__ + | 2. Feature store overview using flights data. + | 3. UI exploration of feature store. + + * - `Querying Feature Store Using Pandas Like Interface `__. + - | 1. More about `conda environments `__. + | 2. Pandas like interface for feature store. + | 3. Querying for complex SQL queries. + | 4. Validation of query syntax. + + * - `UI Exploration for Feature Store `__. + - | 1. More about `conda environments `__. + | 2. UI exploration of feature store, feature groups, and datasets. + + * - `Feature Store Statistics and Validation `__. + - | 1. More about `conda environmenst `__. + | 2. Feature store statistics and validation. + + * - | 1. `Deploy feature store stack using terraform for admin users `__ + | 2. `Deploy feature store stack using terraform for non admin users `__ + | 3. 
`Deploy feature store stack using terraform BYODB ATP instance `__ + | 4. `Deploy feature store stack using terraform BYODB MySQL instance `__ + - | 1. Setting up feature store using terraform stack for administration user. + | 2. Setting up feature store using terraform stack for non-administration user. + | 3. Setting up feature store using terraform stack using an ATP instance. + | 4. Setting up feature store using terraform stack using a MySQL instance. diff --git a/ads/feature_store/docs/source/entity.rst b/ads/feature_store/docs/source/entity.rst index 447b1ebf9..6de5a4259 100644 --- a/ads/feature_store/docs/source/entity.rst +++ b/ads/feature_store/docs/source/entity.rst @@ -1,80 +1,77 @@ -Entity -******** - -An entity is a group of semantically related features. The first step a consumer of features would typically do when accessing the feature store service is to list the entities and the entities associated features. Another way to look at it is that an entity is an object or concept that is described by its features. Examples of entities could be customer, product, transaction, review, image, document, etc. - - -Define -====== - -In an ADS feature store module, you can either use the Python API or YAML to define a entity. - - -With the specified way below, you can define a entity and give it a name. -A ``Entity`` instance will be created. - -.. tabs:: - - .. code-tab:: Python3 - :caption: Python - - from ads.feature_store.entity import Entity - - entity = ( - Entity - .with_name("") - .with_feature_store_id("") - .with_description("") - .with_compartment_id("") - ) - - .. code-tab:: Python3 - :caption: YAML - - from ads.feature_store.entity import Entity - - yaml_string = """ - kind: entity - spec: - compartmentId: ocid1.compartment.. - description: - name: - featureStoreId: - type: entity - """ - - entity = Entity.from_yaml(yaml_string) - - -Create -====== - -You can call the ``create()`` method of the ``Entity`` instance to create an entity. - -.. code-block:: python3 - - # Create an entity - entity.create() - - -Load -==== - -Use the ``from_id()`` method from the ``Entity`` class to load an existing entity with its OCID provided. It returns a ``Entity`` instance. - -.. code-block:: python3 - - from ads.feature_store.entity import Entity - - entity = Entity.from_id("ocid1.entity..") - -Delete -====== - -Use the ``.delete()`` method on the ``Entity`` instance to delete a entity. - -A entity can only be deleted when its associated entities are all deleted, - -.. code-block:: python3 - - entity.delete() +Entity +******** + +An entity is a group of semantically related features. An entity is an object or concept that is described by its features. The first step when accessing a feature store is typically to list the entities and the entities associated features. Examples of entities are customer, product, transaction, review, image, document, and so on. + + +Define +====== + +In an ADS feature store module, you can use the Python API or a yaml file to define an entity. + + +The following example defines an entity and gives it a name. An ``Entity`` instance is created. + +.. tabs:: + + .. code-tab:: Python3 + :caption: Python + + from ads.feature_store.entity import Entity + + entity = ( + Entity + .with_name("") + .with_feature_store_id("") + .with_description("") + .with_compartment_id("") + ) + + .. code-tab:: Python3 + :caption: YAML + + from ads.feature_store.entity import Entity + + yaml_string = """ + kind: entity + spec: + compartmentId: ocid1.compartment.. 
+ description: + name: + featureStoreId: + type: entity + """ + + entity = Entity.from_yaml(yaml_string) + + +Create +====== + +Use the ``create()`` method of the ``Entity`` instance to create an entity. + +.. code-block:: python3 + + # Create an entity + entity.create() + + +Load +==== + +Use the ``from_id()`` method from the ``Entity`` class to load an existing entity by specifying its OCID. An ``Entity`` instance is returned. + +.. code-block:: python3 + + from ads.feature_store.entity import Entity + + entity = Entity.from_id("") + +Delete +====== + +Use the ``.delete()`` method on the ``Entity`` instance to delete a entity. A entity can only be deleted when its associated entities are all deleted, + +.. code-block:: python3 + + entity.delete() diff --git a/ads/feature_store/docs/source/feature_group.rst b/ads/feature_store/docs/source/feature_group.rst index 860280336..590816c44 100644 --- a/ads/feature_store/docs/source/feature_group.rst +++ b/ads/feature_store/docs/source/feature_group.rst @@ -1,555 +1,573 @@ -Feature Group -************* - -A feature group in a feature store is a collection of related features that are often used together in ml models. It serves as an organizational unit within the feature store for users to manage, version and share features across different ml projects. By organizing features into groups, data scientists and ml engineers can efficiently discover, reuse and collaborate on features reducing the redundant work and ensuring consistency in feature engineering. - -Define -====== - -In an ADS feature store module, you can either use the Python API or YAML to define a feature group. - - -With the specified way below, you can define a feature group and give it a name. -A ``FeatureGroup`` instance will be created. - -.. tabs:: - - .. code-tab:: Python3 - :caption: Python - - from ads.feature_store.feature_group import FeatureGroup - # Dictionary containing arguments for the feature group for the transformation function. - transformation_kwargs = {} - - feature_group_flights = ( - FeatureGroup() - .with_feature_store_id(feature_store.id) - .with_primary_keys(["col1"]) - .with_partition_keys(["col1", "col2"]) - .with_name("flights_feature_group") - .with_entity_id("") - .with_compartment_id("ocid1.compartment..") - .with_schema_details_from_dataframe(dataframe) - .with_transformation_kwargs(transformation_kwargs) - ) - - .. code-tab:: Python3 - :caption: YAML - - from ads.feature_store.feature_group import FeatureGroup - - yaml_string = """ - kind: FeatureGroup - spec: - compartmentId: ocid1.compartment.. - entityId: - featureStoreId: - id: - inputFeatureDetails: - - featureType: STRING - name: col1 - orderNumber: 1 - - featureType: STRING - name: col2 - orderNumber: 2 - name: - primaryKeys: - items: - - name: col1 - partitionKeys: - items: - - name: col1 - statisticsConfig: - isEnabled: true - type: featureGroup - """ - - feature_group = FeatureGroup.from_yaml(yaml_string) - - -Create -====== - -You can call the ``create()`` method of the ``FeatureGroup`` instance to create a feature group. - -.. important:: - - This method is lazy and does not persist any metadata or feature data in the feature store on its own. - To persist the feature group and save feature data along the metadata in the feature store, call the ``materialise()`` - method with a DataFrame. - -.. 
code-block:: python3 - - # Create a feature group - feature_group.create() - - -Load -==== - -Use the ``from_id()`` method from the ``FeatureGroup`` class to load an existing feature group with its OCID provided. It returns a ``FeatureGroup`` instance. - -.. code-block:: python3 - - from ads.feature_store.feature_group import FeatureGroup - - feature_group = FeatureGroup.from_id("ocid1.feature_group..") - - -Materialise -=========== - -You can call the ``materialise() -> FeatureGroupJob`` method of the ``FeatureGroup`` instance to load the data to feature group. To persist the feature_group and save feature_group data along the metadata in the feature store, call the ``materialise()`` - -The ``.materialise()`` method takes the following parameter: - -- ``input_dataframe: Union[DataFrame, pd.DataFrame]``. Spark dataframe or pandas dataframe. -- ``from_timestamp: str(Optional)``. From timestamp of feature group. -- ``to_timestamp: str(Optional)``. To timestamp of feature group. -- ``feature_option_details: FeatureOptionDetails(Optional)``. Feature option details for materialise operation. - - ``write_config_details: (merge_schema: bool, overwrite_schema: bool)``. Write config details for feature option details - - ``read_config_details: (version_as_of: int, timestamp_as_of: datetime)``. Read config details for feature option details - -.. code-block:: python3 - - from ads.feature_store.feature_group_job import FeatureGroupJob - - feature_group_job: FeatureGroupJob = feature_group.materialise(dataframe) - -.. seealso:: - :ref:`Feature Group Job` - -.. seealso:: - Refer :ref:`Data types` supported by feature store - -Delete -====== - -Use the ``.delete()`` method on the ``FeatureGroup`` instance to delete a feature group. - -A feature group can only be deleted when its associated entities are all deleted, - -.. code-block:: python3 - - feature_group.delete() - -Select -====== -You can call the ``select()`` method of the FeatureGroup instance to return ``Query`` interface. ``Query`` interface can be used to join and filter on the feature group. - -Feature store provides an API similar to Pandas to join feature groups together and to select features from different feature groups. This easies the way you can write the query by selecting all/subset of features from a feature group and join them with all/subset of features of another feature group. - -.. code-block:: python3 - - # Select all columns of feature group - feature_group.select().show() - - # Select subset columns of feature group - feature_group.select(['col1', 'col2']).show() - - # Filter feature groups - feature_group.filter(feature_group.col1 == 0).show() - - - # Join feature groups - query = feature_group_a.select()\ - .join(feature_group_b.select(), left_on=['a_1'], right_on=['b_1'])\ - .join(feature_group_c.select(), left_on=['b_1'], right_on=['c_1']) - query.show(5) - -Save expectation entity -======================= -With a ``FeatureGroup`` instance, You can save the expectation details using ``with_expectation_suite()`` with parameters - -- ``expectation_suite: ExpectationSuite``. ExpectationSuit of great expectation -- ``expectation_type: ExpectationType``. Type of expectation - - ``ExpectationType.STRICT``: Fail the job if expectation not met - - ``ExpectationType.LENIENT``: Pass the job even if expectation not met - -.. note:: - - Great Expectations is a Python-based open-source library for validating, documenting, and profiling your data. 
It helps you to maintain data quality and improve communication about data between teams. Software developers have long known that automated testing is essential for managing complex codebases. - -.. image:: figures/validation.png - -.. code-block:: python3 - - expectation_suite = ExpectationSuite( - expectation_suite_name="expectation_suite_name" - ) - expectation_suite.add_expectation( - ExpectationConfiguration( - expectation_type="expect_column_values_to_not_be_null", - kwargs={"column": ""}, - ) - - feature_group_resource = ( - FeatureGroup() - .with_feature_store_id(feature_store.id) - .with_primary_keys([""]) - .with_name("") - .with_entity_id(entity.id) - .with_compartment_id() - .with_schema_details_from_dataframe() - .with_expectation_suite( - expectation_suite=expectation_suite, - expectation_type=ExpectationType.STRICT, - ) - ) - -You can call the ``get_validation_output()`` method of the FeatureGroup instance to fetch validation results for a specific ingestion job. -The ``get_validation_output()`` method takes the following optional parameter: - -- ``job_id: string``. Id of feature group job -``get_validation_output().to_pandas()`` will output the validation results for each expectation as pandas dataframe - -.. image:: figures/validation_results.png - -``get_validation_output().to_summary()`` will output the overall summary of validation as pandas dataframe. - -.. image:: figures/validation_summary.png -.. seealso:: - - :ref:`Feature Validation` - - -Statistics Computation -======================== -During the materialization feature store performs computation of statistical metrics for all the features by default. This can be configured using ``StatisticsConfig`` object which can be passed at the creation of -feature group or it can be updated later as well. - -.. code-block:: python3 - - # Define statistics configuration for selected features - stats_config = StatisticsConfig().with_is_enabled(True).with_columns(["column1", "column2"]) - - -This can be used with feature group instance. - -.. code-block:: python3 - - # Fetch stats results for a feature group job - from ads.feature_store.feature_group import FeatureGroup - - feature_group_resource = ( - FeatureGroup() - .with_feature_store_id(feature_store.id) - .with_primary_keys([""]) - .with_name("") - .with_entity_id(entity.id) - .with_compartment_id() - .with_schema_details_from_dataframe() - .with_statistics_config(stats_config) - -You can call the ``get_statistics()`` method of the feature group to fetch metrics for a specific ingestion job. - -The ``get_statistics()`` method takes the following optional parameter: - -- ``job_id: string``. Id of feature group job - -.. code-block:: python3 - - # Fetch stats results for a feature group job - df = feature_group.get_statistics(job_id).to_pandas() - -.. image:: figures/stats_1.png - -.. code-block:: python3 - - # Fetch and visualize stats for a dataset job - df = feature_group.get_statistics(job_id).to_viz() - -.. image:: figures/feature_group_statistics_viz.png - -.. seealso:: - - :ref:`Statistics` - -Get last feature group job -========================== -Feature group job is the execution instance of a feature group. Each feature group job will include validation results and statistics results. - -With a FeatureGroup instance, we can get the last feature group job details using ``get_last_job()`` - -.. 
code-block:: python3 - - # Fetch validation results for a feature group - feature_group_job = feature_group.get_last_job() - -Get features -============= -You can call the ``get_features_df`` method of the FeatureGroup instance to fetch features in a feature group - -.. code-block:: python3 - - # Fetch features for a feature group - df = feature_group.get_features_df() - - -Filter -====== -You can call the ``filter()`` method of the FeatureGroup instance to return ``Query`` interface. ``Query`` interface can be used to join and filter on the feature group or a set of feature groups. - -Feature store provides an API similar to Pandas to join feature groups together and to select features from different feature groups. This easies the way you can write the query by selecting all/subset of features from a feature group and join them with all/subset of features of another feature group. - -.. code-block:: python3 - - # Filter feature group - feature_group.filter(feature_group.col1 > 10).show() - - -Preview -======= - -.. deprecated:: 1.0.3 - Use :func:`as_of` instead. - -You can call the ``preview()`` method of the FeatureGroup instance to preview the feature group. - -The ``.preview()`` method takes the following optional parameter: - -- ``timestamp: date-time``. Commit timestamp for feature group -- ``version_number: int``. Version number for feature group -- ``row_count: int``. Defaults to 10. Total number of row to return - -.. code-block:: python3 - - # Preview feature group - df = feature_group.preview(row_count=50) - -as_of -======= - -You can call the ``as_of()`` method of the FeatureGroup instance to get specified point in time and time traveled data. - -The ``.as_of()`` method takes the following optional parameter: - -- ``commit_timestamp: date-time``. Commit timestamp for feature group -- ``version_number: int``. Version number for feature group - -.. code-block:: python3 - - # as_of feature group - df = feature_group.as_of(version_number=1) - -Restore -======= - -You can call the ``restore()`` method of the FeatureGroup instance to restore the feature group to a particular version and timestamp. - -The ``.restore()`` method takes the following optional parameter: - -- ``timestamp: date-time``. Commit timestamp for feature group -- ``version_number: int``. Version number for feature group - -.. code-block:: python3 - - # Restore feature group to a particular version and timestamp - df = feature_group.restore(version_number=2) - -Profile -======= - -You can call the ``profile()`` method of the FeatureGroup instance to profile the feature group. - -.. code-block:: python3 - - # Profile feature group - df = feature_group.profile() - -History -======= - -You can call the ``history()`` method of the FeatureGroup instance to show history of the feature group. - -.. code-block:: python3 - - # Show history of feature group - df = feature_group.history() - -Visualize Lineage -================= - -Use the ``show()`` method on the ``FeatureGroup`` instance to visualize the lineage of the feature group. - -The ``show()`` method takes the following optional parameter: - - - ``rankdir: (str, optional)``. Defaults to ``LR``. The allowed values are ``TB`` or ``LR``. This parameter is applicable only for ``graph`` mode and it renders the direction of the graph as either top to bottom (TB) or left to right (LR). - - -.. code-block:: python3 - - feature_store.show() - -Below is an example of the output. - -.. figure:: figures/feature_group_lineage.png - :width: 400 - -.. 
_Data types: - -Data types -========== -The data will be stored in a data type native to each store. There is an option to automatically infer the datatype or specify the datatype explicitly. If the user specifies ``with_schema_details_from_dataframe``, the feature store service automatically infers the data types from the dataframe - -.. note:: - - Offline data types - ################### - Please refer to the following mapping when registering a Spark DataFrame, or a Pandas DataFrame.For spark dataframes we support - all the data types and the ones which are not specified in the following table will be mapped to Offline Feature Type COMPLEX - - .. list-table:: - :widths: 20 25 25 40 - :header-rows: 1 - - * - Spark Type - - Pandas Type - - Offline Feature Type - - Notes - * - BooleanType - - bool - - BOOLEAN - - True or False - * - ByteType - - uint8 - - INTEGER - - 8-bit integer - * - ShortType - - int16, Int16 - - INTEGER - - 16-bit integer - * - IntegerType - - int32 - - INTEGER - - 32-bit integer - * - LongType - - int64 - - INTEGER - - 64-bit integer - * - FloatType - - float32 - - FLOAT - - Floating-point values - * - DoubleType - - float64 - - FLOAT - - Double-precision floating-point values - * - DecimalType - - object(decimal) - - DECIMAL - - Fixed-point decimal numbers - * - TimestampType - - datetime64[ns] - - TIMESTAMP - - Timestamps - * - DateType - - datetime64[ns] - - TIMESTAMP - - Date values - * - StringType - - object - - STRING - - Textual data - * - ArrayType(IntegerType()) - - object (list), object (np.ndarray) - - INTEGER_ARRAY - - List of values - * - ArrayType(LongType()) - - object (list), object (np.ndarray) - - LONG_ARRAY - - List of values - * - ArrayType(FloatType()) - - object (list), object (np.ndarray) - - FLOAT_ARRAY - - List of values - * - ArrayType(DoubleType()) - - object (list), object (np.ndarray) - - DOUBLE_ARRAY - - List of values - * - ArrayType(BinaryType()) - - object (list), object (np.ndarray) - not supported - - BINARY_ARRAY - - List of values - * - ArrayType(DateType()) - - object (list), object (np.ndarray) - - DATE_ARRAY - - List of values - * - ArrayType(TimestampType()) - - object (list), object (np.ndarray) - - TIMESTAMP_ARRAY - - List of values - * - StructType - - object - not supported - - STRUCT - - Structured data - * - BinaryType - - object(bytes) - not supported - - BINARY - - Binary data - * - MapType(StringType(), StringType()) - - object - not supported - - STRING_STRING_MAP - - Key-value pairs - * - MapType(StringType(), IntegerType()) - - object - not supported - - STRING_INTEGER_MAP - - Key-value pairs - * - MapType(StringType(), ShortType()) - - object - not supported - - STRING_SHORT_MAP - - Key-value pairs - * - MapType(StringType(), LongType()) - - object - not supported - - STRING_LONG_MAP - - Key-value pairs - * - MapType(StringType(), FloatType()) - - object - not supported - - STRING_FLOAT_MAP - - Key-value pairs - * - MapType(StringType(), DoubleType()) - - object - not supported - - STRING_DOUBLE_MAP - - Key-value pairs - * - MapType(StringType(), TimestampType()) - - object - not supported - - STRING_TIMESTAMP_MAP - - Key-value pairs - * - MapType(StringType(), DateType()) - - object - not supported - - STRING_DATE_MAP - - Key-value pairs - - When it comes to Pandas dataframes, the initial step involves converting the dataframe into a Spark dataframe. This conversion is done using the default conversion mechanism provided by Spark, which may result in a less precise mapping between Python and Spark types. - - .. 
list-table:: - :widths: 30 20 40 - :header-rows: 1 - - * - Pandas Type - - Spark Type - - Notes - * - bool - - BooleanType - - - * - int8, uint8, int16, uint16, int32, int, uint32, int64 - - LongType - - - * - float, float16, float32, float64 - - DoubleType - - - * - object (decimal.decimal) - - DecimalType - - - * - datetime64[ns], datetime64[ns, tz] - - TimestampType - - Timestamps and Timezones - * - object (datetime.date) - - DateType - - - * - object (str), object(np.unicode) - - StringType - - - * - object (list), object (np.ndarray) - - - - Not supported +Feature Group +************* + +A feature group in a feature store is a collection of related features that are often used together in machine learning models. Feature groups serve as an organizational unit within the feature store to manage, version, and share features across different machine learning projects. By organizing features into groups, data scientists and machine learning engineers can efficiently discover, reuse, and collaborate on features. Feature groups reduce the redundant work and ensure consistency in feature engineering. + +Define +====== + +In an ADS feature store module, you can use the Python API or a yaml file to define a feature group. + + +The following example definse a feature group and gives it a name. A ``FeatureGroup`` instance is created. + +.. tabs:: + + .. code-tab:: Python3 + :caption: Python + + from ads.feature_store.feature_group import FeatureGroup + # Dictionary containing arguments for the feature group for the transformation function. + transformation_kwargs = {} + + feature_group_flights = ( + FeatureGroup() + .with_feature_store_id(feature_store.id) + .with_primary_keys(["col1"]) + .with_partition_keys(["col1", "col2"]) + .with_name("flights_feature_group") + .with_entity_id("") + .with_compartment_id("ocid1.compartment..") + .with_schema_details_from_dataframe(dataframe) + .with_transformation_kwargs(transformation_kwargs) + ) + + .. code-tab:: Python3 + :caption: YAML + + from ads.feature_store.feature_group import FeatureGroup + + yaml_string = """ + kind: FeatureGroup + spec: + compartmentId: ocid1.compartment.. + entityId: + featureStoreId: + id: + inputFeatureDetails: + - featureType: STRING + name: col1 + orderNumber: 1 + - featureType: STRING + name: col2 + orderNumber: 2 + name: + primaryKeys: + items: + - name: col1 + partitionKeys: + items: + - name: col1 + statisticsConfig: + isEnabled: true + type: featureGroup + """ + + feature_group = FeatureGroup.from_yaml(yaml_string) + + +Create +====== + +Use the ``create()`` method of the ``FeatureGroup`` instance to create a feature group. + +.. important:: + + This method doesn't persist any metadata or feature data in the feature store. To persist the feature group and save feature data, including the metadata in the feature store, use the ``materialise()`` method with a dataframe. + +.. code-block:: python3 + + # Create a feature group + feature_group.create() + + +Load +==== + +Use the ``from_id()`` method from the ``FeatureGroup`` class to load an existing feature group by specifying its OCID. A ``FeatureGroup`` instance is returned. + +.. code-block:: python3 + + from ads.feature_store.feature_group import FeatureGroup + + feature_group = FeatureGroup.from_id("") + + +Materialize +=========== + +Use the ``materialise() -> FeatureGroupJob`` method of the ``FeatureGroup`` instance to load the data to feature group. 
To persist the feature group and save feature group data, including the metadata in the feature store, use ``materialise()``. + +The ``.materialise()`` method has the following parameters: + +- ``input_dataframe: Union[DataFrame, pd.DataFrame]``. Spark dataframe or Pandas dataframe. +- ``from_timestamp: str(Optional)``. From timestamp of the feature group. +- ``to_timestamp: str(Optional)``. To timestamp of the feature group. +- ``feature_option_details: FeatureOptionDetails(Optional)``. Feature option details for the materialize operation. + - ``write_config_details: (merge_schema: bool, overwrite_schema: bool)``. Write configuration details for the feature option details. + - ``read_config_details: (version_as_of: int, timestamp_as_of: datetime)``. Read configuration details for the feature option details. + +.. code-block:: python3 + + from ads.feature_store.feature_group_job import FeatureGroupJob + + feature_group_job: FeatureGroupJob = feature_group.materialise(dataframe) + +.. seealso:: + For more information, see :ref:`Feature Group Job`. + +.. seealso:: + See :ref:`Data types` for the types supported by feature store. + +Materialise Stream +================== +You can call the ``materialise_stream() -> FeatureGroupJob`` method of the ``FeatureGroup`` instance to load the streaming data to feature group. To persist the feature_group and save feature_group data along the metadata in the feature store, call the ``materialise_stream()`` + +The ``.materialise_stream()`` method takes the following parameter: + - ``input_dataframe``: Features in Streaming Dataframe to be saved. + - ``query_name``: It is possible to optionally specify a name for the query to make it easier to recognise in the Spark UI. Defaults to ``None``. + - ``ingestion_mode``: Specifies how data of a streaming DataFrame/Dataset is written to a streaming sink. + - ``append``: Only the new rows in the streaming DataFrame/Dataset will be written to the sink. If the query doesn’t contain aggregations, it will be equivalent to append mode. Defaults to ``"append"``. + - ``complete``: All the rows in the streaming DataFrame/Dataset will be written to the sink every time there is some update. + - ``update``: only the rows that were updated in the streaming DataFrame/Dataset will be written to the sink every time there are some updates. + - ``await_termination``: Waits for the termination of this query, either by ``query.stop()`` or by an exception. If the query has terminated with an exception, then the exception will be thrown. If timeout is set, it returns whether the query has terminated or not within the timeout seconds. Defaults to ``False``. + - ``timeout``: Only relevant in combination with ``await_termination=True``. + - Defaults to ``None``. + - ``checkpoint_dir``: Checkpoint directory location. This will be used to as a reference to from where to resume the streaming job. Defaults to ``None``. + - ``write_options``: Additional write options for Spark as key-value pairs. + - Defaults to ``{}``. + +.. seealso:: + :ref:`Feature Group Job` + +.. seealso:: + Refer :ref:`Data types` supported by feature store + +Delete +====== + +Use the ``.delete()`` method on the ``FeatureGroup`` instance to delete a feature group. A feature group can only be deleted when its associated entities are all deleted. + +.. code-block:: python3 + + feature_group.delete() + +Select +====== +Use the ``select()`` method of the ``FeatureGroup`` instance to return the ``Query`` interface. 
The ``Query`` interface is used to join and filter on the feature group.
+
+Feature store provides an API similar to Pandas to join feature groups together, and to select features from different feature groups. Write the query by selecting all or a subset of features from a feature group, and then join them with all or a subset of features of another feature group.
+
+.. code-block:: python3
+
+    # Select all columns of feature group
+    feature_group.select().show()
+
+    # Select subset columns of feature group
+    feature_group.select(['col1', 'col2']).show()
+
+    # Filter feature groups
+    feature_group.filter(feature_group.col1 == 0).show()
+
+    # Join feature groups
+    query = feature_group_a.select()\
+        .join(feature_group_b.select(), left_on=['a_1'], right_on=['b_1'])\
+        .join(feature_group_c.select(), left_on=['b_1'], right_on=['c_1'])
+    query.show(5)
+
+Save Expectation Entity
+=======================
+Using a ``FeatureGroup`` instance, you can save the expectation details using ``with_expectation_suite()`` with the following parameters:
+
+- ``expectation_suite: ExpectationSuite``. ``ExpectationSuite`` of the Great Expectations library.
+- ``expectation_type: ExpectationType``. Type of expectation.
+
+  - ``ExpectationType.STRICT``: Fail the job if the expectation isn't met.
+  - ``ExpectationType.LENIENT``: Pass the job even if the expectation isn't met.
+
+.. note::
+
+  `Great Expectations `_ is an open source Python-based library that validates, documents, and profiles data. It automates testing, which is essential for managing complex code bases.
+
+.. image:: figures/validation.png
+
+.. code-block:: python3
+
+    from great_expectations.core.expectation_suite import ExpectationSuite
+    from great_expectations.core.expectation_configuration import ExpectationConfiguration
+
+    expectation_suite = ExpectationSuite(
+        expectation_suite_name="expectation_suite_name"
+    )
+    expectation_suite.add_expectation(
+        ExpectationConfiguration(
+            expectation_type="expect_column_values_to_not_be_null",
+            kwargs={"column": ""},
+        )
+    )
+
+    feature_group_resource = (
+        FeatureGroup()
+        .with_feature_store_id(feature_store.id)
+        .with_primary_keys([""])
+        .with_name("")
+        .with_entity_id(entity.id)
+        .with_compartment_id()
+        .with_schema_details_from_dataframe()
+        .with_expectation_suite(
+            expectation_suite=expectation_suite,
+            expectation_type=ExpectationType.STRICT,
+        )
+    )
+
+Use the ``get_validation_output()`` method of the ``FeatureGroup`` instance to fetch validation results for a specific ingestion job.
+The ``get_validation_output()`` method has the following optional parameter:
+
+- ``job_id: string``. ID of the feature group job.
+
+``get_validation_output().to_pandas()`` outputs the validation results for each expectation as a Pandas dataframe.
+
+.. image:: figures/validation_results.png
+
+``get_validation_output().to_summary()`` outputs the overall summary of validation as a Pandas dataframe.
+
+.. image:: figures/validation_summary.png
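+
+The following is a minimal sketch of how these calls can be used together. It assumes that a materialisation job has already completed and that, when ``job_id`` is omitted, the results of the most recent job are returned:
+
+.. code-block:: python3
+
+    # Per-expectation validation results as a Pandas dataframe
+    validation_df = feature_group.get_validation_output().to_pandas()
+
+    # Overall validation summary as a Pandas dataframe
+    summary_df = feature_group.get_validation_output().to_summary()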
+
+.. seealso::
+
+   :ref:`Feature Validation`
+
+
+Statistics Computation
+========================
+During the materialization, feature store performs computation of statistical metrics for all the features by default. Configure the computation using a ``StatisticsConfig`` object, which can be passed when the feature group is created or updated later.
+
+.. code-block:: python3
+
+    # Define statistics configuration for selected features
+    from ads.feature_store.statistics_config import StatisticsConfig
+    stats_config = StatisticsConfig().with_is_enabled(True).with_columns(["column1", "column2"])
+
+The following example uses the statistics configuration with a feature group instance.
+
+.. code-block:: python3
+
+    # Create a feature group with the statistics configuration
+    from ads.feature_store.feature_group import FeatureGroup
+
+    feature_group_resource = (
+        FeatureGroup()
+        .with_feature_store_id(feature_store.id)
+        .with_primary_keys([""])
+        .with_name("")
+        .with_entity_id(entity.id)
+        .with_compartment_id()
+        .with_schema_details_from_dataframe()
+        .with_statistics_config(stats_config)
+    )
+
+Use the ``get_statistics()`` method of the feature group to fetch metrics for a specific ingestion job.
+
+The ``get_statistics()`` method has the following optional parameter:
+
+- ``job_id: string``. ID of the feature group job.
+
+.. code-block:: python3
+
+    # Fetch stats results for a feature group job
+    df = feature_group.get_statistics(job_id).to_pandas()
+
+.. image:: figures/stats_1.png
+
+.. code-block:: python3
+
+    # Fetch and visualize stats for a feature group job
+    df = feature_group.get_statistics(job_id).to_viz()
+
+.. image:: figures/feature_group_statistics_viz.png
+
+.. seealso::
+
+   :ref:`Statistics`
+
+Get Last Feature Group Job
+==========================
+A feature group job is the processing instance of a feature group. Each feature group job includes validation and statistics results.
+
+Using a ``FeatureGroup`` instance, you can get the details of the last feature group job using ``get_last_job()``.
+
+.. code-block:: python3
+
+    # Fetch the last feature group job
+    feature_group_job = feature_group.get_last_job()
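+
+For example, the job returned by ``get_last_job()`` can be used to look up the statistics and validation output of that run. The sketch below assumes the returned job object exposes its OCID through an ``id`` attribute:
+
+.. code-block:: python3
+
+    # Inspect the results of the most recent materialisation run
+    last_job = feature_group.get_last_job()
+
+    # Assumption: the job object exposes its OCID as `id`
+    stats_df = feature_group.get_statistics(last_job.id).to_pandas()
+    validation_df = feature_group.get_validation_output(last_job.id).to_pandas()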
+
+Get Features
+=============
+Use the ``get_features_df()`` method of the ``FeatureGroup`` instance to fetch the features in a feature group.
+
+.. code-block:: python3
+
+    # Fetch features for a feature group
+    df = feature_group.get_features_df()
+
+
+Filter
+======
+Use the ``filter()`` method of the ``FeatureGroup`` instance to return the ``Query`` interface. The ``Query`` interface is used to join and filter on the feature group or a set of feature groups.
+
+Feature store provides an API similar to Pandas to join feature groups together, and to select features from different feature groups. Write the query by selecting all or a subset of features from a feature group, and then join them with all or a subset of features of another feature group.
+
+.. code-block:: python3
+
+    # Filter feature group
+    feature_group.filter(feature_group.col1 > 10).show()
+
+
+Preview
+=======
+
+.. deprecated:: 1.0.3
+   Use :func:`as_of` instead.
+
+Use the ``preview()`` method of the ``FeatureGroup`` instance to preview the feature group.
+
+The ``.preview()`` method has the following optional parameters:
+
+- ``timestamp: date-time``. Commit timestamp for the feature group.
+- ``version_number: int``. Version number for the feature group.
+- ``row_count: int``. Defaults to 10. Total number of rows to return.
+
+.. code-block:: python3
+
+    # Preview feature group
+    df = feature_group.preview(row_count=50)
+
+as_of
+=======
+
+Use the ``as_of()`` method of the ``FeatureGroup`` instance to retrieve the feature group data as it was at a specified point in time (time travel).
+
+The ``.as_of()`` method has the following optional parameters:
+
+- ``commit_timestamp: date-time``. Commit timestamp for the feature group.
+- ``version_number: int``. Version number for the feature group.
+
+.. code-block:: python3
+
+    # as_of feature group
+    df = feature_group.as_of(version_number=1)
+
+Restore
+=======
+
+Use the ``restore()`` method of the ``FeatureGroup`` instance to restore the feature group to a particular version and timestamp.
+
+The ``.restore()`` method has the following optional parameters:
+
+- ``timestamp: date-time``. Commit timestamp for the feature group.
+- ``version_number: int``. Version number for the feature group.
+
+.. code-block:: python3
+
+    # Restore feature group to a particular version and timestamp
+    df = feature_group.restore(version_number=2)
+
+Profile
+=======
+
+Use the ``profile()`` method of the ``FeatureGroup`` instance to profile the feature group.
+
+.. code-block:: python3
+
+    # Profile feature group
+    df = feature_group.profile()
+
+History
+=======
+
+Use the ``history()`` method of the ``FeatureGroup`` instance to show the history of the feature group.
+
+.. code-block:: python3
+
+    # Show history of feature group
+    df = feature_group.history()
+
+Visualize Lineage
+=================
+
+Use the ``show()`` method on the ``FeatureGroup`` instance to visualize the lineage of the feature group.
+
+The ``show()`` method has the following optional parameter:
+
+  - ``rankdir: (str, optional)``. Defaults to ``LR``. The allowed values are ``TB`` or ``LR``. This parameter is applicable only for ``graph`` mode, and it renders the direction of the graph as either top to bottom (TB) or left to right (LR).
+
+
+.. code-block:: python3
+
+    feature_group.show()
+
+The following is an example of the output:
+
+.. figure:: figures/feature_group_lineage.png
+   :width: 400
+
+.. _Data types:
+
+Data Types
+==========
+The data is stored in a data type native to each store. You can either let the feature store infer the data types or specify them explicitly. If you specify ``with_schema_details_from_dataframe``, the feature store automatically infers the data types from the dataframe.
+
+.. note::
+
+    Offline Data Types
+    ###################
+    The following table shows the mapping that is used when registering a Spark dataframe or a Pandas dataframe. For Spark dataframes, all data types are supported; types that aren't listed in the following table are mapped to the offline feature type COMPLEX.
+
+    .. 
list-table:: + :widths: 20 25 25 40 + :header-rows: 1 + + * - Spark Type + - Pandas Type + - Offline Feature Type + - Notes + * - BooleanType + - bool + - BOOLEAN + - True or False + * - ByteType + - uint8 + - INTEGER + - 8-bit integer + * - ShortType + - int16, Int16 + - INTEGER + - 16-bit integer + * - IntegerType + - int32 + - INTEGER + - 32-bit integer + * - LongType + - int64 + - INTEGER + - 64-bit integer + * - FloatType + - float32 + - FLOAT + - Floating-point values + * - DoubleType + - float64 + - FLOAT + - Double-precision floating-point values + * - DecimalType + - object(decimal) + - DECIMAL + - Fixed-point decimal numbers + * - TimestampType + - datetime64[ns] + - TIMESTAMP + - Timestamps + * - DateType + - datetime64[ns] + - TIMESTAMP + - Date values + * - StringType + - object + - STRING + - Textual data + * - ArrayType(IntegerType()) + - object (list), object (np.ndarray) + - INTEGER_ARRAY + - List of values + * - ArrayType(LongType()) + - object (list), object (np.ndarray) + - LONG_ARRAY + - List of values + * - ArrayType(FloatType()) + - object (list), object (np.ndarray) + - FLOAT_ARRAY + - List of values + * - ArrayType(DoubleType()) + - object (list), object (np.ndarray) + - DOUBLE_ARRAY + - List of values + * - ArrayType(BinaryType()) + - object (list), object (np.ndarray) - not supported + - BINARY_ARRAY + - List of values + * - ArrayType(DateType()) + - object (list), object (np.ndarray) + - DATE_ARRAY + - List of values + * - ArrayType(TimestampType()) + - object (list), object (np.ndarray) + - TIMESTAMP_ARRAY + - List of values + * - StructType + - object - not supported + - STRUCT + - Structured data + * - BinaryType + - object(bytes) - not supported + - BINARY + - Binary data + * - MapType(StringType(), StringType()) + - object - not supported + - STRING_STRING_MAP + - Key-value pairs + * - MapType(StringType(), IntegerType()) + - object - not supported + - STRING_INTEGER_MAP + - Key-value pairs + * - MapType(StringType(), ShortType()) + - object - not supported + - STRING_SHORT_MAP + - Key-value pairs + * - MapType(StringType(), LongType()) + - object - not supported + - STRING_LONG_MAP + - Key-value pairs + * - MapType(StringType(), FloatType()) + - object - not supported + - STRING_FLOAT_MAP + - Key-value pairs + * - MapType(StringType(), DoubleType()) + - object - not supported + - STRING_DOUBLE_MAP + - Key-value pairs + * - MapType(StringType(), TimestampType()) + - object - not supported + - STRING_TIMESTAMP_MAP + - Key-value pairs + * - MapType(StringType(), DateType()) + - object - not supported + - STRING_DATE_MAP + - Key-value pairs + + For Pandas dataframes, the initial step involves converting the dataframe into a Spark dataframe. This conversion uses the default conversion mechanism provided by Spark, which may result in a less precise mapping between Python and Spark types. + + .. 
list-table:: + :widths: 30 20 40 + :header-rows: 1 + + * - Pandas Type + - Spark Type + - Notes + * - bool + - BooleanType + - + * - int8, uint8, int16, uint16, int32, int, uint32, int64 + - LongType + - + * - float, float16, float32, float64 + - DoubleType + - + * - object (decimal.decimal) + - DecimalType + - + * - datetime64[ns], datetime64[ns, tz] + - TimestampType + - Timestamps and Timezones + * - object (datetime.date) + - DateType + - + * - object (str), object(np.unicode) + - StringType + - + * - object (list), object (np.ndarray) + - + - Not supported diff --git a/ads/feature_store/docs/source/feature_group_job.rst b/ads/feature_store/docs/source/feature_group_job.rst index d18757c40..dfd186578 100644 --- a/ads/feature_store/docs/source/feature_group_job.rst +++ b/ads/feature_store/docs/source/feature_group_job.rst @@ -1,81 +1,80 @@ -.. _Feature Group Job: - -Feature Group Job -***************** - -Feature group job is the execution instance of a dataset. Each feature group job will include validation results and statistics results. - -Define -====== - -In an ADS feature store module, you can either use the Python API or YAML to define a dataset job. - - -With the specified way below, you can define a feature_group_job and give it a name. -A ``FeatureGroupJob`` instance will be created. - -.. tabs:: - - .. code-tab:: Python3 - :caption: Python - - from ads.feature_store.feature_group_job import FeatureGroupJob - - feature_group_job = ( - FeatureGroupJob - .with_name("") - .with_feature_store_id("") - .with_description("") - .with_compartment_id("") - ) - - .. code-tab:: Python3 - :caption: YAML - - from ads.feature_store.feature_group_job import FeatureGroupJob - - yaml_string = """ - kind: feature_group_job - spec: - compartmentId: ocid1.compartment.. - description: - name: - featureStoreId: - type: feature_group_job - """ - - feature_group_job = FeatureGroupJob.from_yaml(yaml_string) - - -Create -====== - -You can call the ``create()`` method of the ``FeatureGroupJob`` instance to create an dataset job. - -.. code-block:: python3 - - # Create an feature_group_job - feature_group_job.create() - - -Load -==== - -Use the ``from_id()`` method from the ``FeatureGroupJob`` class to load an existing dataset job with its OCID provided. It returns a ``FeatureGroupJob`` instance. - -.. code-block:: python3 - - from ads.feature_store.feature_group_job import FeatureGroupJob - - feature_group_job = FeatureGroupJob.from_id("ocid1.feature_group_job..") - -Delete -====== - -Use the ``.delete()`` method on the ``FeatureGroupJob`` instance to delete a dataset job. - -A feature_group_job can only be deleted when its associated entities are all deleted, - -.. code-block:: python3 - - feature_group_job.delete() +.. _Feature Group Job: + +Feature Group Job +***************** + +A feature group job is the processing instance of a dataset. Each feature group job includes validation and statistics results. + +Define +====== + +In an ADS feature store module, you can use the Python API or a yaml file to define a dataset job. + + +The following example defines a feature group job and gives it a name. A ``FeatureGroupJob`` instance is created. + +.. tabs:: + + .. code-tab:: Python3 + :caption: Python + + from ads.feature_store.feature_group_job import FeatureGroupJob + + feature_group_job = ( + FeatureGroupJob + .with_name("") + .with_feature_store_id("") + .with_description("") + .with_compartment_id("") + ) + + .. 
code-tab:: Python3 + :caption: YAML + + from ads.feature_store.feature_group_job import FeatureGroupJob + + yaml_string = """ + kind: feature_group_job + spec: + compartmentId: ocid1.compartment.. + description: + name: + featureStoreId: + type: feature_group_job + """ + + feature_group_job = FeatureGroupJob.from_yaml(yaml_string) + + +Create +====== + +Use the ``create()`` method of the ``FeatureGroupJob`` instance to create a dataset job. + +.. code-block:: python3 + + # Create an feature_group_job + feature_group_job.create() + + +Load +==== + +Use the ``from_id()`` method from the ``FeatureGroupJob`` class to load an existing dataset job by specifyingh its OCID. It returns a ``FeatureGroupJob`` instance. + +.. code-block:: python3 + + from ads.feature_store.feature_group_job import FeatureGroupJob + + feature_group_job = FeatureGroupJob.from_id("") + +Delete +====== + +Use the ``.delete()`` method on the ``FeatureGroupJob`` instance to delete a dataset job. + +A feature group job can only be deleted when its associated entities are all deleted. + +.. code-block:: python3 + + feature_group_job.delete() diff --git a/ads/feature_store/docs/source/feature_store.rst b/ads/feature_store/docs/source/feature_store.rst index 0668a2532..1edcb6f45 100644 --- a/ads/feature_store/docs/source/feature_store.rst +++ b/ads/feature_store/docs/source/feature_store.rst @@ -1,126 +1,122 @@ -Feature Store -************* - -Feature store is the top level entity for feature store service. - -Define -====== - -In an ADS feature store module, you can either use the Python API or YAML to define a feature_store. - - -With the specified way below, you can define a feature store and give it a name. -A ``FeatureStore`` instance will be created. - -.. tabs:: - - .. code-tab:: Python3 - :caption: Python - - from ads.feature_store.feature_store import FeatureStore - - feature_store = ( - feature_store_resource = FeatureStore(). - with_description() - with_compartment_id("ocid1.compartment.."). - with_name(). - with_offline_config( - metastore_id=metastoreId - ) - ) - - .. code-tab:: Python3 - :caption: YAML - - from ads.feature_store.feature_store import FeatureStore - - yaml_string = """ - kind: feature_store - spec: - compartmentId: ocid1.compartment.. - description: - name: - featureStoreId: - type: feature_store - """ - - feature_store = FeatureStore.from_yaml(yaml_string) - - -Create -====== - -You can call the ``create()`` method of the ``FeatureStore`` instance to create an feature store. - -.. code-block:: python3 - - # Create an feature store - feature_store.create() - - -Load -==== - -Use the ``from_id()`` method from the ``FeatureStore`` class to load an existing feature store with its OCID provided. It returns a ``FeatureStore`` instance. - -.. code-block:: python3 - - from ads.feature_store.feature_store import FeatureStore - - feature_store = FeatureStore.from_id("ocid1.feature_store..") - -Delete -====== - -Use the ``.delete()`` method on the ``FeatureStore`` instance to delete a feature store. - -A feature store can only be deleted when its associated entities are all deleted, - -.. code-block:: python3 - - feature_store.delete() - -SQL -=== -You can call the ``sql()`` method of the FeatureStore instance to query a feature store. - -Query a feature store using sql -############################### - -.. code-block:: python3 - - # Fetch the entity id. 
Entity id is used as database name in feature store - entity_id = entity.id - - # Form a query with entity id and fetch the results - sql = (f"SELECT feature_group_a.* " - f"FROM {entity_id}.feature_group_a " - f"JOIN {entity_id}.feature_group_b " - f"ON {entity_id}.feature_group_a.col_1={entity_id}.feature_group_b.col_2 " - f"JOIN {entity_id}.feature_group_a.col_1={entity_id}.feature_group_b.col_3 ") - - # Run the sql query and fetch the results as data-frame - df = feature_store.sql(sql) - -Create Entity -============= -You can call the ``create_entity()`` method of the FeatureStore instance to create a ``Entity``. - -.. code-block:: python3 - - # Create a feature store entity - feature_store.create_entity(name="") - -Create Transformation -===================== -Transformations in a feature store refers to the operations and processes applied to raw data to create, modify or derive new features that can be used as inputs for ML Models. These transformations are crucial for improving the quality, relevance and usefulness of features which in turn can enhance the performance of ml models. -You can call the ``create_transformation()`` method of the FeatureStore instance to create a ``Transformation``. - -.. code-block:: python3 - - # Create a feature store entity - feature_store.create_transformation( - source_code_func="", - transformation_mode="SQL|PANDAS" - display_name="" - ) +Feature Store +************* + +Feature store is the top-level entity for a feature store. + +Define +====== + +In an ADS feature store module, you can use the Python API or a yaml file to define a feature store. + + +The following example defines a feature store and gives it a name. A ``FeatureStore`` instance is created. + +.. tabs:: + + .. code-tab:: Python3 + :caption: Python + + from ads.feature_store.feature_store import FeatureStore + + feature_store = ( + feature_store_resource = FeatureStore(). + with_description() + with_compartment_id("ocid1.compartment.."). + with_name(). + with_offline_config( + metastore_id=metastoreId + ) + ) + + .. code-tab:: Python3 + :caption: YAML + + from ads.feature_store.feature_store import FeatureStore + + yaml_string = """ + kind: feature_store + spec: + compartmentId: ocid1.compartment.. + description: + name: + featureStoreId: + type: feature_store + """ + + feature_store = FeatureStore.from_yaml(yaml_string) + + +Create +====== + +Use the ``create()`` method of the ``FeatureStore`` instance to create a feature store. + +.. code-block:: python3 + + # Create an feature store + feature_store.create() + + +Load +==== + +Use the ``from_id()`` method from the ``FeatureStore`` class to load an existing feature store by specifying its OCID. A ``FeatureStore`` instance is returned. + +.. code-block:: python3 + + from ads.feature_store.feature_store import FeatureStore + + feature_store = FeatureStore.from_id("") + +Delete +====== + +Use the ``.delete()`` method on the ``FeatureStore`` instance to delete a feature store. A feature store can only be deleted when its associated entities are all deleted. + +.. code-block:: python3 + + feature_store.delete() + +SQL +=== +Use the ``sql()`` method of the ``FeatureStore`` instance to query a feature store. + +The following example queries a feature store using SQL: + +.. code-block:: python3 + + # Fetch the entity id. 
Entity id is used as database name in feature store + entity_id = entity.id + + # Form a query with entity id and fetch the results + sql = (f"SELECT feature_group_a.* " + f"FROM {entity_id}.feature_group_a " + f"JOIN {entity_id}.feature_group_b " + f"ON {entity_id}.feature_group_a.col_1={entity_id}.feature_group_b.col_2 " + f"JOIN {entity_id}.feature_group_a.col_1={entity_id}.feature_group_b.col_3 ") + + # Run the sql query and fetch the results as data-frame + df = feature_store.sql(sql) + +Create an Entity +============= +Use the ``create_entity()`` method of the ``FeatureStore`` instance to create an ``Entity``. + +.. code-block:: python3 + + # Create a feature store entity + feature_store.create_entity(name="") + +Create a Transformation +===================== +Transformations in a feature store are the operations and processes applied to raw data to create, modify, or derive new features for use as inputs for machine learning models. These transformations are necessary for improving the quality, relevance, and usefulness of features that then enhance the performance of models. +You can call the ``create_transformation()`` method of the FeatureStore instance to create a ``Transformation``. + +.. code-block:: python3 + + # Create a feature store entity + feature_store.create_transformation( + source_code_func="", + transformation_mode="SQL|PANDAS" + display_name="" + ) diff --git a/ads/feature_store/docs/source/feature_validation.rst b/ads/feature_store/docs/source/feature_validation.rst index b62a929b3..2c28a7ce2 100644 --- a/ads/feature_store/docs/source/feature_validation.rst +++ b/ads/feature_store/docs/source/feature_validation.rst @@ -1,56 +1,60 @@ -.. _Feature Validation: - -Feature Validation -****************** - -Feature validation is the process of checking the quality and accuracy of the features used in a machine learning model. This is important because features that are not accurate or reliable can lead to poor model performance. -Feature store allows you to define expectation on the data which is being materialized into feature group and dataset. This is achieved using open source library Great Expectations. - -.. note:: - `Great Expectations `_ is a Python-based open-source library for validating, documenting, and profiling your data. It helps you to maintain data quality and improve communication about data between teams. Software developers have long known that automated testing is essential for managing complex codebases. - - -Expectations -============ -An Expectation is a verifiable assertion about your data. You can define expectation as below: - -.. code-block:: python3 - - from great_expectations.core.expectation_configuration import ExpectationConfiguration - - # Create an Expectation - expect_config = ExpectationConfiguration( - # Name of expectation type being added - expectation_type="expect_table_columns_to_match_ordered_list", - # These are the arguments of the expectation - # The keys allowed in the dictionary are Parameters and - # Keyword Arguments of this Expectation Type - kwargs={ - "column_list": [ - "column1", - "column2", - "column3", - "column4", - ] - }, - # This is how you can optionally add a comment about this expectation. - meta={ - "notes": { - "format": "markdown", - "content": "details about this expectation. **Markdown** `Supported`", - } - }, - ) - -Expectations Suite -=================== - -Expectation Suite is a collection of verifiable assertions i.e. expectations about your data. You can define expectation suite as below: - -.. 
code-block:: python3 - - # Create an Expectation Suite - expectation_suite = ExpectationSuite( - expectation_suite_name= - ) - expectation_suite.add_expectation(expect_config) +.. _Feature Validation: + +Feature Validation +****************** + +Feature validation is the process of checking the quality and accuracy of the features used in a machine learning model. This is important because features that aren't accurate or reliable can lead to poor model performance. Feature store allows you to define expectation on the data that is being materialized into feature groups and datasets. The Great Expectations open source library is used to define expectations. + +.. note:: + `Great Expectations `_ is an open source Python-based library that validates, documents, and profiles data. It automates testing, which is essential for managing complex code bases. + +.. image:: figures/data_validation.png + +Expectations +============ +An expectation is a verifiable assertion about your data. + +The following example defines an expectation: + +.. code-block:: python3 + + from great_expectations.core.expectation_configuration import ExpectationConfiguration + + # Create an Expectation + expect_config = ExpectationConfiguration( + # Name of expectation type being added + expectation_type="expect_table_columns_to_match_ordered_list", + # These are the arguments of the expectation + # The keys allowed in the dictionary are Parameters and + # Keyword Arguments of this Expectation Type + kwargs={ + "column_list": [ + "column1", + "column2", + "column3", + "column4", + ] + }, + # This is how you can optionally add a comment about this expectation. + meta={ + "notes": { + "format": "markdown", + "content": "details about this expectation. **Markdown** `Supported`", + } + }, + ) + +Expectations Suite +=================== + +An expectation suite is a collection of verifiable assertions. For example, expectations about your data. + +The following example defines an expectation suite: + +.. code-block:: python3 + + # Create an Expectation Suite + expectation_suite = ExpectationSuite( + expectation_suite_name= + ) + expectation_suite.add_expectation(expect_config) diff --git a/ads/feature_store/docs/source/figures/data_validation.png b/ads/feature_store/docs/source/figures/data_validation.png new file mode 100644 index 000000000..57128d2b3 Binary files /dev/null and b/ads/feature_store/docs/source/figures/data_validation.png differ diff --git a/ads/feature_store/docs/source/figures/data_versioning.png b/ads/feature_store/docs/source/figures/data_versioning.png new file mode 100644 index 000000000..d12cd3fbe Binary files /dev/null and b/ads/feature_store/docs/source/figures/data_versioning.png differ diff --git a/ads/feature_store/docs/source/figures/drift_monitoring.png b/ads/feature_store/docs/source/figures/drift_monitoring.png new file mode 100644 index 000000000..e379d1f97 Binary files /dev/null and b/ads/feature_store/docs/source/figures/drift_monitoring.png differ diff --git a/ads/feature_store/docs/source/index.rst b/ads/feature_store/docs/source/index.rst index b28e25fe9..5c6bf8424 100644 --- a/ads/feature_store/docs/source/index.rst +++ b/ads/feature_store/docs/source/index.rst @@ -1,24 +1,61 @@ -============================================= -Welcome to oci-feature-store's documentation! -============================================= - -.. 
toctree:: - :maxdepth: 3 - :caption: Contents: - - overview - terraform - quickstart - feature_store - entity - transformation - feature_group - feature_group_job - dataset - dataset_job - statistics - feature_validation - ui - demo - notebook - release_notes +============================================= +Welcome to oci-feature-store's documentation! +============================================= + +Feature Store +============= + +|PyPI|_ |PysparkConda|_ |Notebook Examples|_ |Delta|_ |PySpark|_ + +.. |PyPI| image:: https://img.shields.io/badge/python-3.8-blue?style=for-the-badge&logo=pypi&logoColor=white +.. _PyPI: https://pypi.org/project/oracle-ads/ +.. |PysparkConda| image:: https://img.shields.io/badge/fspyspark32_p38_cpu_v1-1.0-blue?style=for-the-badge&logo=pypi&logoColor=white +.. _PysparkConda: https://docs.oracle.com/en-us/iaas/data-science/using/conda-pyspark-fam.htm +.. |Notebook Examples| image:: https://img.shields.io/badge/docs-notebook--examples-blue?style=for-the-badge&logo=pypi&logoColor=white +.. _Notebook Examples: https://github.com/oracle-samples/oci-data-science-ai-samples/tree/master/notebook_examples +.. |Delta| image:: https://img.shields.io/badge/delta-2.0.1-blue?style=for-the-badge&logo=pypi&logoColor=white +.. _Delta: https://delta.io/ +.. |PySpark| image:: https://img.shields.io/badge/pyspark-3.2.1-blue?style=for-the-badge&logo=pypi&logoColor=white +.. _PySpark: https://spark.apache.org/docs/3.2.1/api/python/index.html + +.. toctree:: + :maxdepth: 2 + :caption: Getting started: + + overview + terraform + quickstart + +.. toctree:: + :maxdepth: 2 + :caption: Feature store entities: + + feature_store + entity + transformation + feature_group + feature_group_job + dataset + dataset_job + +.. toctree:: + :maxdepth: 2 + :caption: Feature store concepts: + + statistics + feature_validation + ui + +.. toctree:: + :maxdepth: 2 + :caption: Demo and examples: + + demo + notebook + + +.. toctree:: + :maxdepth: 2 + :caption: Release notes: + + release_notes diff --git a/ads/feature_store/docs/source/notebook.rst b/ads/feature_store/docs/source/notebook.rst index 33dc3d02d..4dd4c2631 100644 --- a/ads/feature_store/docs/source/notebook.rst +++ b/ads/feature_store/docs/source/notebook.rst @@ -1,36 +1,36 @@ -.. _Notebook Examples: - -================== -Notebook Examples -================== - -.. admonition:: Notebook Examples - :class: note - - .. list-table:: - :widths: 50 50 50 - :header-rows: 1 - - * - Html Notebook - - Jupyter Notebook - - Description - - * - `Feature store quickstart `__ - - `Feature store quickstart `__ - - | 1. Ingestion of data - | 2. Querying and exploration of data - - * - `Big data operations with feature store `__ - - `Big data operations with feature store `__ - - | 1. Ingestion of data using spark magic - | 2. Querying and exploration of data using spark magic - - * - `Schema enforcement and schema evolution `__ - - `Schema enforcement and schema evolution `__ - - | 1. Schema evolution is a feature that allows users to easily change a table's current schema to accommodate data that is changing over time. - | 2. Schema enforcement, also known as schema validation, is a safeguard in Delta Lake that ensures data quality by rejecting writes to a table that do not match the table's schema. - - * - `Embeddings in Feature Store `__ - - `Embeddings in Feature Store `__ - - | 1. One of the primary functions of an embedding feature store is to store pre-trained word embeddings, such as Word2Vec, GloVe, FastText, or BERT embeddings. 
These embeddings are learned from massive text corpora and contain information about word semantics, which can be valuable for various NLP tasks like text classification, named entity recognition, sentiment analysis, and more. - | 2. Embedding feature stores are optimized for fast and efficient retrieval of embeddings. This is crucial because embeddings can be high-dimensional and computationally expensive to calculate. By storing them in a dedicated store, you can avoid the need to recalculate embeddings for the same data repeatedly. +.. _Notebook Examples: + +================== +Notebook Examples +================== + +.. admonition:: Notebook Examples + :class: note + + .. list-table:: + :widths: 50 50 50 + :header-rows: 1 + + * - Html Notebook + - Jupyter Notebook + - Description + + * - `Feature store quickstart `__ + - `Feature store quickstart `__ + - | 1. Ingestion of data. + | 2. Querying and exploration of data. + + * - `Big data operations with feature store `__ + - `Big data operations with feature store `__ + - | 1. Ingestion of data using Spark Magic. + | 2. Querying and exploration of data using Spark Magic. + + * - `Schema enforcement and schema evolution `__ + - `Schema enforcement and schema evolution `__ + - | 1. Schema evolution allows you to easily change a table's current schema to accommodate data that is changing over time. + | 2. Schema enforcement, also known as schema validation, is a safeguard in Delta Lake that ensures data quality by rejecting writes to a table that don't match the table's schema. + + * - `Embeddings in Feature Store `__ + - `Embeddings in Feature Store `__ + - | 1. One of the primary functions of an embedding feature store is to store pre-trained word embeddings, such as Word2Vec, GloVe, FastText, or BERT embeddings. These embeddings are learned from massive text and contain information about word semantics. Embeddings can be valuable for various NLP tasks like text classification, named entity recognition, sentiment analysis, and so on. + | 2. Embedding feature stores are optimized for fast and efficient retrieval of embeddings. This is important because embeddings can be high-dimensional and computationally expensive to calculate. By storing them in a dedicated store, you can avoid the need to recalculate embeddings for the same data repeatedly. diff --git a/ads/feature_store/docs/source/overview.rst b/ads/feature_store/docs/source/overview.rst index ce9a3a2d0..04cb84a25 100644 --- a/ads/feature_store/docs/source/overview.rst +++ b/ads/feature_store/docs/source/overview.rst @@ -1,39 +1,41 @@ -Overview -******** - -Managing many datasets, data-sources and transformations for machine learning is complex and costly. Poorly cleaned data, data issues, bugs in transformations, data drift and training serving skew all leads to increased model development time and worse model performance. Here, feature store is well positioned to solve many of the problems since it provides a centralised way to transform and access data for training and serving time and helps defines a standardised pipeline for ingestion of data and querying of data. - -.. image:: figures/feature_store_overview.png - -Oracle feature store is a stack based solution that is deployed in the customer enclave using OCI resource manager. Customer can stand up the service with infrastructure in their own tenancy. The service consists of API which are deployed in customer tenancy using resource manager. - -- ``Feature Vector``: Set of feature values for any one primary/identifier key. Eg. 
All/subset of features of customer id '2536' can be called as one feature vector. -- ``Feature``: A feature is an individual measurable property or characteristic of a phenomenon being observed. -- ``Entity``: An entity is a group of semantically related features. The first step a consumer of features would typically do when accessing the feature store service is to list the entities and the entities associated features. Another way to look at it is that an entity is an object or concept that is described by its features. Examples of entities could be customer, product, transaction, review, image, document, etc. -- ``Feature Group``: A feature group in a feature store is a collection of related features that are often used together in ml models. It serves as an organizational unit within the feature store for users to manage, version and share features across different ml projects. By organizing features into groups, data scientists and ml engineers can efficiently discover, reuse and collaborate on features reducing the redundant work and ensuring consistency in feature engineering. -- ``Feature Group Job``: Feature group job is the execution instance of a feature group. Each feature group job will include validation results and statistics results. -- ``Dataset``: A dataset is a collection of feature that are used together to either train a model or perform model inference. -- ``Dataset Job``: Dataset job is the execution instance of a dataset. Each dataset job will include validation results and statistics results. - -.. important:: - - Prerequisite : Please contact #oci-feature-store_early-preview for getting your tenancy whitelisted for early access of feature store. - -.. important:: - - The OCI Feature Store support following versions - - .. list-table:: - :widths: 25 75 - :header-rows: 1 - - * - Package Name - - Latest Version - * - python - - .. image:: https://img.shields.io/badge/python-3.8-blue?style=for-the-badge&logo=pypi&logoColor=white - * - fspyspark32_p38_cpu_v1 - - .. image:: https://img.shields.io/badge/fspyspark32_p38_cpu_v1-1.0-blue?style=for-the-badge&logo=pypi&logoColor=white - * - delta-spark - - .. image:: https://img.shields.io/badge/delta-2.0.1-blue?style=for-the-badge&logo=pypi&logoColor=white - * - pyspark - - .. image:: https://img.shields.io/badge/pyspark-3.2.1-blue?style=for-the-badge&logo=pypi&logoColor=white \ No newline at end of file +Overview +******** + +Managing many datasets, data sources, and transformations for machine learning is complex and costly. Poorly cleaned data, data issues, bugs in transformations, data drift, and training serving skew all lead to increased model development time and poor model performance. Feature store solves many of the problems because it is a centralized way to transform and access data for training and serving time, Feature stores help define a standardised pipeline for ingestion of data and querying of data. + +.. image:: figures/feature_store_overview.png + +ADS feature store is a stack-based solution that is deployed in your tenancy using OCI Resource Manager. + +Following are brief descriptions of key concepts and the main components of ADS feature store. + +- ``Feature Vector``: Set of feature values for any one primary and identifier key. For example, all and a subset of features of customer ID 2536 can be called as one feature vector . +- ``Feature``: A feature is an individual measurable property or characteristic of an event being observed. 
+- ``Entity``: An entity is a group of semantically related features. The first step a consumer of features would typically do when accessing the feature store service is to list the entities and the entities associated with features. Another way to look at it is that an entity is an object or concept that's described by its features. Examples of entities are customer, product, transaction, review, image, document, and so on. +- ``Feature Group``: A feature group in a feature store is a collection of related features that are often used together in ML models. It serves as an organizational unit within the feature store for users to manage, version, and share features across different ML projects. By organizing features into groups, data scientists and ML engineers can efficiently discover, reuse, and collaborate on features reducing the redundant work and ensuring consistency in feature engineering. +- ``Feature Group Job``: Feature group jobs are the processing instance of a feature group. Each feature group job includes validation results and statistics results. +- ``Dataset``: A dataset is a collection of features that are used together to either train a model or perform model inference. +- ``Dataset Job``: A dataset job is the processing instance of a dataset. Each dataset job includes validation results and statistics results. + +.. important:: + + Prerequisite : Contact #oci-feature-store_early-preview to get your tenancy whitelisted for early access of ADS feature store. + +.. important:: + + The ADS feature store supports and requires the following versions: + + .. list-table:: + :widths: 25 75 + :header-rows: 1 + + * - Package Name + - Latest Version + * - python + - .. image:: https://img.shields.io/badge/python-3.8-blue?style=for-the-badge&logo=pypi&logoColor=white + * - fspyspark32_p38_cpu_v1 + - .. image:: https://img.shields.io/badge/fspyspark32_p38_cpu_v1-1.0-blue?style=for-the-badge&logo=pypi&logoColor=white + * - delta-spark + - .. image:: https://img.shields.io/badge/delta-2.0.1-blue?style=for-the-badge&logo=pypi&logoColor=white + * - pyspark + - .. image:: https://img.shields.io/badge/pyspark-3.2.1-blue?style=for-the-badge&logo=pypi&logoColor=white diff --git a/ads/feature_store/docs/source/quickstart.rst b/ads/feature_store/docs/source/quickstart.rst index e7942aa10..89c416b52 100644 --- a/ads/feature_store/docs/source/quickstart.rst +++ b/ads/feature_store/docs/source/quickstart.rst @@ -1,174 +1,175 @@ -Quick start -************ -1. Create a `OCI notebook session `__ to access jupyterlab interface. - -2. Open the terminal in the notebook session and install the ``fspyspark32_p38_cpu_v1`` plugin - - .. code-block:: shell - - odsc conda install -s fspyspark32_p38_cpu_v1 -3. Download the notebooks from the example notebook section. - -.. seealso:: - Refer :ref:`Notebook Examples` to check out more example for using feature store - -4. Upload the notebook in the notebook session and run the notebook after replacing the required variables. 
- - -Background reading to understand the concepts of Feature Store and OCI Data Science: - -- Getting started with `OCI Data Science Jobs `__ -- Getting started with `Oracle Accelerated Data Science SDK `__ to simplify `creating `__ and `running `__ Jobs -- Getting started with `Data Science Environments `__ -- Getting started with `Custom Conda Environments `__ - -**Authentication and Policies:** - -- Getting started with `OCI Data Science Policies `__ -- `API Key-Based Authentication `__ - ``api_key`` -- `Resource Principal Authentication `__ - ``resource_principal`` -- `Instance Principal Authentication `__ - ``instance_principal`` - -.. seealso:: - - Refer `Terraform section `__ for setting up feature store server. - -.. warning:: - - 1. Initial implementation will not allow parallel execution of similar logical constructs. Creation will be sequential. - 2. In case of failure , execution would stop and rollback would not happen. Retry/resume operation is also not supported in initial implementation. - 3. There needs to be definition of exactly 1 feature store construct in the yaml. we will not allow creation of multiple feature store constructs via yaml and user cannot omit providing feature store definition completely. - 4. In order to allow reference in the definition , name of the defined logical constructs in yaml should be unique. - -.. tabs:: - - .. code-tab:: Python3 - :caption: Python - - from ads.feature_store.feature_group_expectation import Expectation, Rule, ExpectationType, ValidationEngineType - from ads.feature_store.feature_store import FeatureStore - from ads.feature_store.input_feature_detail import FeatureDetail, FeatureType - from ads.feature_store.transformation import TransformationMode - import ads - - compartment_id = "ocid1.compartment." - metastore_id = "ocid1.datacatalogmetastore.oc1.iad." 
- api_gateway_endpoint = "https://**.{region}.oci.customer-oci.com/20230101" - os.environ["OCI_FS_SERVICE_ENDPOINT"] = api_gateway_endpoint - - ads.set_auth(auth="api_key") - - # step1: Create feature store - feature_store_resource = ( - FeatureStore() - .with_description("") - .with_compartment_id(compartment_id) - .with_name("") - .with_offline_config(metastore_id=metastore_id) - ) - - feature_store = feature_store_resource.create() - entity = feature_store.create_entity(name="product") - - - # step2: Create feature store - def transactions_df(dataframe, **kwargs): - columns = kwargs.get('columns', '*') # Default to select all columns if 'columns' not provided - where_clause = kwargs.get('where_clause', '') # Default to empty where clause if 'where_clause' not provided - - sql_query = f""" - SELECT - {columns} - FROM - {table_name} - {where_clause} - """ - return sql_query - - transformation = feature_store.create_transformation( - transformation_mode=TransformationMode.SQL, - source_code_func=transactions_df - ) - - - # step3: Create expectation - expectation_suite = ExpectationSuite(expectation_suite_name="feature_definition") - expectation_suite.add_expectation( - ExpectationConfiguration( - expectation_type="expect_column_values_to_not_be_null", - kwargs={"column": "date"} - ) - ) - - input_feature_details = [FeatureDetail("rule_name").with_feature_type(FeatureType.STRING).with_order_number(1)] - - # step4: Create stats configuration - stats_config = StatisticsConfig().with_is_enabled(False) - - # step5: Create feature group - transformation_args = {"columns": "col1, col2", "where_clause": "col3 > 100"} - feature_group = entity.create_feature_group( - primary_keys=["name"], - partition_keys=["name"], - input_feature_details, - expectation_suite=expectation_suite, - expectation_type=ExpectationType.LENIENT, - statistics_config=stats_config, - name="", - transformation_id=transformation.id, - transformation_kwargs=transformation_args - ) - - - .. code-tab:: Python3 - :caption: YAML - - from ads.feature_store.feature_store_registrar import FeatureStoreRegistrar - - yaml_string = """ - apiVersion: 20230101 - kind: featureStore - spec: - name: *feature_store_name - offlineConfig: - metastoreId: *metastore_id - - entity: &entity - - kind: entity - spec: - name: *entity_name - - - transformation: &transformation - - kind: transformation - spec: - name: *transformation_name - transformationMode: *transformation_mode - sourceCode: *source_code - - featureGroup: - - kind: featureGroup - spec: - name: *feature_group_name - dataSource: *ds - description: *feature_group_desc - transformation: *transformation - entity: *entity - primaryKeys: - *fg_primary_key - inputFeatureDetails: - - name: *feature_name - featureType: *feature_type - orderNumber: 1 - - dataset: - - kind: dataset - spec: - name: *dataset_name - entity: *entity - datasetIngestionMode: *ingestion_mode - description: *dataset_description - query: *query_statement - """ - - feature_registrar = FeatureStoreRegistrar.from_yaml(yaml_string) +Quick start +************ +1. Create a `Data Science notebook session `__ to access jupyterlab interface. + +2. Open a terminal in the notebook session, and then install the ``fspyspark32_p38_cpu_v1`` plugin: + + .. code-block:: shell + + odsc conda install -s fspyspark32_p38_cpu_v1 +3. Download the notebook examples from the example notebook section. + +.. seealso:: + Refer :ref:`Notebook Examples` contains more examples for using feature store. + +4. 
Upload the notebook in the notebook session, and then run the notebook after replacing the required variables. + + + +**Feature Store and Data Science Concepts:** + +- Getting started with `OCI Data Science Jobs `__ +- Getting started with `Oracle Accelerated Data Science SDK `__ to simplify `creating `__ and `running `__ Jobs +- Getting started with `Data Science Environments `__ +- Getting started with `Custom Conda Environments `__ + +**Authentication and Policies:** + +- Getting started with `OCI Data Science Policies `__ +- `API Key-Based Authentication `__ - ``api_key`` +- `Resource Principal Authentication `__ - ``resource_principal`` +- `Instance Principal Authentication `__ - ``instance_principal`` + +.. seealso:: + + Review the `Terraform section `__ for setting up feature store server. + +.. warning:: + + 1. Feature store doesn’t allow parallel execution of similar logical constructs. Creation is sequential.. + 2. If a failure occurs, processing stops, and rollback can’t happen. Retrying the operation isn’t supported. + 3. Define exactly one feature store construct in the YAML file. Creation of multiple feature store constructs in the YAML file causes a failure. + 4. To allow reference in the feature store definition,the name of the defined logical constructs in the YAML file must be unique. + +.. tabs:: + + .. code-tab:: Python3 + :caption: Python + + from ads.feature_store.feature_group_expectation import Expectation, Rule, ExpectationType, ValidationEngineType + from ads.feature_store.feature_store import FeatureStore + from ads.feature_store.input_feature_detail import FeatureDetail, FeatureType + from ads.feature_store.transformation import TransformationMode + import ads + + compartment_id = "ocid1.compartment." + metastore_id = "ocid1.datacatalogmetastore.oc1.iad." 
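+    # Note: the rest of this snippet also uses names that are not imported above,
+    # namely the standard-library `os` module (for os.environ) and the
+    # ExpectationSuite, ExpectationConfiguration and StatisticsConfig classes, so
+    # import them in the notebook first (e.g. `import os`). `table_name` inside
+    # transactions_df is assumed to be defined elsewhere, and the truncated OCIDs
+    # above are placeholders for values from your tenancy. When calling
+    # create_feature_group, pass the feature detail list by keyword (for example
+    # input_feature_details=input_feature_details), because Python does not allow
+    # a positional argument after keyword arguments.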
+ api_gateway_endpoint = "https://**.{region}.oci.customer-oci.com/20230101" + os.environ["OCI_FS_SERVICE_ENDPOINT"] = api_gateway_endpoint + + ads.set_auth(auth="api_key") + + # step1: Create feature store + feature_store_resource = ( + FeatureStore() + .with_description("") + .with_compartment_id(compartment_id) + .with_name("") + .with_offline_config(metastore_id=metastore_id) + ) + + feature_store = feature_store_resource.create() + entity = feature_store.create_entity(name="product") + + + # step2: Create feature store + def transactions_df(dataframe, **kwargs): + columns = kwargs.get('columns', '*') # Default to select all columns if 'columns' not provided + where_clause = kwargs.get('where_clause', '') # Default to empty where clause if 'where_clause' not provided + + sql_query = f""" + SELECT + {columns} + FROM + {table_name} + {where_clause} + """ + return sql_query + + transformation = feature_store.create_transformation( + transformation_mode=TransformationMode.SQL, + source_code_func=transactions_df + ) + + + # step3: Create expectation + expectation_suite = ExpectationSuite(expectation_suite_name="feature_definition") + expectation_suite.add_expectation( + ExpectationConfiguration( + expectation_type="expect_column_values_to_not_be_null", + kwargs={"column": "date"} + ) + ) + + input_feature_details = [FeatureDetail("rule_name").with_feature_type(FeatureType.STRING).with_order_number(1)] + + # step4: Create stats configuration + stats_config = StatisticsConfig().with_is_enabled(False) + + # step5: Create feature group + transformation_args = {"columns": "col1, col2", "where_clause": "col3 > 100"} + feature_group = entity.create_feature_group( + primary_keys=["name"], + partition_keys=["name"], + input_feature_details, + expectation_suite=expectation_suite, + expectation_type=ExpectationType.LENIENT, + statistics_config=stats_config, + name="", + transformation_id=transformation.id, + transformation_kwargs=transformation_args + ) + + + .. code-tab:: Python3 + :caption: YAML + + from ads.feature_store.feature_store_registrar import FeatureStoreRegistrar + + yaml_string = """ + apiVersion: 20230101 + kind: featureStore + spec: + name: *feature_store_name + offlineConfig: + metastoreId: *metastore_id + + entity: &entity + - kind: entity + spec: + name: *entity_name + + + transformation: &transformation + - kind: transformation + spec: + name: *transformation_name + transformationMode: *transformation_mode + sourceCode: *source_code + + featureGroup: + - kind: featureGroup + spec: + name: *feature_group_name + dataSource: *ds + description: *feature_group_desc + transformation: *transformation + entity: *entity + primaryKeys: + *fg_primary_key + inputFeatureDetails: + - name: *feature_name + featureType: *feature_type + orderNumber: 1 + + dataset: + - kind: dataset + spec: + name: *dataset_name + entity: *entity + datasetIngestionMode: *ingestion_mode + description: *dataset_description + query: *query_statement + """ + + feature_registrar = FeatureStoreRegistrar.from_yaml(yaml_string) diff --git a/ads/feature_store/docs/source/release_notes.rst b/ads/feature_store/docs/source/release_notes.rst index ae2cdff2d..06590e9b0 100644 --- a/ads/feature_store/docs/source/release_notes.rst +++ b/ads/feature_store/docs/source/release_notes.rst @@ -1,131 +1,131 @@ -.. _Release Notes: - -============= -Release Notes -============= - -1.0.3 ------ -.. note:: - - .. 
list-table:: - :header-rows: 1 - - * - Package Name - - Latest Version - - Notes - * - Conda pack - - `fs_pyspark32_p38_cpu_v1` - - - * - SERVICE_VERSION - - 0.1.256.master - - - * - ADS_VERSION - - oracle-ads==2.9.0rc0 - - `https://github.com/oracle/accelerated-data-science/releases/tag/v2.9.0rc0` - * - Terraform Stack - - `link `__ - - - - -Release notes: September 22, 2023 - -* [FEATURE] Addition of ``featurestore_dataset`` as optional parameter in GenericModel ``save`` function. -* [FEATURE] Addition of ``transformation_kwargs`` in ``Transformation`` entity. -* [FEATURE] Addition of partition keys in ``FeatureGroup`` and ``Dataset`` -* [FEATURE] as_of interface for time travel -* [FEATURE] Manual association of feature groups in ``Dataset`` construct and support for complex queries -* [FEATURE] Simplify the ads init experience without users need to specify the feature store endpoint -* [FEATURE] Visualisation of feature statistics with ``to_viz`` in ``Statistics`` entity -* [FIX] Validation of model ids when associated with ``Dataset`` -* [UI] Stats visualisation of feature group and dataset. -* [UI] Transformation listing in the transformation tab -* [UI] Global search for feature store entities. -* [UI] Addition of onboarding page for feature store. -* [UI] Redirection of entities within the lineage tab of feature group and dataset. - -1.0.2 ------ -.. note:: - - .. list-table:: - :header-rows: 1 - - * - Package Name - - Latest Version - - Notes - * - Conda pack - - `fs_pyspark32_p38_cpu_v1` - - - * - SERVICE_VERSION - - 0.1.225.master - - - * - Terraform Stack - - `link `__ - - Par link expires Jan 5, 2026 - -Release notes: July 18, 2023 - -* [FEATURE] Supporting for deployment in ``us-ashburn`` and ``uk-london`` region. -* [FEATURE] For ``ValidationOutput`` instance, addition of ``to_summary()`` method for validation summary details. -* [FEATURE] For ``ValidationOutput`` instance, addition of ``to_pandas()`` method for validation detailed report. -* [FIX] Fixed unit test integration to support the merging of ADS into the main branch. -* [DOCS] For ``ValidationOutput`` instance, addition of ``to_summary()`` method for validation summary details. -* [DOCS] For ``ValidationOutput`` instance, addition of ``to_pandas()`` method for validation detailed report. - -1.0.1 ------ - -.. note:: - - .. list-table:: - :header-rows: 1 - - * - Package Name - - Latest Version - - Notes - * - Conda pack - - `fs_pyspark32_p38_cpu_v1` - - - * - SERVICE_VERSION - - 0.1.218.master - - - * - Terraform Stack - - `link `__ - - Par link expires Jan 5, 2026 - - -Release notes: July 5, 2023 - -* [FEATURE] Supporting Offline Feature Type COMPLEX -* [FEATURE] Added k8 default version as v1.25.4 -* [FEATURE] Improvements in logging during materialisation of feature group and dataset and showcasing validation results during materialisation -* [FIX] Fixed creation of singleton spark session without metastore id -* [DOCS] Data Type update for Offline Feature Type COMPLEX -* [DOCS] Updated terraform default version as 1.1.x - -1.0.0 ----- - -.. note:: - - .. list-table:: - :header-rows: 1 - - * - Package Name - - Latest Version - - Notes - * - Conda pack - - `fs_pyspark32_p38_cpu_v1` - - - * - SERVICE_VERSION - - 0.1.209.master - - - * - Terraform Stack - - `link `__ - - Par link expires Jan 5, 2026 - -Release notes: June 15, 2023 - -* [FEATURE] Included ``FeatureStore``, ``FeatureGroup``, ``Dataset``, ``Entity`` and ``Transformation`` concepts for feature store. 
-* [DOCS] Included documentation for ``FeatureStore``, ``FeatureGroup``, ``Dataset``, ``Entity`` and ``Transformation`` constructs +.. _Release Notes: + +============= +Release Notes +============= + +1.0.3 +----- +.. note:: + + .. list-table:: + :header-rows: 1 + + * - Package Name + - Latest Version + - Notes + * - Conda pack + - `fs_pyspark32_p38_cpu_v1` + - + * - SERVICE_VERSION + - 0.1.256.master + - + * - ADS_VERSION + - oracle-ads==2.9.0rc0 + - `https://github.com/oracle/accelerated-data-science/releases/tag/v2.9.0rc0` + * - Terraform Stack + - `link `__ + - + + +Release notes: September 22, 2023 + +* [FEATURE] Addition of ``featurestore_dataset`` as optional parameter in GenericModel ``save`` function. +* [FEATURE] Addition of ``transformation_kwargs`` in ``Transformation`` entity. +* [FEATURE] Addition of partition keys in ``FeatureGroup`` and ``Dataset``. +* [FEATURE] Addition of the as_of interface for time travel. +* [FEATURE] Manual association of feature groups in the ``Dataset`` construct and support for complex queries. +* [FEATURE] Simplify the ``ads init`` experience so you don't have to specify the feature store endpoint. +* [FEATURE] Visualisation of feature statistics with ``to_viz`` in ``Statistics`` entity. +* [FIX] Validation of model IDs when associated with ``Dataset``. +* [UI] Statistics visualisation of feature group and dataset. +* [UI] Transformation listing in the transformation tab. +* [UI] Global search for feature store entities. +* [UI] Addition of onboarding page for feature store. +* [UI] Redirection of entities within the lineage tab of feature group and dataset. + +1.0.2 +----- +.. note:: + + .. list-table:: + :header-rows: 1 + + * - Package Name + - Latest Version + - Notes + * - Conda pack + - `fs_pyspark32_p38_cpu_v1` + - + * - SERVICE_VERSION + - 0.1.225.master + - + * - Terraform Stack + - `link `__ + - Par link expires Jan 5, 2026 + +Release notes: July 18, 2023 + +* [FEATURE] Support for deployment in the ``us-ashburn`` and ``uk-london`` regions. +* [FEATURE] For a ``ValidationOutput`` instance, the addition of the ``to_summary()`` method for validation summary details. +* [FEATURE] For a ``ValidationOutput`` instance, the addition of the ``to_pandas()`` method for validation detailed report. +* [FIX] Fixed unit test integration to support the merging of ADS into the main branch. +* [DOCS] For ``ValidationOutput`` instance, the addition of the ``to_summary()`` method for validation summary details. +* [DOCS] For ``ValidationOutput`` instance, the addition of the ``to_pandas()`` method for validation detailed report. + +1.0.1 +----- + +.. note:: + + .. list-table:: + :header-rows: 1 + + * - Package Name + - Latest Version + - Notes + * - Conda pack + - `fs_pyspark32_p38_cpu_v1` + - + * - SERVICE_VERSION + - 0.1.218.master + - + * - Terraform Stack + - `link `__ + - Par link expires Jan 5, 2026 + + +Release notes: July 5, 2023 + +* [FEATURE] Supporting Offline Feature Type COMPLEX +* [FEATURE] Added k8 default version as v1.25.4 +* [FEATURE] Improvements in logging during materialisation of feature group and dataset and showcasing validation results during materialisation +* [FIX] Fixed creation of singleton spark session without metastore id +* [DOCS] Data Type update for Offline Feature Type COMPLEX +* [DOCS] Updated terraform default version as 1.1.x + +1.0.0 +---- + +.. note:: + + .. 
list-table:: + :header-rows: 1 + + * - Package Name + - Latest Version + - Notes + * - Conda pack + - `fs_pyspark32_p38_cpu_v1` + - + * - SERVICE_VERSION + - 0.1.209.master + - + * - Terraform Stack + - `link `__ + - Par link expires Jan 5, 2026 + +Release notes: June 15, 2023 + +* [FEATURE] Included ``FeatureStore``, ``FeatureGroup``, ``Dataset``, ``Entity`` and ``Transformation`` concepts for feature store. +* [DOCS] Included documentation for ``FeatureStore``, ``FeatureGroup``, ``Dataset``, ``Entity`` and ``Transformation`` constructs diff --git a/ads/feature_store/docs/source/statistics.rst b/ads/feature_store/docs/source/statistics.rst index 56f9546f7..19f73fe9b 100644 --- a/ads/feature_store/docs/source/statistics.rst +++ b/ads/feature_store/docs/source/statistics.rst @@ -1,51 +1,58 @@ -.. _Statistics: - -Statistics -************* - -Feature Store provides functionality to compute statistics for feature groups as well as datasets and persist them along with the metadata. These statistics can help you -to derive insights about the data quality. These statistical metrics are computed during materialisation time and persisting with other metadata. - -.. note:: - - Feature Store utilizes MLM Insights which is a Python API that helps evaluate and monitor data for entirety of ML Observability lifecycle. It performs data summarization which reduces a dataset into a set of descriptive statistics. - -The statistical metrics that are computed by feature store depend on the feature type. - -+------------------------+-----------------------+ -| Numerical Metrics | Categorical Metrics | -+========================+=======================+ -| Skewness | Count | -+------------------------+-----------------------+ -| StandardDeviation | TopKFrequentElements | -+------------------------+-----------------------+ -| Min | TypeMetric | -+------------------------+-----------------------+ -| IsConstantFeature | DuplicateCount | -+------------------------+-----------------------+ -| IQR | Mode | -+------------------------+-----------------------+ -| Range | DistinctCount | -+------------------------+-----------------------+ -| ProbabilityDistribution| | -+------------------------+-----------------------+ -| Variance | | -+------------------------+-----------------------+ -| FrequencyDistribution | | -+------------------------+-----------------------+ -| Count | | -+------------------------+-----------------------+ -| Max | | -+------------------------+-----------------------+ -| DistinctCount | | -+------------------------+-----------------------+ -| Sum | | -+------------------------+-----------------------+ -| IsQuasiConstantFeature | | -+------------------------+-----------------------+ -| Quartiles | | -+------------------------+-----------------------+ -| Mean | | -+------------------------+-----------------------+ -| Kurtosis | | -+------------------------+-----------------------+ +.. _Statistics: + +Statistics +************* + +Feature Store provides functionality to compute statistics for feature groups and datasets, and then persists them including the metadata. These statistics can help you +to derive insights about the data quality. These statistical metrics are computed during materialization time and persisting with other metadata. + +.. note:: + + Feature Store uses MLM Insights, which is a Python API that helps evaluate and monitor data for entire ML observability lifecycle. It performs data summarization, which reduces a dataset into a set of descriptive statistics. 
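+
+After a feature group or dataset is materialised, you can retrieve the computed statistics through the Python API. The following is a minimal sketch: it assumes an existing materialised feature group, and that ``FeatureGroup.from_id()``, the ``get_statistics()`` accessor, and the ``to_pandas()`` and ``to_viz()`` helpers on the returned ``Statistics`` object are available in your version of ADS (``to_viz`` is referenced in the release notes).
+
+.. code-block:: python3
+
+    from ads.feature_store.feature_group import FeatureGroup
+
+    # Sketch only: assumes ads authentication is already configured (see the
+    # quick start) and that these statistics accessors exist in your ADS version.
+
+    # Load an existing, materialised feature group by OCID (placeholder value).
+    feature_group = FeatureGroup.from_id("")
+
+    # Fetch the statistics computed during the most recent materialisation.
+    statistics = feature_group.get_statistics()
+
+    # Inspect the metrics as a pandas DataFrame, or render the built-in charts.
+    statistics.to_pandas()
+    statistics.to_viz()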
+ +The statistical metrics that are computed by feature store depend on the feature type. + ++------------------------+-----------------------+ +| Numerical Metrics | Categorical Metrics | ++========================+=======================+ +| Skewness | Count | ++------------------------+-----------------------+ +| StandardDeviation | TopKFrequentElements | ++------------------------+-----------------------+ +| Min | TypeMetric | ++------------------------+-----------------------+ +| IsConstantFeature | DuplicateCount | ++------------------------+-----------------------+ +| IQR | Mode | ++------------------------+-----------------------+ +| Range | DistinctCount | ++------------------------+-----------------------+ +| ProbabilityDistribution| | ++------------------------+-----------------------+ +| Variance | | ++------------------------+-----------------------+ +| FrequencyDistribution | | ++------------------------+-----------------------+ +| Count | | ++------------------------+-----------------------+ +| Max | | ++------------------------+-----------------------+ +| DistinctCount | | ++------------------------+-----------------------+ +| Sum | | ++------------------------+-----------------------+ +| IsQuasiConstantFeature | | ++------------------------+-----------------------+ +| Quartiles | | ++------------------------+-----------------------+ +| Mean | | ++------------------------+-----------------------+ +| Kurtosis | | ++------------------------+-----------------------+ + +Drift Monitoring +================ + +Models can fail silently. Over and over we see the root cause of model issues in production can be traced back to the data itself, not the model. By applying data monitoring to the feature store, practitioners can automatically catch data issues like missing values, change in data format or unexpected values (change in data cardinality), and data drift upstream before the models are impacted + +.. image:: figures/drift_monitoring.png diff --git a/ads/feature_store/docs/source/terraform.rst b/ads/feature_store/docs/source/terraform.rst index d7f7d4dae..002985493 100644 --- a/ads/feature_store/docs/source/terraform.rst +++ b/ads/feature_store/docs/source/terraform.rst @@ -1,352 +1,338 @@ -=================================== -Terraform: Setting up feature store -=================================== - -Oracle feature store is a stack based solution that is deployed in the customer enclave using OCI resource manager. -Customer can stand up the service with infrastructure in their own tenancy. The service consists of API in customer -tenancy using resource manager. - -Below is the terraform stack deployment diagram of the feature store resources. - -.. figure:: figures/feature_store_deployment.png - :width: 400 - -.. note:: - - Blue-green deployment is a strategy for releasing new versions of an application with minimal downtime and risk. It is used in Kubernetes, as well as other deployment environments, to achieve a smooth transition between application versions with the ability to quickly rollback if issues are detected. - In a blue-green deployment, there are two environments, named "blue" and "green," that run concurrently. One environment is designated as the live or production environment (let's say "blue"), while the other environment ("green") is idle or running a newer version of the application for testing. Both environments have identical configurations and infrastructure. - -.. 
_User Policies: - -User policies for stack setup -============================= - -Prerequisites (For non admin users only) -######################################### - -Feature Store users need to provide the following access permissions in order to deploy the feature store terraform stack. Below mentioned are the policy statements required for terraform stack deployment - -.. code-block:: shell - - define tenancy service_tenancy as ocid1.tenancy.oc1..aaaaaaaa462hfhplpx652b32ix62xrdijppq2c7okwcqjlgrbknhgtj2kofa - endorse group to read repos in tenancy service_tenancy - allow group to manage orm-stacks in compartment - allow group to manage orm-jobs in compartment - allow group to manage object-family in compartment - allow group to manage users in compartment - allow group to manage instance-family in compartment - allow group to manage tag-namespaces in compartment - allow group to manage groups in compartment - allow group to manage policies in compartment - allow group to manage dynamic-groups in compartment - allow group to manage virtual-network-family in compartment - allow group to manage functions-family in compartment - allow group to inspect compartments in compartment - allow group to manage cluster-family in compartment - allow group to manage mysql-family in compartment - allow group to manage api-gateway-family in compartment - -Deploy Using Oracle Resource Manager -==================================== - -.. note:: - - If you aren't already signed in, when prompted, enter the tenancy and user credentials. Review and accept the terms and conditions. Refer :ref:`Release Notes` for getting the latest conda pack and ``SERVICE_VERSION``. - -.. important:: - - Refer :ref:`User Policies` to create feature store stack for non admin users. No policies are explicitly required for admin user. - - -1. Download the stack from ``Terraform Stack`` column in :ref:`Release Notes`. Refer :ref:`User Policies` to create feature store stack for non admin users. No policies are explicitly required for admin user. - -2. Click to deploy the stack - -3. Select the region and compartment where you want to deploy the stack. - -4. Follow the on-screen prompts and instructions to create the stack. - -5. After creating the stack, click Terraform Actions, and select Plan. - -6. Wait for the job to be completed, and review the plan. - -7. To make any changes, return to the Stack Details page, click Edit Stack, and make the required changes. Then, run the Plan action again. - -8. If no further changes are necessary, return to the Stack Details page, click Terraform Actions, and select Apply. - -Deploy Using the Oracle CLI -============================== - -Prerequisites -############# - -1. Install `oci-cli `__ if not installed - -Steps -##### - -.. note:: - - Refer :ref:`Release Notes` for getting the latest conda pack and ``SERVICE_VERSION``. Remember to replace the values within angle brackets ("<>" symbols) in the command above with the relevant values for your environment. Also, Refer :ref:`User Policies` to create feature store stack for non admin users. No policies are explicitly required for admin user. - -1. Run the shell command. - - .. 
code-block:: shell - - rm -f feature-store-terraform.zip \ - && wget https://objectstorage.us-ashburn-1.oraclecloud.com/p/vZogtXWwHqbkGLeqyKiqBmVxdbR4MK4nyOBqDsJNVE4sHGUY5KFi4T3mOFGA3FOy/n/idogsu2ylimg/b/oci-feature-store/o/beta/terraform/feature-store-terraform.zip \ - && oci resource-manager stack create \ - --compartment-id \ - --config-source feature-store-terraform.zip \ - --terraform-version 1.1.x \ - --variables '{ - "service_version": "", - "tenancy_ocid": "", - "compartment_ocid": "", - "region": "", - "user_ocid": "" - }' \ - --display-name "Feature Store Stack" \ - --working-directory "feature-store-terraform" \ - | tee stack_output.json \ - && stack_id=$(jq -r '.data."id"' stack_output.json) \ - && oci resource-manager job create-apply-job \ - --execution-plan-strategy AUTO_APPROVED \ - --stack-id $stack_id \ - --wait-for-state SUCCEEDED \ - --wait-for-state FAILED - -Update Feature Store Stack with the Latest using OCI CLI -======================================================== - -Prerequisites -############# - -1. Install `oci-cli `__ if not installed -2. In order to update the stack, get the from console by navigating to `OCI Resource Manager `__. - -.. figure:: figures/resource_manager.png - -.. figure:: figures/resource_manager_home.png - -.. figure:: figures/resource_manager_feature_store_stack.png - - -Steps -##### - -.. note:: - - Refer :ref:`Release Notes` for getting the latest conda pack and ``SERVICE_VERSION``. Remember to replace the values within angle brackets ("<>" symbols) in the command above with the relevant values for your environment. Also, Refer :ref:`User Policies` to create feature store stack for non admin users. No policies are explicitly required for admin user. - -1. Run the shell command. - - .. code-block:: shell - - rm -f feature-store-terraform.zip \ - && wget https://objectstorage.us-ashburn-1.oraclecloud.com/p/vZogtXWwHqbkGLeqyKiqBmVxdbR4MK4nyOBqDsJNVE4sHGUY5KFi4T3mOFGA3FOy/n/idogsu2ylimg/b/oci-feature-store/o/beta/terraform/feature-store-terraform.zip \ - && oci resource-manager stack update \ - --stack-id \ - --variables '{"service_version": "", "tenancy_ocid": "", "compartment_ocid": "", "region": "", "user_ocid": ""}' \ - --config-source "feature-store-terraform.zip" \ - --working-directory "feature-store-terraform" --force \ - && oci resource-manager job create-apply-job \ - --execution-plan-strategy AUTO_APPROVED \ - --stack-id \ - --wait-for-state SUCCEEDED --wait-for-state FAILED - -Terraform Variables (Advanced) -=============================== - -A complete listing of the Terraform variables used in this stack are referenced below: - -.. list-table:: Terraform Variables - :header-rows: 1 - - * - Variable Name - - Value - - Description - * - `service_version` - - `0.1.218.master` - - The version of API to be deployed in customer tenancy. - * - `spec_version` - - `0.1.218.master` - - The version of API specs to be deployed in customer tenancy. - * - `deployment_name` - - `DEFAULT_NAME` - - Name of the deployment. - * - `db_name` - - `DEFAULT_NAME` - - Name of ATP/MySQL database. - * - `db_config` - - `DEFAULT_NAME` - - Config for db. - * - `compartment_ocid` - - `DEFAULT_NAME` - - OCID of compartment to deploy the feature store stack. - * - `vcn_details` - - `DEFAULT_NAME` - - VCN details required from user in case they are on-boarding the database which has network access within their VCN.. 
- * - `user_ocid` - - `ocid1.user..` - - If you do not have permission to create users, provide the user_ocid of a user that has permission to pull images from OCI Registry. - * - `tenancy_ocid` - - `ocid1.tenancy..` - - OCID of tenancy to deploy the feature store stack. - * - `ssh_authorized_key` - - `` - - OCID of tenancy to deploy the feature store stack. - * - `ocir_puller_auth_token` - - `` - - If the user provided above already has an auth_token to use, provide it here. If null a new token will be created. This requires that the user has 1 token at most already (as there is a limit of 2 tokens per user) . - * - `ocir_puller_group_ocid` - - `ocid1.group..` - - If you have permission to create users, and a group already exists with policies to pull images from OCI Registry, you can provide the group_ocid and a new user will be created and be made a member of this group. Leave null if you are providing a ocir_puller_user_ocid . - * - `ocir_puller_user_ocid` - - `ocid1.user..` - - If you do not have permission to create users, provide the user_ocid of a user that has permission to create mysql and object storage buckets. - * - `feature_store_user_group_id` - - `ocid1.group..` - - Provide the feature store user group id if the user is not an administrator. - - -.. note:: - Bring your own database (BYODB): Feature store does not support private access using private endpoint and private access gateway for ATP instances. - - User VCN Deployment - ################### - - User can provide the existing VCN details in order for the feature store to use the existing VCN. Feature store terraform stack provides a terraform vcn variable which takes VCN details as mentioned below: - - .. list-table:: user_vcn - :header-rows: 1 - - * - Variable Name - - Value - - Description - * - `vcn_id` - - `ocid1.vcn.oc1.iad.xxxxxxxxxxxx` - - The ocid of the VCN where user wants to deploy feature store. - * - `vcn_cidr` - - `10.0.0.0/16` - - The VCN CIDR range to be used. - * - `subnet_suffix` - - `8` - - The subnet suffix to be used in order to create service related subnets(for e.g. 10.0.0.0/16 + 8 => 10.0.0.0/24). - * - `max_subnet` - - `16` - - This is an optional variable which tells how many maximum subnet creations are allowed within the CIDR range. - * - 'dhcp_options_id' - - 'ocid1.dhcpoptions.oc1.iad.xxxxxxxxx' - - DHCP options ocid is required for instance configuration within the VCN - * - `igw_id` - - `ocid1.internetgateway.oc1.iad.xxxxxxxxxxxx` - - This is an optional variable which takes internet gateway ocid as an input. Feature store creates the IGW if not provided. - * - `nat_gw_id` - - `ocid1.natgateway.oc1.iad.xxxxxxxxxx` - - This is an optional variable which takes nat gateway ocid as an input. Feature store creates the NAT gateway if not provided. - * - `sgw_id` - - `ocid1.servicegateway.oc1.iad.xxxxxxxxxx` - - This is an optional variable which takes service gateway ocid as an input. Feature store does not create SGW even when its NULL or does not enable SGW till the time user explicitly using sgw_enable. This is done to ensure that SGW is only enabled for network resources (for e.g. ATP) which allow access through SGW. - * - `sgw_enable` - - `false` - - Enable service gateway usage or creation depending upon the sgw_id provided by the user. - * - `private_route_table_id` - - `ocid1.routetable.oc1.iad.xxxxxxxxxxxxxxxx` - - This is an optional variable which takes private route table ocid as an input. 
If user provides this then it would be user's reponsibilty to ensure that the nat gateway & service gateway(if applicable) route rules have been added for feature store service access. Feature store creates a private route table with supporting route rules if not provided. - * - `public_route_table_id` - - `ocid1.routetable.oc1.iad.xxxxxxxxxxxxxxxx` - - This is an optional variable which takes public route table ocid as an input. If user provides this then it would be user's reponsibilty to ensure that the internet gateway route rule has been added for feature store service access. Feature store creates a public route table with supporting route rules if not provided. - - Feature store is deployed in feature store specific subnets and security list access are maintained on the basis of details provided by the user. - - - User Input - ########## - - User will need to provide the following details in order to onboard their own database instances. - - 1. DB Config: This is general database configuration which is required for the purpose of initial database setup for BYODB or Feature store's own database setup. - - .. list-table:: db_config - :header-rows: 1 - - * - Variable Name - - Value - - Description - * - `vault_ocid` - - `ocid1.vault.oc1.iad.b5sb3bclaaaog.xxxxxxxxxxxxx` - - The ocid of the vault where user has kept the atp / mysql secret. This can be set to null in case of Feature store's own db setup. - * - `vault_compartment_id` - - `ocid1.tenancy.oc1.iad.b5sb3bclaaaog.xxxxxxxxxxxxx` - - The ocid of the vault compartment where user has created vault. This can be set to null in case of Feature store's own db setup. - * - `db_type` - - `mysql` - - The database type could be mysql /atp. - * - `db_secret_source` - - `VAULT` - - The database secret source. It should be kept VAULT for BYODB use case. It can be OSS for ATP and LOCAL for MYSQL in case of default feature store deployment without BYODB. - * - `user_db` - - `false` - - set user db to true to enable customer database support (BYODB). - - 2. User DB Config: User specific details in order to onboard user database, all of these fields can be kept null if user database is not onboarded - - .. list-table:: user_db_config - :header-rows: 1 - - * - Variable Name - - Value - - Description - * - `db_system_name` - - `featurestoretestatp_xxxx` - - The database instance name. - * - `db_username` - - `admin` - - The username for the database. - * - `db_password_secret` - - `test_atp_xxxx` - - Vault database password secret - - .. tabs:: - - .. tab:: MySQL DB Config: MySQL database configuration - - MySQL Instance can only be accessed within the network and for that user will need to deploy the Feature store within their VCN. User will also provide the vault secret for the MySQL database password. - Please ensure that ingress rules are in place to provide VCN access for the MySQL instance. - Please refer to this link (https://docs.oracle.com/en-us/iaas/mysql-database/doc/networking-setup-mysql-db-systems.html) for more details: - - .. list-table:: mysql_db_config - :header-rows: 1 - - * - Variable Name - - Value - - Description - * - `mysql_db_ip_addr` - - `192.168.xxx.xxxx` - - MySQL database IP address. - * - `mysql_db_port` - - `3306` - - mysql db port - - .. tab:: ATP DB Config: ATP database configuration - - The existing ATP instance can have two type of access: - - 1. Public Access: In this case the ATP instance can be accessed either with Feature store deployed in its own VCN or in user VCN. 
User will need to provide the vault secret names which will be used for ATP connection. - - 2. Network Access: If ATP instance has network access within VCN only then in cases like these User need to deploy feature store in VCN which has ATP access. User will need to provide the vault secret names which will be used for ATP connection. - - - .. list-table:: atp_db_config - :header-rows: 1 - - * - Variable Name - - Value - - Description - * - `wallet_file_secret` - - `["cwallet.sso", "ewallet.p12", "keystore.jks", "ojdbc.properties", "tnsnames.ora", "truststore.jks", "sqlnet.ora"]` - - List of ATP Wallet files vault secrets base64 encoded. Please ensure to encode the wallet files to base64 format and then push them as base64 encoded string to Vault. - * - `wallet_password_secret` - - `example-secret` - - Vault wallet password secret +=================================== +Terraform: Setting Up a Feature Store +=================================== + +The following shows a terraform stack deployment of the feature store resources. + +.. figure:: figures/feature_store_deployment.png + :width: 400 + +Blue-green deployment is a strategy for releasing new versions of an application with minimal downtime and risk. It is used in Kubernetes, and other deployment environments, to achieve a smooth transition between application versions with the ability to quickly rollback if issues are detected. In a blue-green deployment, there are two environments named blue and green that run concurrently. One environment is designated as the live or production environment called blue. While the other environment, named green, is idle or running a newer version of the application for testing. Both environments have identical configurations and infrastructure. + +.. _User Policies: + +Required Policies for a Terraform Stack +============================= + +Feature store requires the following policy statements to deploy the feature store in a Terraform stack: + +.. code-block:: shell + + define tenancy service_tenancy as ocid1.tenancy.oc1..aaaaaaaa462hfhplpx652b32ix62xrdijppq2c7okwcqjlgrbknhgtj2kofa + endorse group to read repos in tenancy service_tenancy + allow group to manage orm-stacks in compartment + allow group to manage orm-jobs in compartment + allow group to manage object-family in compartment + allow group to manage users in compartment + allow group to manage instance-family in compartment + allow group to manage tag-namespaces in compartment + allow group to manage groups in compartment + allow group to manage policies in compartment + allow group to manage dynamic-groups in compartment + allow group to manage virtual-network-family in compartment + allow group to manage functions-family in compartment + allow group to inspect compartments in compartment + allow group to manage cluster-family in compartment + allow group to manage mysql-family in compartment + allow group to manage api-gateway-family in compartment + +No policies are explicitly required for administrator users. + +Deploy Using Oracle Resource Manager +==================================== + +If you aren't already signed in, enter the tenancy and user credentials. Review and accept the terms and conditions. Review the :ref:`Release Notes` to obtain the latest conda environment and ``SERVICE_VERSION``. + +1. Ensure that you have the required policy statements. + +2. Download the stack from the ``Terraform Stack`` column in the :ref:`Release Notes`. + +3. Click to deploy the stack. + +4. 
Select the region and compartment where you want to deploy the stack. + +5. Follow the prompts and instructions to create the stack. + +6. After creating the stack, click **Terraform Actions**, and select **Plan**. + +7. Wait for the job to complet, and then review the plan. + +8. To make changes return to the Stack Details page, click **Edit Stack**, and make the required changes. Then run the Plan action again. + +9. If no further changes are necessary, return to the Stack Details page, click **Terraform Actions**, and click **Apply**. + +Deploy Using the Oracle CLI +============================== + +Prerequisites +############# + +Install `oci-cli `__ if it's not installed. + +Steps +##### + +.. note:: + + Review the :ref:`Release Notes` to obtain the latest conda environment and ``SERVICE_VERSION``. Remember to replace the values within angle brackets ("<>" symbols) in the commands with the relevant values for your environment. + +1. Ensure that you have the required policy statements. + +2. Run the shell command: + + .. code-block:: shell + + rm -f feature-store-terraform.zip \ + && wget https://objectstorage.us-ashburn-1.oraclecloud.com/p/vZogtXWwHqbkGLeqyKiqBmVxdbR4MK4nyOBqDsJNVE4sHGUY5KFi4T3mOFGA3FOy/n/idogsu2ylimg/b/oci-feature-store/o/beta/terraform/feature-store-terraform.zip \ + && oci resource-manager stack create \ + --compartment-id \ + --config-source feature-store-terraform.zip \ + --terraform-version 1.1.x \ + --variables '{ + "service_version": "", + "tenancy_ocid": "", + "compartment_ocid": "", + "region": "", + "user_ocid": "" + }' \ + --display-name "Feature Store Stack" \ + --working-directory "feature-store-terraform" \ + | tee stack_output.json \ + && stack_id=$(jq -r '.data."id"' stack_output.json) \ + && oci resource-manager job create-apply-job \ + --execution-plan-strategy AUTO_APPROVED \ + --stack-id $stack_id \ + --wait-for-state SUCCEEDED \ + --wait-for-state FAILED + +Update Feature Store Stack Using OCI CLI +======================================================== + +Prerequisites +############# + +1. Install `oci-cli `__ if it's not installed. +2. T update the stack, get the from the stack details page in the Console using `Resource Manager `__: + +.. figure:: figures/resource_manager.png + +.. figure:: figures/resource_manager_home.png + +.. figure:: figures/resource_manager_feature_store_stack.png + + +Steps +##### + +.. note:: + + Review the :ref:`Release Notes` to obtain the latest conda environment and ``SERVICE_VERSION``. Remember to replace the values within angle brackets ("<>" symbols) in the commands with the relevant values for your environment. + +1. Run the shell command: + + .. code-block:: shell + + rm -f feature-store-terraform.zip \ + && wget https://objectstorage.us-ashburn-1.oraclecloud.com/p/vZogtXWwHqbkGLeqyKiqBmVxdbR4MK4nyOBqDsJNVE4sHGUY5KFi4T3mOFGA3FOy/n/idogsu2ylimg/b/oci-feature-store/o/beta/terraform/feature-store-terraform.zip \ + && oci resource-manager stack update \ + --stack-id \ + --variables '{"service_version": "", "tenancy_ocid": "", "compartment_ocid": "", "region": "", "user_ocid": ""}' \ + --config-source "feature-store-terraform.zip" \ + --working-directory "feature-store-terraform" --force \ + && oci resource-manager job create-apply-job \ + --execution-plan-strategy AUTO_APPROVED \ + --stack-id \ + --wait-for-state SUCCEEDED --wait-for-state FAILED + +Terraform Variables +=============================== + +The following Terraform variables used in this stack are: + +.. 
list-table:: + :header-rows: 1 + + * - Variable Name + - Value + - Description + * - `service_version` + - `0.1.218.master` + - The version of API to be deployed in customer tenancy. + * - `spec_version` + - `0.1.218.master` + - The version of API specs to be deployed in customer tenancy. + * - `deployment_name` + - `DEFAULT_NAME` + - Name of the deployment. + * - `db_name` + - `DEFAULT_NAME` + - Name of the ATP or MySQL database. + * - `db_config` + - `DEFAULT_NAME` + - configuration for the database. + * - `compartment_ocid` + - `DEFAULT_NAME` + - OCID of compartment to deploy the feature store stack in. + * - `vcn_details` + - `DEFAULT_NAME` + - VCN details required to onboard the database that has network access within the VCN. + * - `user_ocid` + - `ocid1.user..` + - If you don't have permission to create users, provide the user_ocid of a user that has permission to retrieve images from the OCI Registry. + * - `tenancy_ocid` + - `ocid1.tenancy..` + - OCID of the tenancy to deploy the feature store stack in. + * - `ssh_authorized_key` + - `` + - OCID of tenancy to deploy the feature store stack in. + * - `ocir_puller_auth_token` + - `` + - If the specified user has an auth_token to use, then specify it. If null, a new token is created. This variable requires that the user has one token at most . There is a limit of two tokens per user. + * - `ocir_puller_group_ocid` + - `ocid1.group..` + - If you have permission to create users and a group already exists with policies to pull images from OCI Registry, you can provide the group_ocid to create a new user and add it to this group. Leave null if you are specifying an ocir_puller_user_ocid. + * - `ocir_puller_user_ocid` + - `ocid1.user..` + - If you do not have permission to create users, provide the user_ocid of a user that has permission to create MySQL and Object Storage buckets. + * - `feature_store_user_group_id` + - `ocid1.group..` + - Specify the feature store user group ID if the specified user is not an administrator. + + +User VCN Deployment +=============================== +Bring your own database (BYODB) is not supported by feature store because it doesn't support private access using a private endpoint and private access gateway for ATP instances. + +You can specify VCN details for the feature store to use an existing VCN. A Terraform stack feature store has the following Terraform VCN variables: + + .. list-table:: + :header-rows: 1 + + * - Variable Name + - Value + - Description + * - `vcn_id` + - `ocid1.vcn.oc1.iad.xxxxxxxxxxxx` + - The OCID of the VCN where you want to deploy the feature store. + * - `vcn_cidr` + - `10.0.0.0/16` + - The VCN CIDR range to be use. + * - `subnet_suffix` + - `8` + - The subnet suffix to be used to create feature store related subnets. For example, 10.0.0.0/16 + 8 => 10.0.0.0/24. + * - `max_subnet` + - `16` + - Optional. Sets the maximum subnet creations that are allowed within the CIDR range. + * - 'dhcp_options_id' + - 'ocid1.dhcpoptions.oc1.iad.xxxxxxxxx' + - DHCP options OCID is required for instance configuration within the VCN. + * - `igw_id` + - `ocid1.internetgateway.oc1.iad.xxxxxxxxxxxx` + - Optional. The internet gateway OCID. Feature store creates the IGW if not specified. + * - `nat_gw_id` + - `ocid1.natgateway.oc1.iad.xxxxxxxxxx` + - Optional. The NAT gateway OCID. The feature store creates the NAT gateway if not specified. + * - `sgw_id` + - `ocid1.servicegateway.oc1.iad.xxxxxxxxxx` + - Optional. The feature store gateway OCID. 
Feature store doesn't create SGW even when its null, or doesn't enable SGW until the time you explicitly set using sgw_enable. This ensures that SGW is only enabled for network resources (ATP for example) that allow access through SGW. + * - `sgw_enable` + - `false` + - Enable service gateway use or creation depending on the sgw_id specified. + * - `private_route_table_id` + - `ocid1.routetable.oc1.iad.xxxxxxxxxxxxxxxx` + - Optional. The private route table OCID. Ensure that the NAT gateway and feature store gateway (if applicable) route rules have been added for feature store access. The feature store creates a private route table with supporting route rules if not specified. + * - `public_route_table_id` + - `ocid1.routetable.oc1.iad.xxxxxxxxxxxxxxxx` + - Optional. The public route table OCID. Ensure that the internet gateway route rule are added for feature store access. The feature store creates a public route table with supporting route rules if not specified. + + Feature store is deployed in feature store specific subnets and security list access is maintained on the basis of details specified. + + + Onboarding a Database + ########## + + Specify the following details to onboard database instancesL + + 1. Use the following variables to specify a general database configuration that's required for the initial database setup for a BYODB or feature store database. + + .. list-table:: db_config + :header-rows: 1 + + * - Variable Name + - Value + - Description + * - `vault_ocid` + - `ocid1.vault.oc1.iad.b5sb3bclaaaog.xxxxxxxxxxxxx` + - The OCID of the vault where the ATP or MySQL secret is. Set to null if setting up a feature store database. + * - `vault_compartment_id` + - `ocid1.tenancy.oc1.iad.b5sb3bclaaaog.xxxxxxxxxxxxx` + - The OCID of the vault compartment. Set to null if setting up a feature store database. + * - `db_type` + - `mysql` + - Set the database type to ATP or MySQL. + * - `db_secret_source` + - `VAULT` + - The database secret source. Use VAULT for BYODB databases. Use OSS for an ATP database and LOCAL for MySQL for default feature store deployment without BYODB. + * - `user_db` + - `false` + - Set to true for BYODB. + + 1. Use the following variables to specify the details to onboard your database. All of these variables can be remain null if you don't want to onboard your database. + + .. list-table:: user_db_config + :header-rows: 1 + + * - Variable Name + - Value + - Description + * - `db_system_name` + - `featurestoretestatp_xxxx` + - The database instance name. + * - `db_username` + - `admin` + - The username for the database. + * - `db_password_secret` + - `test_atp_xxxx` + - The vault database password secret. + + .. tabs:: + + .. tab:: MySQL DB Config: MySQL database configuration + + A MySQL instance can only be accessed within the network so you must deploy the feature store in your VCN. Specify the vault secret for the MySQL database password. + Ensure that ingress rules are in place to provide VCN access for the MySQL instance, see `Networking `__ for more details. + + .. list-table:: mysql_db_config + :header-rows: 1 + + * - Variable Name + - Value + - Description + * - `mysql_db_ip_addr` + - `192.168.xxx.xxxx` + - MySQL database IP address. + * - `mysql_db_port` + - `3306` + - MySQL database port. + + .. tab:: ATP DB Config: ATP database configuration + + The existing ATP instance can have two types of access: + + 1. Public Access: The ATP instance can be accessed with feature store deployed in its own VCN, or in your VCN. 
Specify the vault secret names to use for the ATP database connection. + + 2. Network Access: If the ATP instance has network access in a VCN, then you must deploy the feature store in the VCN that has ATP access. Specify the vault secret names to use for the ATP connection. + + + .. list-table:: atp_db_config + :header-rows: 1 + + * - Variable Name + - Value + - Description + * - `wallet_file_secret` + - `["cwallet.sso", "ewallet.p12", "keystore.jks", "ojdbc.properties", "tnsnames.ora", "truststore.jks", "sqlnet.ora"]` + - List of ATP Wallet files vault secrets base64 encoded. Ensure to encode the wallet files to the base64 format, and then push them as base64 encoded strings to the vault. + * - `wallet_password_secret` + - `example-secret` + - The vault wallet password secret. diff --git a/ads/feature_store/docs/source/transformation.rst b/ads/feature_store/docs/source/transformation.rst index f8644aa4f..004112ac3 100644 --- a/ads/feature_store/docs/source/transformation.rst +++ b/ads/feature_store/docs/source/transformation.rst @@ -1,83 +1,80 @@ -Transformation -************** - -Transformations in a feature store refers to the operations and processes applied to raw data to create, modify or derive new features that can be used as inputs for ML Models. These transformations are crucial for improving the quality, relevance and usefulness of features which in turn can enhance the performance of ml models. It is an object that represents a transformation applied on the feature group and can be a pandas transformation or spark sql transformation. - -Define -====== - -In an ADS feature store module, you can either use the Python API or YAML to define a transformation. - - -With the specified way below, you can define a transformation and give it a name. -A ``Transformation`` instance will be created. - -.. tabs:: - - .. code-tab:: Python3 - :caption: Python - - from ads.feature_store.transformation import Transformation - - transformation = ( - Transformation - .with_name("") - .with_feature_store_id("") - .with_source_code("") - .with_transformation_mode("") - .with_description("") - .with_compartment_id("") - ) - - .. code-tab:: Python3 - :caption: YAML - - from ads.feature_store.transformation import Transformation - - yaml_string = """ - kind: transformation - spec: - compartmentId: ocid1.compartment.. - description: - name: - featureStoreId: - sourceCode: - transformationMode: - type: transformation - """ - - transformation = Transformation.from_yaml(yaml_string) - - -Create -====== - -You can call the ``create()`` method of the ``Transformation`` instance to create an transformation. - -.. code-block:: python3 - - # Create an transformation - transformation.create() - - -Load -==== - -Use the ``from_id()`` method from the ``Transformation`` class to load an existing transformation with its OCID provided. It returns a ``Transformation`` instance. - -.. code-block:: python3 - - from ads.feature_store.transformation import Transformation - - transformation = Transformation.from_id("ocid1.transformation..") - -Delete -====== - -Use the ``.delete()`` method on the ``Transformation`` instance to delete a transformation. - -A transformation can only be deleted when its associated entities are all deleted, - -.. code-block:: python3 - - transformation.delete() +Transformation +************** + +Transformations in a feature store us the operations and processes applied to raw data to create, modify, or derive new features that can be used as inputs for machine learning models. 
These transformations are important for improving the quality, relevance, and usefulness of features and can enhance the performance of models. A transformation is an object that represents a transformation applied on the feature group and can be a Pandas transformation or Spark SQL transformation. + +Define +====== + +In an ADS feature store module, you can use the Python API or a yaml file to define a transformation. + + +The following example defines a transformation and gives it a name. A ``Transformation`` instance is created. + +.. tabs:: + + .. code-tab:: Python3 + :caption: Python + + from ads.feature_store.transformation import Transformation + + transformation = ( + Transformation + .with_name("") + .with_feature_store_id("") + .with_source_code("") + .with_transformation_mode("") + .with_description("") + .with_compartment_id("") + ) + + .. code-tab:: Python3 + :caption: YAML + + from ads.feature_store.transformation import Transformation + + yaml_string = """ + kind: transformation + spec: + compartmentId: ocid1.compartment.. + description: + name: + featureStoreId: + sourceCode: + transformationMode: + type: transformation + """ + + transformation = Transformation.from_yaml(yaml_string) + + +Create +====== + +Use the ``create()`` method of the ``Transformation`` instance to create an transformation. + +.. code-block:: python3 + + # Create an transformation + transformation.create() + + +Load +==== + +Use the ``from_id()`` method from the ``Transformation`` class to load an existing transformation by specifiying its OCID. A ``Transformation`` instance is returned. + +.. code-block:: python3 + + from ads.feature_store.transformation import Transformation + + transformation = Transformation.from_id("") + +Delete +====== + +Use the ``.delete()`` method on the ``Transformation`` instance to delete a transformation. A transformation can only be deleted when its associated entities are all deleted. + +.. code-block:: python3 + + transformation.delete() diff --git a/ads/feature_store/docs/source/ui.rst b/ads/feature_store/docs/source/ui.rst index c15e48541..b4fe20731 100644 --- a/ads/feature_store/docs/source/ui.rst +++ b/ads/feature_store/docs/source/ui.rst @@ -1,63 +1,65 @@ -UI -*** -The users will have the convenience of utilizing a user-friendly UI interface for feature discovery via notebook extensions. The UI interface will empower users to identify which features belong to specific datasets and feature groups, establishing the process of exploring available features in a particular dataset or feature group. Groups within the organisation have a need to share the pipelines existing within the ecosystem to promote re-use of features if the pipelines already exists. Currently, The work happens in silos which makes discovery impossible and results in teams not able to leverage each other work. Below are a few examples of the interfaces which could help users discover features part of a particular feature group, discover features part of a particular dataset and how transformations are applied in order to get a know how of the source and destination of the data. - -Feature Store -============= -The "Feature Store" is the top-level entity within a feature store service. - - -Feature Group -============= -Feature Groups are instrumental in feature discovery as they offer a structured way to organize and manage features based on their semantic meaning or context. 
By grouping related features together, data scientists can efficiently locate and leverage relevant attributes for model development. Moreover, the versioning capability of Feature Groups ensures that changes in feature definitions are tracked, enabling reproducibility and aiding in evaluating the impact of feature modifications on model performance. Furthermore, the collaborative aspect of Feature Groups fosters knowledge sharing and reuse across data-driven projects, promoting efficiency and consistency in feature discovery processes. - -.. image:: figures/featuregroup.gif - -Validation -########### -Feature validation involves assessing the quality, relevance, and effectiveness of features through techniques like cross-validation, helping to prevent overfitting and improve model generalization. - -.. image:: figures/validation_fg.png - -Lineage -########### -Lineage tracking provides a historical record of data transformations and processing, ensuring transparency and reproducibility in feature engineering and model development. - -.. image:: figures/lineage_fg.png - -Stats -########### -Statistical analysis of features helps uncover insights about their distributions, central tendencies, and variations, aiding in feature selection and understanding data characteristics. - -.. image:: figures/stats_fg.png - -Dataset -====== -Datasets also support iterative experimentation, allowing data scientists to create various dataset configurations with different feature combinations and transformations, facilitating the discovery of the most valuable features for model training. - -.. image:: figures/dataset.gif - - -Lineage -########### -Lineage tracking provides a historical record of data transformations and processing, ensuring transparency and reproducibility in feature engineering and model development. - -.. image:: figures/lineage_d1.png - -.. image:: figures/lineage_d2.png - -Stats -########### -Statistical analysis of features helps uncover insights about their distributions, central tendencies, and variations, aiding in feature selection and understanding data characteristics. - -.. image:: figures/stats_d.png - -Entity -====== -An entity is a group of semantically related features. The first step a consumer of features would typically do when accessing the feature store service is to list the entities and the entities associated features. Another way to look at it is that an entity is an object or concept that is described by its features. Examples of entities could be customer, product, transaction, review, image, document, etc. - -Transformation -============== -Transformation constructs are a pivotal component of feature engineering, enabling data scientists to adapt and enhance features to improve model performance. With the flexibility to perform mathematical operations, scaling, normalization, handling missing data, and encoding categorical variables, transformation constructs empower data professionals to craft features that align with specific modeling requirements. - -.. image:: figures/transformations.gif +User Interface +**************** +You can use the user-friendly UI for feature discovery by using notebook extensions. The UI helps you to identify which features belong to specific datasets and feature groups, and establish the process of exploring available features in a particular dataset or feature group. + +Groups within the organization must share the pipelines existing in the ecosystem to promote reuse of features if the pipelines already exists. 
The work happens in silos, which makes discovery impossible and results in teams being unable to leverage each other's work. The following are a few examples of the interfaces that can help you discover the features that are part of a particular feature group or dataset, and see how transformations are applied so that you know the source and destination of the data.
+
+Feature Store
+=============
+The feature store is the top-level entity of the feature store service.
+
+
+Feature Group
+=============
+Feature groups are instrumental in feature discovery as they offer a structured way to organize and manage features based on their semantic meaning or context. By grouping related features together, data scientists can efficiently locate and leverage relevant attributes for model development. The versioning capability of feature groups ensures that changes in feature definitions are tracked. Tracking enables reproducibility and aids in evaluating the impact of feature modifications on model performance. The collaborative aspect of feature groups fosters knowledge sharing and reuse across data-driven projects, which promotes efficiency and consistency in the feature discovery process.
+
+.. image:: figures/featuregroup.gif
+
+Validation
+###########
+Feature validation is the assessment of the quality, relevance, and effectiveness of features through techniques like cross-validation. Validation helps to prevent overfitting and improves model generalization.
+
+.. image:: figures/validation_fg.png
+
+Lineage
+###########
+Lineage tracking provides a historical record of data transformations and processing, which ensures transparency and reproducibility in feature engineering and model development.
+
+.. image:: figures/lineage_fg.png
+
+Statistics
+###########
+Statistical analysis of features helps uncover insights about their distributions, central tendencies, and variations, which aids in feature selection and understanding data characteristics.
+
+.. image:: figures/stats_fg.png
+
+Dataset
+=======
+Datasets also support iterative experimentation, allowing data scientists to create various dataset configurations with different feature combinations and transformations, which facilitates the discovery of the most valuable features for model training.
+
+.. image:: figures/dataset.gif
+
+
+Lineage
+###########
+Lineage tracking provides a historical record of data transformations and processing, which ensures transparency and reproducibility in feature engineering and model development.
+
+.. image:: figures/lineage_d1.png
+
+.. image:: figures/lineage_d2.png
+
+Statistics
+###########
+Statistical analysis of features helps uncover insights about their distributions, central tendencies, and variations, which aids in feature selection and understanding data characteristics.
+
+.. image:: figures/stats_d.png
+
+Entity
+======
+An entity is a group of semantically related features. The first step a consumer of features typically takes when accessing the feature store service is to list the entities and each entity's associated features. Another way to look at it is that an entity is an object or concept that's described by its features. Examples of entities are customer, product, transaction, review, image, document, and so on.
+
+Transformation
+==============
+Transformation constructs are a pivotal component of feature engineering, and enable data scientists to adapt and enhance features to improve model performance.
The flexibility to perform mathematical operations, scaling, normalization, handling missing data, and encoding categorical variables, transformation constructs empowers data professionals to craft features that align with specific modeling requirements. + +.. image:: figures/transformations.gif diff --git a/ads/feature_store/entity.py b/ads/feature_store/entity.py index 0cb452ff4..5255cabd5 100644 --- a/ads/feature_store/entity.py +++ b/ads/feature_store/entity.py @@ -111,7 +111,7 @@ def _to_oci_fs_entity(self, **kwargs): kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.Entity` accepts. + Can be any attribute that `feature_store.models.Entity` accepts. Returns ------- @@ -252,7 +252,7 @@ def create(self, **kwargs) -> "Entity": ---------- kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.Entity` accepts. + Can be any attribute that `feature_store.models.Entity` accepts. Returns ------- diff --git a/ads/feature_store/execution_strategy/delta_lake/delta_lake_service.py b/ads/feature_store/execution_strategy/delta_lake/delta_lake_service.py index 361815b6d..d6ea68883 100644 --- a/ads/feature_store/execution_strategy/delta_lake/delta_lake_service.py +++ b/ads/feature_store/execution_strategy/delta_lake/delta_lake_service.py @@ -7,7 +7,7 @@ import logging from ads.common.decorator.runtime_dependency import OptionalDependency -from ads.feature_store.common.enums import IngestionMode +from ads.feature_store.common.enums import BatchIngestionMode from ads.feature_store.execution_strategy.engine.spark_engine import SparkEngine try: @@ -57,9 +57,10 @@ def write_dataframe_to_delta_lake( None. """ logger.info(f"target table name {target_table_name}") + if ( self.spark_engine.is_delta_table_exists(target_table_name) - and ingestion_mode.upper() == IngestionMode.UPSERT.value + and ingestion_mode.upper() == BatchIngestionMode.UPSERT.value ): logger.info(f"Upsert ops for target table {target_table_name} begin") @@ -341,3 +342,34 @@ def __get_insert_update_query_expression(feature_data_source_columns, table_name logger.info(f"get_insert_update_query_expression {feature_data_update_set}") return feature_data_update_set + + def write_stream_dataframe_to_delta_lake( + self, + stream_dataframe, + target_table, + output_mode, + query_name, + await_termination, + timeout, + checkpoint_dir, + feature_option_details, + ): + if query_name is None: + query_name = "insert_stream_" + target_table.split(".")[1] + + query = ( + stream_dataframe.writeStream.outputMode(output_mode) + .format("delta") + .option( + "checkpointLocation", + checkpoint_dir, + ) + .options(**self.get_delta_write_config(feature_option_details)) + .queryName(query_name) + .toTable(target_table) + ) + + if await_termination: + query.awaitTermination(timeout) + + return query diff --git a/ads/feature_store/execution_strategy/engine/spark_engine.py b/ads/feature_store/execution_strategy/engine/spark_engine.py index 77b2b3747..ccabe9051 100644 --- a/ads/feature_store/execution_strategy/engine/spark_engine.py +++ b/ads/feature_store/execution_strategy/engine/spark_engine.py @@ -186,19 +186,31 @@ def get_tables_from_database(self, database): return permanent_tables - def get_columns_from_table(self, table_name: str): + def get_output_columns_from_table_or_dataframe( + self, table_name: str = None, dataframe=None + ): """Returns the column(features) along with type from the given table. 
Args: table_name(str): A string specifying the name of table name for which columns should be returned. + dataframe: Dataframe containing the transformed dataframe. Returns: List[{"name": "","featureType": ""}] Returns the List of dictionary of column with name and type from the given table. + """ + if table_name is None and dataframe is None: + raise ValueError( + "Either 'table_name' or 'dataframe' must be provided to retrieve output columns." + ) + + if dataframe is not None: + feature_data_target = dataframe + else: + feature_data_target = self.spark.sql(f"SELECT * FROM {table_name} LIMIT 1") target_table_columns = [] - feature_data_target = self.spark.sql(f"SELECT * FROM {table_name} LIMIT 1") for field in feature_data_target.schema.fields: target_table_columns.append( diff --git a/ads/feature_store/execution_strategy/execution_strategy.py b/ads/feature_store/execution_strategy/execution_strategy.py index 66650e58e..7d9c5b7e5 100644 --- a/ads/feature_store/execution_strategy/execution_strategy.py +++ b/ads/feature_store/execution_strategy/execution_strategy.py @@ -42,6 +42,19 @@ def ingest_feature_definition( """ pass + @abstractmethod + def ingest_feature_definition_stream( + self, + feature_group, + feature_group_job: FeatureGroupJob, + dataframe, + query_name, + await_termination, + timeout, + checkpoint_dir, + ): + pass + @abstractmethod def ingest_dataset(self, dataset, dataset_job: DatasetJob): """ diff --git a/ads/feature_store/execution_strategy/spark/spark_execution.py b/ads/feature_store/execution_strategy/spark/spark_execution.py index caa74dd46..d7b9a7f91 100644 --- a/ads/feature_store/execution_strategy/spark/spark_execution.py +++ b/ads/feature_store/execution_strategy/spark/spark_execution.py @@ -87,6 +87,29 @@ def ingest_feature_definition( except Exception as e: raise SparkExecutionException(e).with_traceback(e.__traceback__) + def ingest_feature_definition_stream( + self, + feature_group, + feature_group_job: FeatureGroupJob, + dataframe, + query_name, + await_termination, + timeout, + checkpoint_dir, + ): + try: + return self._save_offline_dataframe_stream( + dataframe, + feature_group, + feature_group_job, + query_name, + await_termination, + timeout, + checkpoint_dir, + ) + except Exception as e: + raise SparkExecutionException(e).with_traceback(e.__traceback__) + def ingest_dataset(self, dataset, dataset_job: DatasetJob): try: self._save_dataset_input(dataset, dataset_job) @@ -283,12 +306,15 @@ def _save_offline_dataframe( # Get the output features output_features = get_features( - self.spark_engine.get_columns_from_table(target_table), feature_group.id + self.spark_engine.get_output_columns_from_table_or_dataframe( + target_table + ), + feature_group.id, ) logger.info(f"output features for the FeatureGroup: {output_features}") - # Compute Feature Statistics + # Compute Feature Statistics feature_statistics = StatisticsService.compute_stats_with_mlm( statistics_config=feature_group.oci_feature_group.statistics_config, input_df=featured_data, @@ -419,7 +445,9 @@ def _save_dataset_input(self, dataset, dataset_job: DatasetJob): # Get the output features output_features = get_features( - output_columns=self.spark_engine.get_columns_from_table(target_table), + output_columns=self.spark_engine.get_output_columns_from_table_or_dataframe( + table_name=target_table + ), parent_id=dataset.id, entity_type=EntityType.DATASET, ) @@ -484,3 +512,97 @@ def _update_job_and_parent_details( # Update both the parent and job entities. 
parent_entity.update() + + def _save_offline_dataframe_stream( + self, + dataframe, + feature_group, + feature_group_job, + query_name, + await_termination, + timeout, + checkpoint_dir, + ): + output_features = [] + output_details = { + "error_details": None, + "validation_output": None, + "commit_id": "commit_id", + "feature_statistics": None, + } + + try: + # Create database in hive metastore if not exist + database = feature_group.entity_id + self.spark_engine.create_database(database) + + target_table = f"{database}.{feature_group.name}" + + # Apply the transformation + if feature_group.transformation_id: + logger.info("Dataframe is transformation enabled.") + + # Get the Transformation Arguments if exists and pass to the transformation function. + transformation_kwargs = Base64EncoderDecoder.decode( + feature_group.transformation_kwargs + ) + + # Loads the transformation resource + transformation = Transformation.from_id(feature_group.transformation_id) + + featured_data = TransformationUtils.apply_transformation( + self._spark_session, + dataframe, + transformation, + transformation_kwargs, + ) + else: + logger.info("Transformation not defined.") + featured_data = dataframe + + # Get the output features + output_features = get_features( + self.spark_engine.get_output_columns_from_table_or_dataframe( + dataframe=featured_data + ), + feature_group.id, + ) + + self._update_job_and_parent_details( + parent_entity=feature_group, + job_entity=feature_group_job, + output_features=output_features, + output_details=output_details, + ) + + streaming_query = ( + self.delta_lake_service.write_stream_dataframe_to_delta_lake( + featured_data, + target_table, + feature_group_job.ingestion_mode, + query_name, + await_termination, + timeout, + checkpoint_dir, + feature_group_job.feature_option_details, + ) + ) + + return streaming_query + + except Exception as ex: + # Update Job with Failed Status + error_details = str(ex) + tb = traceback.format_exc() + logger.error( + f"FeatureGroup Stream Materialization Failed with : {type(ex)} with error message: {ex} and stacktrace {tb}", + ) + + output_details["error_details"] = error_details + + self._update_job_and_parent_details( + parent_entity=feature_group, + job_entity=feature_group_job, + output_features=output_features, + output_details=output_details, + ) diff --git a/ads/feature_store/feature_group.py b/ads/feature_store/feature_group.py index c11fc34a9..72d117dfc 100644 --- a/ads/feature_store/feature_group.py +++ b/ads/feature_store/feature_group.py @@ -17,7 +17,13 @@ from ads.common import utils from ads.common.decorator.runtime_dependency import OptionalDependency from ads.common.oci_mixin import OCIModelMixin -from ads.feature_store.common.enums import ExpectationType, EntityType +from ads.feature_store.common.enums import ( + ExpectationType, + EntityType, + StreamingIngestionMode, + IngestionType, + BatchIngestionMode, +) from ads.feature_store.common.exceptions import ( NotMaterializedError, ) @@ -36,7 +42,7 @@ ) from ads.feature_store.feature import Feature from ads.feature_store.feature_group_expectation import Expectation -from ads.feature_store.feature_group_job import IngestionMode, FeatureGroupJob +from ads.feature_store.feature_group_job import FeatureGroupJob from ads.feature_store.feature_option_details import FeatureOptionDetails from ads.feature_store.input_feature_detail import FeatureDetail, FeatureType from ads.feature_store.query.filter import Filter, Logic @@ -694,7 +700,7 @@ def create(self, **kwargs) -> "FeatureGroup": 
---------- kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.FeatureGroup` accepts. + Can be any attribute that `feature_store.models.FeatureGroup` accepts. Returns ------- @@ -784,7 +790,7 @@ def update(self, **kwargs) -> "FeatureGroup": ---------- kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.FeatureGroup` accepts. + Can be any attribute that `feature_store.models.FeatureGroup` accepts. Returns ------- @@ -865,7 +871,7 @@ def _build_feature_group_job( def materialise( self, input_dataframe: Union[DataFrame, pd.DataFrame], - ingestion_mode: IngestionMode = IngestionMode.OVERWRITE, + ingestion_mode: BatchIngestionMode = BatchIngestionMode.OVERWRITE, from_timestamp: str = None, to_timestamp: str = None, feature_option_details: FeatureOptionDetails = None, @@ -890,10 +896,10 @@ def materialise( # Create Feature Definition Job and persist it feature_group_job = self._build_feature_group_job( - ingestion_mode, - from_timestamp, - to_timestamp, - feature_option_details, + ingestion_mode=ingestion_mode, + from_timestamp=from_timestamp, + to_timestamp=to_timestamp, + feature_option_details=feature_option_details, ) # Create the Job @@ -912,6 +918,93 @@ def materialise( self, feature_group_job, input_dataframe ) + def materialise_stream( + self, + input_dataframe: Union[DataFrame], + checkpoint_dir: str, + query_name: Optional[str] = None, + ingestion_mode: StreamingIngestionMode = StreamingIngestionMode.APPEND, + await_termination: Optional[bool] = False, + timeout: Optional[int] = None, + feature_option_details: FeatureOptionDetails = None, + ): + """Ingest a Spark Structured Streaming Dataframe to the feature store. + + This method creates a long running Spark Streaming Query, you can control the + termination of the query through the arguments. + + It is possible to stop the returned query with the `.stop()` and check its + status with `.isActive`. + + !!! warning "Engine Support" + **Spark only** + + Stream ingestion using Pandas/Python as engine is currently not supported. + Python/Pandas has no notion of streaming. + + !!! warning "Data Validation Support" + `materialise_stream` does not perform any data validation using Great Expectations + even when a expectation suite is attached. + + # Arguments + input_dataframe: Features in Streaming Dataframe to be saved. + query_name: It is possible to optionally specify a name for the query to + make it easier to recognise in the Spark UI. Defaults to `None`. + ingestion_mode: Specifies how data of a streaming DataFrame/Dataset is + written to a streaming sink. (1) `"append"`: Only the new rows in the + streaming DataFrame/Dataset will be written to the sink. (2) + `"complete"`: All the rows in the streaming DataFrame/Dataset will be + written to the sink every time there is some update. (3) `"update"`: + only the rows that were updated in the streaming DataFrame/Dataset will + be written to the sink every time there are some updates. + If the query doesn’t contain aggregations, it will be equivalent to + append mode. Defaults to `"append"`. + await_termination: Waits for the termination of this query, either by + query.stop() or by an exception. If the query has terminated with an + exception, then the exception will be thrown. If timeout is set, it + returns whether the query has terminated or not within the timeout + seconds. Defaults to `False`. + timeout: Only relevant in combination with `await_termination=True`. + Defaults to `None`. 
+ checkpoint_dir: Checkpoint directory location. This will be used to as a reference to + from where to resume the streaming job. If `None` then hsfs will construct as + "insert_stream_" + online_topic_name. Defaults to `None`. + write_options: Additional write options for Spark as key-value pairs. + Defaults to `{}`. + + # Returns + `StreamingQuery`: Spark Structured Streaming Query object. + """ + + # Create Feature Definition Job and persist it + feature_group_job = self._build_feature_group_job( + ingestion_mode=ingestion_mode, + feature_option_details=feature_option_details, + ) + + # Create the Job + feature_group_job.create() + + # Update the feature group with corresponding job so that user can see the details about the job + self.with_job_id(feature_group_job.id) + + feature_group_execution_strategy = ( + OciExecutionStrategyProvider.provide_execution_strategy( + execution_engine=get_execution_engine_type(input_dataframe), + metastore_id=get_metastore_id(self.feature_store_id), + ) + ) + + return feature_group_execution_strategy.ingest_feature_definition_stream( + self, + feature_group_job, + input_dataframe, + query_name, + await_termination, + timeout, + checkpoint_dir, + ) + def get_last_job(self) -> "FeatureGroupJob": """Gets the Job details for the last running job. @@ -969,7 +1062,7 @@ def delete(self): None """ # Create Feature Definition Job and persist it - feature_group_job = self._build_feature_group_job(IngestionMode.DEFAULT) + feature_group_job = self._build_feature_group_job(BatchIngestionMode.DEFAULT) # Create the Job feature_group_job.create() diff --git a/ads/feature_store/feature_group_job.py b/ads/feature_store/feature_group_job.py index cae791da1..f2d2bba0e 100644 --- a/ads/feature_store/feature_group_job.py +++ b/ads/feature_store/feature_group_job.py @@ -8,12 +8,12 @@ import logging from copy import deepcopy from enum import Enum -from typing import Dict, List, Any +from typing import Dict, List, Any, Union import pandas from ads.common import utils -from ads.feature_store.common.enums import IngestionMode +from ads.feature_store.common.enums import BatchIngestionMode, StreamingIngestionMode from ads.feature_store.feature_option_details import FeatureOptionDetails from ads.feature_store.service.oci_feature_group_job import OCIFeatureGroupJob from ads.jobs.builders.base import Builder @@ -121,7 +121,7 @@ def _to_oci_fs_feature_group_run(self, **kwargs): kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.FeatureGroupJob` accepts. + Can be any attribute that `feature_store.models.FeatureGroupJob` accepts. Returns ------- @@ -331,16 +331,19 @@ def ingestion_mode(self) -> str: return self.get_spec(self.CONST_INGESTION_MODE) @ingestion_mode.setter - def ingestion_mode(self, ingestion_mode: IngestionMode) -> "FeatureGroupJob": + def ingestion_mode( + self, ingestion_mode: Union[BatchIngestionMode, StreamingIngestionMode] + ) -> "FeatureGroupJob": return self.with_ingestion_mode(ingestion_mode) - def with_ingestion_mode(self, ingestion_mode: IngestionMode) -> "FeatureGroupJob": + def with_ingestion_mode( + self, ingestion_mode: Union[BatchIngestionMode, StreamingIngestionMode] + ) -> "FeatureGroupJob": """Sets the mode of the dataset ingestion mode. Parameters ---------- - ingestion_mode: IngestionMode - The mode of the dataset ingestion mode. + ingestion_mode Returns ------- @@ -406,7 +409,7 @@ def create(self, **kwargs) -> "FeatureGroupJob": ---------- kwargs Additional kwargs arguments. 
- Can be any attribute that `oci.feature_store.models.FeatureGroupJob` accepts. + Can be any attribute that `feature_store.models.FeatureGroupJob` accepts. Returns ------- @@ -441,7 +444,7 @@ def update(self, **kwargs) -> "FeatureGroupJob": ---------- kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.FeatureGroupJob` accepts. + Can be any attribute that `feature_store.models.FeatureGroupJob` accepts. Returns ------- diff --git a/ads/feature_store/feature_store.py b/ads/feature_store/feature_store.py index 13b9b2494..547337046 100644 --- a/ads/feature_store/feature_store.py +++ b/ads/feature_store/feature_store.py @@ -110,7 +110,7 @@ def _to_oci_fs(self, **kwargs): kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.FeatureStore` accepts. + Can be any attribute that `feature_store.models.FeatureStore` accepts. Returns ------- @@ -271,7 +271,7 @@ def create(self, **kwargs) -> "FeatureStore": ---------- kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.FeatureStore` accepts. + Can be any attribute that `feature_store.models.FeatureStore` accepts. Returns ------- diff --git a/ads/feature_store/mixin/oci_feature_store.py b/ads/feature_store/mixin/oci_feature_store.py index c78087a1a..294f7b424 100644 --- a/ads/feature_store/mixin/oci_feature_store.py +++ b/ads/feature_store/mixin/oci_feature_store.py @@ -1,16 +1,21 @@ #!/usr/bin/env python # -*- coding: utf-8; -*- +from types import MethodType + from ads.common.decorator.utils import class_or_instance_method +from oci.signer import AbstractBaseSigner # Copyright (c) 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import logging +import email.utils import os +import oci +import feature_store_client.feature_store as fs logger = logging.getLogger(__name__) from ads.common.oci_mixin import OCIModelMixin -import oci.feature_store import yaml @@ -34,9 +39,7 @@ class OCIFeatureStoreMixin(OCIModelMixin): SERVICE_ENDPOINT = "service_endpoint" @classmethod - def init_client( - cls, **kwargs - ) -> oci.feature_store.feature_store_client.FeatureStoreClient: + def init_client(cls, **kwargs) -> fs.feature_store_client.FeatureStoreClient: default_kwargs: dict = cls._get_auth().get("client_kwargs", {}) fs_service_endpoint = ( @@ -70,13 +73,15 @@ def init_client( if fs_service_endpoint: kwargs[cls.SERVICE_ENDPOINT] = fs_service_endpoint - client = cls._init_client( - client=oci.feature_store.feature_store_client.FeatureStoreClient, **kwargs + client: fs.FeatureStoreClient = cls._init_client( + client=fs.FeatureStoreClient, **kwargs ) + signer: oci.Signer = client.base_client.signer + signer.do_request_sign = MethodType(fs_do_request_sign, signer) return client @property - def client(self) -> oci.feature_store.feature_store_client.FeatureStoreClient: + def client(self) -> fs.feature_store_client.FeatureStoreClient: return super().client @class_or_instance_method @@ -119,3 +124,22 @@ def list_resource( **kwargs, ).data return [cls.from_oci_model(item) for item in items] + + +def inject_missing_headers(request): + # Inject date, host, and content-type if missing + date = email.utils.formatdate(usegmt=True) + if request.path_url.startswith("/20230101"): + request.headers.setdefault("x-date", date) + request.headers.setdefault( + "path", request.method.lower() + " " + request.path_url + ) + request.headers.setdefault("date", date) + + +def 
fs_do_request_sign(self, request, enforce_content_headers=True): + inject_missing_headers(request) + do_request_sign = MethodType(AbstractBaseSigner.do_request_sign, self) + return do_request_sign(request, enforce_content_headers) + + # inject_missing_headers_og(request, sign_body, enforce_content_headers) diff --git a/ads/feature_store/service/oci_dataset.py b/ads/feature_store/service/oci_dataset.py index 263084e7d..b8ba6c37d 100644 --- a/ads/feature_store/service/oci_dataset.py +++ b/ads/feature_store/service/oci_dataset.py @@ -6,13 +6,17 @@ import datetime -import oci -from oci.feature_store.models import CreateDatasetDetails, UpdateDatasetDetails +import feature_store_client.feature_store as fs +from feature_store_client.feature_store.models import ( + Dataset, + CreateDatasetDetails, + UpdateDatasetDetails, +) from ads.feature_store.mixin.oci_feature_store import OCIFeatureStoreMixin -class OCIDataset(OCIFeatureStoreMixin, oci.feature_store.models.Dataset): +class OCIDataset(OCIFeatureStoreMixin, Dataset): """Represents an OCI Data Science dataset. This class contains all attributes of the `oci.data_science.models.Dataset`. The main purpose of this class is to link the `oci.data_science.models.Dataset` @@ -59,7 +63,7 @@ class OCIDataset(OCIFeatureStoreMixin, oci.feature_store.models.Dataset): """ @property - def client(self) -> oci.feature_store.feature_store_client.FeatureStoreClient: + def client(self) -> fs.feature_store_client.FeatureStoreClient: return super().client def create(self) -> "OCIDataset": diff --git a/ads/feature_store/service/oci_dataset_job.py b/ads/feature_store/service/oci_dataset_job.py index b957bc22a..e735acda3 100644 --- a/ads/feature_store/service/oci_dataset_job.py +++ b/ads/feature_store/service/oci_dataset_job.py @@ -8,8 +8,11 @@ import logging import time -import oci.feature_store -from oci.feature_store.models import CreateDatasetJobDetails, CompleteDatasetJobDetails +from feature_store_client.feature_store.models import ( + CreateDatasetJobDetails, + CompleteDatasetJobDetails, + DatasetJob, +) from ads.feature_store.mixin.oci_feature_store import OCIFeatureStoreMixin @@ -18,7 +21,7 @@ SLEEP_INTERVAL = 3 -class OCIDatasetJob(OCIFeatureStoreMixin, oci.feature_store.models.DatasetJob): +class OCIDatasetJob(OCIFeatureStoreMixin, DatasetJob): """Represents an OCI Data Science DatasetJob. This class contains all attributes of the `oci.data_science.models.DatasetJob`. The main purpose of this class is to link the `oci.data_science.models.DatasetJob` @@ -62,8 +65,8 @@ class OCIDatasetJob(OCIFeatureStoreMixin, oci.feature_store.models.DatasetJob): """ TERMINAL_STATES = [ - oci.feature_store.models.DatasetJob.LIFECYCLE_STATE_SUCCEEDED, - oci.feature_store.models.DatasetJob.LIFECYCLE_STATE_FAILED, + DatasetJob.LIFECYCLE_STATE_SUCCEEDED, + DatasetJob.LIFECYCLE_STATE_FAILED, ] def __init__(self, **kwargs) -> None: @@ -72,7 +75,7 @@ def __init__(self, **kwargs) -> None: Parameters ---------- kwargs: - Same as kwargs in oci.feature_store.models.OCIDatasetJob. + Same as kwargs in feature_store.models.OCIDatasetJob. Keyword arguments are passed into OCI feature group job model to initialize the properties. """ diff --git a/ads/feature_store/service/oci_entity.py b/ads/feature_store/service/oci_entity.py index 357a83e5f..0fb97bd4b 100644 --- a/ads/feature_store/service/oci_entity.py +++ b/ads/feature_store/service/oci_entity.py @@ -4,8 +4,12 @@ # Copyright (c) 2023 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import oci.feature_store -from oci.feature_store.models import CreateEntityDetails, UpdateEntityDetails +import feature_store_client.feature_store as fs +from feature_store_client.feature_store.models import ( + CreateEntityDetails, + UpdateEntityDetails, + Entity, +) from ads.feature_store.mixin.oci_feature_store import OCIFeatureStoreMixin import logging @@ -13,7 +17,7 @@ logger = logging.getLogger(__name__) -class OCIEntity(OCIFeatureStoreMixin, oci.feature_store.models.Entity): +class OCIEntity(OCIFeatureStoreMixin, Entity): """Represents an OCI Data Science Entity. This class contains all attributes of the `oci.data_science.models.Entity`. The main purpose of this class is to link the `oci.data_science.models.Entity` @@ -60,9 +64,7 @@ class OCIEntity(OCIFeatureStoreMixin, oci.feature_store.models.Entity): # Overriding default behavior @classmethod - def init_client( - cls, **kwargs - ) -> oci.feature_store.feature_store_client.FeatureStoreClient: + def init_client(cls, **kwargs) -> fs.feature_store_client.FeatureStoreClient: client = super().init_client(**kwargs) # Define the list entities callable to list the resources diff --git a/ads/feature_store/service/oci_feature_group.py b/ads/feature_store/service/oci_feature_group.py index f856ed548..8aadcf103 100644 --- a/ads/feature_store/service/oci_feature_group.py +++ b/ads/feature_store/service/oci_feature_group.py @@ -8,16 +8,16 @@ import pandas as pd from ads.common import utils -import oci -from oci.feature_store.models import ( +from feature_store_client.feature_store.models import ( CreateFeatureGroupDetails, UpdateFeatureGroupDetails, + FeatureGroup, ) from ads.feature_store.mixin.oci_feature_store import OCIFeatureStoreMixin -class OCIFeatureGroup(OCIFeatureStoreMixin, oci.feature_store.models.FeatureGroup): +class OCIFeatureGroup(OCIFeatureStoreMixin, FeatureGroup): """Represents an OCI Data Science feature group. This class contains all attributes of the `oci.data_science.models.FeatureDefinition`. The main purpose of this class is to link the `oci.data_science.models.FeatureDefinition` diff --git a/ads/feature_store/service/oci_feature_group_job.py b/ads/feature_store/service/oci_feature_group_job.py index 71c59b02f..5392ee34a 100644 --- a/ads/feature_store/service/oci_feature_group_job.py +++ b/ads/feature_store/service/oci_feature_group_job.py @@ -8,10 +8,12 @@ import logging import time -import oci.feature_store -from oci.feature_store.models import ( +import feature_store_client.feature_store as fs +from feature_store_client.feature_store.models import ( CreateFeatureGroupJobDetails, CompleteFeatureGroupJobDetails, + FeatureGroupJob, + DatasetJob, ) from ads.feature_store.mixin.oci_feature_store import OCIFeatureStoreMixin @@ -21,9 +23,7 @@ SLEEP_INTERVAL = 3 -class OCIFeatureGroupJob( - OCIFeatureStoreMixin, oci.feature_store.models.FeatureGroupJob -): +class OCIFeatureGroupJob(OCIFeatureStoreMixin, FeatureGroupJob): """Represents an OCI Data Science FeatureGroupJob. This class contains all attributes of the `oci.data_science.models.FeatureGroupJob`. 
The main purpose of this class is to link the `oci.data_science.models.FeatureGroupJob` @@ -67,8 +67,8 @@ class OCIFeatureGroupJob( """ TERMINAL_STATES = [ - oci.feature_store.models.DatasetJob.LIFECYCLE_STATE_SUCCEEDED, - oci.feature_store.models.DatasetJob.LIFECYCLE_STATE_FAILED, + DatasetJob.LIFECYCLE_STATE_SUCCEEDED, + DatasetJob.LIFECYCLE_STATE_FAILED, ] def __init__(self, **kwargs) -> None: @@ -77,7 +77,7 @@ def __init__(self, **kwargs) -> None: Parameters ---------- kwargs: - Same as kwargs in oci.feature_store.models.OCIFeatureGroupJob. + Same as kwargs in feature_store.models.OCIFeatureGroupJob. Keyword arguments are passed into OCI feature group job model to initialize the properties. """ @@ -86,9 +86,7 @@ def __init__(self, **kwargs) -> None: # Overriding default behavior @classmethod - def init_client( - cls, **kwargs - ) -> oci.feature_store.feature_store_client.FeatureStoreClient: + def init_client(cls, **kwargs) -> fs.feature_store_client.FeatureStoreClient: client = super().init_client(**kwargs) # Define the list entities callable to list the resources diff --git a/ads/feature_store/service/oci_feature_store.py b/ads/feature_store/service/oci_feature_store.py index f9f564b08..40133a67f 100644 --- a/ads/feature_store/service/oci_feature_store.py +++ b/ads/feature_store/service/oci_feature_store.py @@ -8,10 +8,10 @@ from functools import wraps from typing import Callable -import oci.feature_store -from oci.feature_store.models import ( +from feature_store_client.feature_store.models import ( CreateFeatureStoreDetails, UpdateFeatureStoreDetails, + FeatureStore, ) from ads.feature_store.mixin.oci_feature_store import OCIFeatureStoreMixin @@ -72,7 +72,7 @@ def wrapper(self, *args, **kwargs): return decorator -class OCIFeatureStore(OCIFeatureStoreMixin, oci.feature_store.models.FeatureStore): +class OCIFeatureStore(OCIFeatureStoreMixin, FeatureStore): """Represents an OCI Data Science feature store. This class contains all attributes of the `oci.data_science.models.FeatureStore`. The main purpose of this class is to link the `oci.data_science.models.FeatureStore` diff --git a/ads/feature_store/service/oci_lineage.py b/ads/feature_store/service/oci_lineage.py index 3176c29a3..6660f57d4 100644 --- a/ads/feature_store/service/oci_lineage.py +++ b/ads/feature_store/service/oci_lineage.py @@ -6,14 +6,18 @@ import logging -import oci.feature_store +import feature_store_client.feature_store as fs from ads.feature_store.mixin.oci_feature_store import OCIFeatureStoreMixin +from feature_store_client.feature_store.models import ( + Lineage, +) + logger = logging.getLogger(__name__) -class OCILineage(OCIFeatureStoreMixin, oci.feature_store.models.Lineage): +class OCILineage(OCIFeatureStoreMixin, Lineage): """Represents an OCI Data Science Lineage Resource Class. This class contains all attributes of the `oci.data_science.models.Lineaage`. The main purpose of this class is to link the `oci.data_science.models.Lineaage` @@ -58,14 +62,14 @@ def __init__(self, **kwargs) -> None: Parameters ---------- kwargs: - Same as kwargs in oci.feature_store.models.Lineage. + Same as kwargs in feature_store.models.Lineage. Keyword arguments are passed into OCI feature group Lineage model to initialize the properties. """ super().__init__(**kwargs) - def from_id(self, feature_store_lineage_resource_id: str, **kwargs) -> "Lineage": + def from_id(self, feature_store_lineage_resource_id: str, **kwargs) -> Lineage: """Gets lineage resource by feature store id. 
Parameters diff --git a/ads/feature_store/service/oci_transformation.py b/ads/feature_store/service/oci_transformation.py index c158e8e58..455331d7c 100644 --- a/ads/feature_store/service/oci_transformation.py +++ b/ads/feature_store/service/oci_transformation.py @@ -4,8 +4,10 @@ # Copyright (c) 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ -import oci.feature_store -from oci.feature_store.models import CreateTransformationDetails +from feature_store_client.feature_store.models import ( + CreateTransformationDetails, + Transformation, +) from ads.feature_store.mixin.oci_feature_store import OCIFeatureStoreMixin import logging @@ -13,7 +15,7 @@ logger = logging.getLogger(__name__) -class OCITransformation(OCIFeatureStoreMixin, oci.feature_store.models.Transformation): +class OCITransformation(OCIFeatureStoreMixin, Transformation): """Represents an OCI Data Science Transformation. This class contains all attributes of the `oci.data_science.models.Transformation`. The main purpose of this class is to link the `oci.data_science.models.Transformation` diff --git a/ads/feature_store/transformation.py b/ads/feature_store/transformation.py index 1ed3e5718..90f81e0ae 100644 --- a/ads/feature_store/transformation.py +++ b/ads/feature_store/transformation.py @@ -120,7 +120,7 @@ def _to_oci_fs_transformation(self, **kwargs): kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.Transformation` accepts. + Can be any attribute that `feature_store.models.Transformation` accepts. Returns ------- @@ -213,10 +213,8 @@ def transformation_mode(self) -> str: return self.get_spec(self.CONST_TRANSFORMATION_MODE) @transformation_mode.setter - def transformation_mode( - self, transformation_mode: TransformationMode - ) -> "Transformation": - return self.with_transformation_mode(transformation_mode) + def transformation_mode(self, transformation_mode: TransformationMode) -> None: + self.with_transformation_mode(transformation_mode) def with_transformation_mode( self, transformation_mode: TransformationMode @@ -327,7 +325,7 @@ def create(self, **kwargs) -> "Transformation": ---------- kwargs Additional kwargs arguments. - Can be any attribute that `oci.feature_store.models.Transformation` accepts. + Can be any attribute that `feature_store.models.Transformation` accepts. Returns ------- diff --git a/ads/jobs/ads_job.py b/ads/jobs/ads_job.py index b50c3d62d..8d1fd3eba 100644 --- a/ads/jobs/ads_job.py +++ b/ads/jobs/ads_job.py @@ -3,14 +3,20 @@ # Copyright (c) 2021, 2023 Oracle and/or its affiliates. 
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +import time from typing import List, Union, Dict from urllib.parse import urlparse import fsspec +import oci from ads.common.auth import default_signer from ads.jobs.builders.base import Builder from ads.jobs.builders.infrastructure.dataflow import DataFlow, DataFlowRun -from ads.jobs.builders.infrastructure.dsc_job import DataScienceJob, DataScienceJobRun +from ads.jobs.builders.infrastructure.dsc_job import ( + DataScienceJob, + DataScienceJobRun, + SLEEP_INTERVAL +) from ads.jobs.builders.runtimes.pytorch_runtime import PyTorchDistributedRuntime from ads.jobs.builders.runtimes.container_runtime import ContainerRuntime from ads.jobs.builders.runtimes.python_runtime import ( @@ -460,7 +466,29 @@ def run_list(self, **kwargs) -> list: A list of job run instances, the actual object type depends on the infrastructure. """ return self.infrastructure.run_list(**kwargs) + + def cancel(self, wait_for_completion: bool = True) -> None: + """Cancels the runs of the job. + Parameters + ---------- + wait_for_completion: bool + Whether to wait for run to be cancelled before proceeding. + Defaults to True. + """ + runs = self.run_list() + for run in runs: + run.cancel(wait_for_completion=False) + + if wait_for_completion: + for run in runs: + while ( + run.lifecycle_state != + oci.data_science.models.JobRun.LIFECYCLE_STATE_CANCELED + ): + run.sync() + time.sleep(SLEEP_INTERVAL) + def delete(self) -> None: """Deletes the job from the infrastructure.""" self.infrastructure.delete() diff --git a/ads/jobs/builders/infrastructure/dsc_job.py b/ads/jobs/builders/infrastructure/dsc_job.py index ee5f2966a..d67876012 100644 --- a/ads/jobs/builders/infrastructure/dsc_job.py +++ b/ads/jobs/builders/infrastructure/dsc_job.py @@ -725,9 +725,14 @@ def stop_condition(): return self - def cancel(self) -> DataScienceJobRun: + def cancel(self, wait_for_completion: bool = True) -> DataScienceJobRun: """Cancels a job run - This method will wait for the job run to be canceled before returning. + + Parameters + ---------- + wait_for_completion: bool + Whether to wait for job run to be cancelled before proceeding. + Defaults to True. Returns ------- @@ -735,9 +740,13 @@ def cancel(self) -> DataScienceJobRun: The job run instance. """ self.client.cancel_job_run(self.id) - while self.lifecycle_state != "CANCELED": - self.sync() - time.sleep(SLEEP_INTERVAL) + if wait_for_completion: + while ( + self.lifecycle_state != + oci.data_science.models.JobRun.LIFECYCLE_STATE_CANCELED + ): + self.sync() + time.sleep(SLEEP_INTERVAL) return self def __repr__(self) -> str: @@ -1473,7 +1482,7 @@ def _update_job_infra(self, dsc_job: DSCJob) -> DataScienceJob: if self.storage_mount: if not hasattr( - oci.data_science.models, "JobStorageMountConfigurationDetails" + oci.data_science.models, "StorageMountConfigurationDetails" ): raise EnvironmentError( "Storage mount hasn't been supported in the current OCI SDK installed." 
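The `cancel` changes above give both `Job` and `DataScienceJobRun` a non-blocking option. A minimal usage sketch (not part of the diff), assuming an existing job OCID and the public `ads.jobs` API; the OCID and variable names are illustrative only:

.. code-block:: python3

    from ads.jobs import Job

    # Load an existing Data Science job by OCID (placeholder OCID).
    job = Job.from_datascience_job("ocid1.datasciencejob.oc1..<unique_id>")

    # Request cancellation of every run and return immediately.
    job.cancel(wait_for_completion=False)

    # Or block until each run reaches the CANCELED lifecycle state,
    # polling with sync() at SLEEP_INTERVAL as in the implementation above.
    job.cancel(wait_for_completion=True)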
diff --git a/ads/model/artifact_uploader.py b/ads/model/artifact_uploader.py index 260761d34..743d2c856 100644 --- a/ads/model/artifact_uploader.py +++ b/ads/model/artifact_uploader.py @@ -10,6 +10,7 @@ from typing import Dict, Optional from ads.common import utils +from ads.common.object_storage_details import ObjectStorageDetails from ads.model.common import utils as model_utils from ads.model.service.oci_datascience_model import OCIDataScienceModel @@ -29,7 +30,10 @@ def __init__(self, dsc_model: OCIDataScienceModel, artifact_path: str): artifact_path: str The model artifact location. """ - if not os.path.exists(artifact_path): + if not ( + ObjectStorageDetails.is_oci_path(artifact_path) + or os.path.exists(artifact_path) + ): raise ValueError(f"The `{artifact_path}` does not exist") self.dsc_model = dsc_model @@ -45,7 +49,7 @@ def upload(self): ) as progress: self.progress = progress self.progress.update("Preparing model artifacts ZIP archive.") - self._prepare_artiact_tmp_zip() + self._prepare_artifact_tmp_zip() self.progress.update("Uploading model artifacts.") self._upload() self.progress.update( @@ -55,22 +59,19 @@ def upload(self): except Exception: raise finally: - self._remove_artiact_tmp_zip() + self._remove_artifact_tmp_zip() - def _prepare_artiact_tmp_zip(self) -> str: + def _prepare_artifact_tmp_zip(self) -> str: """Prepares model artifacts ZIP archive. - Parameters - ---------- - progress: (TqdmProgressBar, optional). Defaults to `None`. - The progress indicator. - Returns ------- str Path to the model artifact ZIP archive. """ - if os.path.isfile(self.artifact_path) and self.artifact_path.lower().endswith( + if ObjectStorageDetails.is_oci_path(self.artifact_path): + self.artifact_zip_path = self.artifact_path + elif os.path.isfile(self.artifact_path) and self.artifact_path.lower().endswith( ".zip" ): self.artifact_zip_path = self.artifact_path @@ -80,7 +81,7 @@ def _prepare_artiact_tmp_zip(self) -> str: ) return self.artifact_zip_path - def _remove_artiact_tmp_zip(self): + def _remove_artifact_tmp_zip(self): """Removes temporary created artifact zip archive.""" if ( self.artifact_zip_path @@ -112,7 +113,10 @@ class LargeArtifactUploader(ArtifactUploader): Attributes ---------- artifact_path: str - The model artifact location. + The model artifact location. Possible values are: + - object storage path to zip archive. Example: `oci://@/prefix/mymodel.zip`. + - local path to zip archive. Example: `./mymodel.zip`. + - local path to folder with artifacts. Example: `./mymodel`. artifact_zip_path: str The uri of the zip of model artifact. auth: dict @@ -124,6 +128,11 @@ class LargeArtifactUploader(ArtifactUploader): The OCI Object Storage URI where model artifacts will be copied to. The `bucket_uri` is only necessary for uploading large artifacts which size is greater than 2GB. Example: `oci://@/prefix/`. + + .. versionadded:: 2.8.10 + + If artifact_path is object storage path to a zip archive, bucket_uri will be ignored. + dsc_model: OCIDataScienceModel The data scince model instance. overwrite_existing_artifact: bool @@ -145,7 +154,7 @@ def __init__( self, dsc_model: OCIDataScienceModel, artifact_path: str, - bucket_uri: str, + bucket_uri: str = None, auth: Optional[Dict] = None, region: Optional[str] = None, overwrite_existing_artifact: Optional[bool] = True, @@ -159,11 +168,19 @@ def __init__( dsc_model: OCIDataScienceModel The data scince model instance. artifact_path: str - The model artifact location. - bucket_uri: str + The model artifact location. 
Possible values are: + - object storage path to zip archive. Example: `oci://@/prefix/mymodel.zip`. + - local path to zip archive. Example: `./mymodel.zip`. + - local path to folder with artifacts. Example: `./mymodel`. + bucket_uri: (str, optional). Defaults to `None`. The OCI Object Storage URI where model artifacts will be copied to. - The `bucket_uri` is only necessary for uploading large artifacts which + The `bucket_uri` is only necessary for uploading large artifacts from local which size is greater than 2GB. Example: `oci://@/prefix/`. + + .. versionadded:: 2.8.10 + + If `artifact_path` is object storage path to a zip archive, `bucket_uri` will be ignored. + auth: (Dict, optional). Defaults to `None`. The default authetication is set using `ads.set_auth` API. If you need to override the default, use the `ads.common.auth.api_keys` or @@ -179,11 +196,22 @@ def __init__( parallel_process_count: (int, optional). The number of worker processes to use in parallel for uploading individual parts of a multipart upload. """ + self.auth = auth or dsc_model.auth + if ObjectStorageDetails.is_oci_path(artifact_path): + if not artifact_path.endswith(".zip"): + raise ValueError( + f"The `artifact_path={artifact_path}` is invalid." + "The remote path for model artifact should be a zip archive, " + "e.g. `oci://@/prefix/mymodel.zip`." + ) + if not utils.is_path_exists(uri=artifact_path, auth=self.auth): + raise ValueError(f"The `{artifact_path}` does not exist.") + bucket_uri = artifact_path + if not bucket_uri: raise ValueError("The `bucket_uri` must be provided.") super().__init__(dsc_model=dsc_model, artifact_path=artifact_path) - self.auth = auth or dsc_model.auth self.region = region or utils.extract_region(self.auth) self.bucket_uri = bucket_uri self.overwrite_existing_artifact = overwrite_existing_artifact @@ -192,38 +220,38 @@ def __init__( def _upload(self): """Uploads model artifacts to the model catalog.""" - self.progress.update("Copying model artifact to the Object Storage bucket") - bucket_uri = self.bucket_uri - bucket_uri_file_name = os.path.basename(bucket_uri) - - if not bucket_uri_file_name: - bucket_uri = os.path.join(bucket_uri, f"{self.dsc_model.id}.zip") - elif not bucket_uri.lower().endswith(".zip"): - bucket_uri = f"{bucket_uri}.zip" - - if not self.overwrite_existing_artifact and utils.is_path_exists( - uri=bucket_uri, auth=self.auth - ): - raise FileExistsError( - f"The bucket_uri=`{self.bucket_uri}` exists. Please use a new file name or " - "set `overwrite_existing_artifact` to `True` if you wish to overwrite." - ) + self.progress.update("Copying model artifact to the Object Storage bucket") + if not bucket_uri == self.artifact_zip_path: + bucket_uri_file_name = os.path.basename(bucket_uri) + + if not bucket_uri_file_name: + bucket_uri = os.path.join(bucket_uri, f"{self.dsc_model.id}.zip") + elif not bucket_uri.lower().endswith(".zip"): + bucket_uri = f"{bucket_uri}.zip" + + if not self.overwrite_existing_artifact and utils.is_path_exists( + uri=bucket_uri, auth=self.auth + ): + raise FileExistsError( + f"The bucket_uri=`{self.bucket_uri}` exists. Please use a new file name or " + "set `overwrite_existing_artifact` to `True` if you wish to overwrite." 
+ ) - try: - utils.upload_to_os( - src_uri=self.artifact_zip_path, - dst_uri=bucket_uri, - auth=self.auth, - parallel_process_count=self._parallel_process_count, - force_overwrite=self.overwrite_existing_artifact, - progressbar_description="Copying model artifact to the Object Storage bucket.", - ) - except Exception as ex: - raise RuntimeError( - f"Failed to upload model artifact to the given Object Storage path `{self.bucket_uri}`." - f"See Exception: {ex}" - ) + try: + utils.upload_to_os( + src_uri=self.artifact_zip_path, + dst_uri=bucket_uri, + auth=self.auth, + parallel_process_count=self._parallel_process_count, + force_overwrite=self.overwrite_existing_artifact, + progressbar_description="Copying model artifact to the Object Storage bucket.", + ) + except Exception as ex: + raise RuntimeError( + f"Failed to upload model artifact to the given Object Storage path `{self.bucket_uri}`." + f"See Exception: {ex}" + ) self.progress.update("Exporting model artifact to the model catalog") self.dsc_model.export_model_artifact(bucket_uri=bucket_uri, region=self.region) diff --git a/ads/model/datascience_model.py b/ads/model/datascience_model.py index 8bbf6d0da..d7f03163e 100644 --- a/ads/model/datascience_model.py +++ b/ads/model/datascience_model.py @@ -11,6 +11,7 @@ import pandas from ads.common import utils +from ads.common.object_storage_details import ObjectStorageDetails from ads.config import COMPARTMENT_OCID, PROJECT_OCID from ads.feature_engineering.schema import Schema from ads.jobs.builders.base import Builder @@ -548,6 +549,11 @@ def create(self, **kwargs) -> "DataScienceModel": The OCI Object Storage URI where model artifacts will be copied to. The `bucket_uri` is only necessary for uploading large artifacts which size is greater than 2GB. Example: `oci://@/prefix/`. + + .. versionadded:: 2.8.10 + + If `artifact` is provided as an object storage path to a zip archive, `bucket_uri` will be ignored. + overwrite_existing_artifact: (bool, optional). Defaults to `True`. Overwrite target bucket artifact if exists. remove_existing_artifact: (bool, optional). Defaults to `True`. @@ -636,6 +642,11 @@ def upload_artifact( The OCI Object Storage URI where model artifacts will be copied to. The `bucket_uri` is only necessary for uploading large artifacts which size is greater than 2GB. Example: `oci://@/prefix/`. + + .. versionadded:: 2.8.10 + + If `artifact` is provided as an object storage path to a zip archive, `bucket_uri` will be ignored. + auth: (Dict, optional). Defaults to `None`. The default authentication is set using `ads.set_auth` API. If you need to override the default, use the `ads.common.auth.api_keys` or @@ -668,6 +679,13 @@ def upload_artifact( "timeout": timeout, } + if ObjectStorageDetails.is_oci_path(self.artifact): + if bucket_uri and bucket_uri != self.artifact: + logger.warn( + "The `bucket_uri` will be ignored and the value of `self.artifact` will be used instead." 
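As a minimal sketch of the by-reference flow introduced above, a caller can point the model artifact at a zip archive that is already staged in Object Storage, in which case `bucket_uri` is no longer required (the bucket name, namespace, and display name below are placeholders, not values from this patch):

    import ads
    from ads.common.object_storage_details import ObjectStorageDetails
    from ads.model import DataScienceModel

    ads.set_auth("resource_principal")

    artifact_path = "oci://<bucket_name>@<namespace>/prefix/mymodel.zip"

    # The uploader now accepts Object Storage paths, but only to zip archives.
    assert ObjectStorageDetails.is_oci_path(artifact_path)

    # Register the model by reference; the artifact is exported to the model catalog
    # directly from `artifact_path`, so any `bucket_uri` passed alongside it is ignored.
    model = (
        DataScienceModel()
        .with_display_name("my-large-model")
        .with_artifact(artifact_path)
        .create(remove_existing_artifact=False)
    )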
+ ) + bucket_uri = self.artifact + if bucket_uri or utils.folder_size(self.artifact) > _MAX_ARTIFACT_SIZE_IN_BYTES: if not bucket_uri: raise ModelArtifactSizeError( diff --git a/ads/model/deployment/model_deployment.py b/ads/model/deployment/model_deployment.py index 1b6e1c3d0..efc63db78 100644 --- a/ads/model/deployment/model_deployment.py +++ b/ads/model/deployment/model_deployment.py @@ -40,6 +40,7 @@ from ads.model.deployment.model_deployment_runtime import ( ModelDeploymentCondaRuntime, ModelDeploymentContainerRuntime, + ModelDeploymentMode, ModelDeploymentRuntime, ModelDeploymentRuntimeType, OCIModelDeploymentRuntimeType, @@ -80,11 +81,6 @@ class ModelDeploymentLogType: ACCESS = "access" -class ModelDeploymentMode: - HTTPS = "HTTPS_ONLY" - STREAM = "STREAM_ONLY" - - class LogNotConfiguredError(Exception): # pragma: no cover pass @@ -911,48 +907,59 @@ def predict( "`data` and `json_input` are both provided. You can only use one of them." ) - if auto_serialize_data: - data = data or json_input - serialized_data = serializer.serialize(data=data) - return send_request( - data=serialized_data, - endpoint=endpoint, - is_json_payload=_is_json_serializable(serialized_data), - header=header, - ) + try: + if auto_serialize_data: + data = data or json_input + serialized_data = serializer.serialize(data=data) + return send_request( + data=serialized_data, + endpoint=endpoint, + is_json_payload=_is_json_serializable(serialized_data), + header=header, + ) - if json_input is not None: - if not _is_json_serializable(json_input): - raise ValueError( - "`json_input` must be json serializable. " - "Set `auto_serialize_data` to True, or serialize the provided input data first," - "or using `data` to pass binary data." + if json_input is not None: + if not _is_json_serializable(json_input): + raise ValueError( + "`json_input` must be json serializable. " + "Set `auto_serialize_data` to True, or serialize the provided input data first," + "or using `data` to pass binary data." + ) + utils.get_logger().warning( + "The `json_input` argument of `predict()` will be deprecated soon. " + "Please use `data` argument. " ) - utils.get_logger().warning( - "The `json_input` argument of `predict()` will be deprecated soon. " - "Please use `data` argument. " - ) - data = json_input + data = json_input - is_json_payload = _is_json_serializable(data) - if not isinstance(data, bytes) and not is_json_payload: - raise TypeError( - "`data` is not bytes or json serializable. Set `auto_serialize_data` to `True` to serialize the input data." - ) - if model_name and model_version: - header["model-name"] = model_name - header["model-version"] = model_version - elif bool(model_version) ^ bool(model_name): - raise ValueError( - "`model_name` and `model_version` have to be provided together." + is_json_payload = _is_json_serializable(data) + if not isinstance(data, bytes) and not is_json_payload: + raise TypeError( + "`data` is not bytes or json serializable. Set `auto_serialize_data` to `True` to serialize the input data." + ) + if model_name and model_version: + header["model-name"] = model_name + header["model-version"] = model_version + elif bool(model_version) ^ bool(model_name): + raise ValueError( + "`model_name` and `model_version` have to be provided together." 
+ ) + prediction = send_request( + data=data, + endpoint=endpoint, + is_json_payload=is_json_payload, + header=header, ) - prediction = send_request( - data=data, - endpoint=endpoint, - is_json_payload=is_json_payload, - header=header, - ) - return prediction + return prediction + except oci.exceptions.ServiceError as ex: + # When bandwidth exceeds the allocated value, TooManyRequests error (429) will be raised by oci backend. + if ex.status == 429: + bandwidth_mbps = self.infrastructure.bandwidth_mbps or MODEL_DEPLOYMENT_BANDWIDTH_MBPS + utils.get_logger().warning( + f"Load balancer bandwidth exceeds the allocated {bandwidth_mbps} Mbps." + "To estimate the actual bandwidth, use formula: (payload size in KB) * (estimated requests per second) * 8 / 1024." + "To resolve the issue, try sizing down the payload, slowing down the request rate or increasing the allocated bandwidth." + ) + raise def activate( self, diff --git a/ads/model/deployment/model_deployment_infrastructure.py b/ads/model/deployment/model_deployment_infrastructure.py index 519b5a62f..0bce46cf8 100644 --- a/ads/model/deployment/model_deployment_infrastructure.py +++ b/ads/model/deployment/model_deployment_infrastructure.py @@ -223,6 +223,7 @@ def _load_default_properties(self) -> Dict: defaults[self.CONST_REPLICA] = DEFAULT_REPLICA if NB_SESSION_OCID: + nb_session = None try: nb_session = DSCNotebookSession.from_ocid(NB_SESSION_OCID) except Exception as e: diff --git a/ads/model/deployment/model_deployment_runtime.py b/ads/model/deployment/model_deployment_runtime.py index e6c743652..199c69ac2 100644 --- a/ads/model/deployment/model_deployment_runtime.py +++ b/ads/model/deployment/model_deployment_runtime.py @@ -21,6 +21,11 @@ class OCIModelDeploymentRuntimeType: CONTAINER = "OCIR_CONTAINER" +class ModelDeploymentMode: + HTTPS = "HTTPS_ONLY" + STREAM = "STREAM_ONLY" + + class ModelDeploymentRuntime(Builder): """A class used to represent a Model Deployment Runtime. @@ -173,7 +178,7 @@ def deployment_mode(self) -> str: str The deployment mode of model deployment. """ - return self.get_spec(self.CONST_DEPLOYMENT_MODE, None) + return self.get_spec(self.CONST_DEPLOYMENT_MODE, ModelDeploymentMode.HTTPS) def with_deployment_mode(self, deployment_mode: str) -> "ModelDeploymentRuntime": """Sets the deployment mode of model deployment. 
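The 429 handling above logs the bandwidth estimate as (payload size in KB) * (estimated requests per second) * 8 / 1024 Mbps and then re-raises the ServiceError. A small worked example of that formula (the payload size and request rate are assumed numbers, not values from this patch):

    # Estimate the load balancer bandwidth a deployment needs, per the formula in the warning above.
    payload_kb = 512          # assumed average request/response payload size in KB
    requests_per_second = 10  # assumed sustained request rate
    required_mbps = payload_kb * requests_per_second * 8 / 1024
    print(required_mbps)      # 40.0 -> well above the 10 Mbps default, so increase bandwidth_mbps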
diff --git a/ads/model/model_artifact_boilerplate/artifact_introspection_test/model_artifact_validate.py b/ads/model/model_artifact_boilerplate/artifact_introspection_test/model_artifact_validate.py index 070bd6911..989633bb6 100644 --- a/ads/model/model_artifact_boilerplate/artifact_introspection_test/model_artifact_validate.py +++ b/ads/model/model_artifact_boilerplate/artifact_introspection_test/model_artifact_validate.py @@ -30,7 +30,7 @@ HTML_PATH = os.path.join(_cwd, "resources", "template.html") CONFIG_PATH = os.path.join(_cwd, "resources", "config.yaml") PYTHON_VER_PATTERN = "^([3])(\.[6-9])(\.\d+)?$" -PAR_URL = "https://objectstorage.us-ashburn-1.oraclecloud.com/p/Ri7zFc_h91sxMdgnza9Qnqw3Ina8hf8wzDvEpAnUXMDOnUR1U1fpsaBUjUfgPgIq/n/ociodscdev/b/service-conda-packs/o/service_pack/index.json" +PAR_URL = "https://objectstorage.us-ashburn-1.oraclecloud.com/p/WyjtfVIG0uda-P3-2FmAfwaLlXYQZbvPZmfX1qg0-sbkwEQO6jpwabGr2hMDBmBp/n/ociodscdev/b/service-conda-packs/o/service_pack/index.json" TESTS = { "score_py": { @@ -195,7 +195,9 @@ def check_runtime_yml(file_path) -> Tuple[bool, str]: return False, TESTS["runtime_path_exist"]["error_msg"] else: TESTS["runtime_path_exist"]["success"] = False - return False, TESTS["runtime_path_exist"]["error_msg"] + TESTS["runtime_path_exist"][ + "error_msg" + ] = "WARNING: Unable to validate if INFERENCE_ENV_PATH exists. Please check if you have internet access." else: bucket_name = env_path.username namespace = env_path.hostname diff --git a/ads/opctl/backend/ads_ml_job.py b/ads/opctl/backend/ads_ml_job.py index 9dff4e8c1..bd1362044 100644 --- a/ads/opctl/backend/ads_ml_job.py +++ b/ads/opctl/backend/ads_ml_job.py @@ -217,10 +217,22 @@ def cancel(self): """ Cancel Job Run from OCID. """ - run_id = self.config["execution"]["run_id"] with AuthContext(auth=self.auth_type, profile=self.profile): - DataScienceJobRun.from_ocid(run_id).cancel() - print(f"Job run {run_id} has been cancelled.") + wait_for_completion = self.config["execution"].get("wait_for_completion") + if self.config["execution"].get("id"): + id = self.config["execution"]["id"] + Job.from_datascience_job(id).cancel( + wait_for_completion=wait_for_completion + ) + if wait_for_completion: + print(f"All job runs under {id} have been cancelled.") + elif self.config["execution"].get("run_id"): + run_id = self.config["execution"]["run_id"] + DataScienceJobRun.from_ocid(run_id).cancel( + wait_for_completion=wait_for_completion + ) + if wait_for_completion: + print(f"Job run {run_id} has been cancelled.") def watch(self): """ diff --git a/ads/opctl/cmds.py b/ads/opctl/cmds.py index 5f6bee9fa..fdd392987 100644 --- a/ads/opctl/cmds.py +++ b/ads/opctl/cmds.py @@ -374,7 +374,7 @@ def delete(**kwargs) -> None: ): kwargs["id"] = kwargs.pop("ocid") else: - raise ValueError(f"{kwargs['ocid']} is valid or supported.") + raise ValueError(f"{kwargs['ocid']} is invalid or not supported.") p = ConfigProcessor().step(ConfigMerger, **kwargs) return _BackendFactory(p.config).backend.delete() @@ -388,13 +388,24 @@ def cancel(**kwargs) -> None: ---------- kwargs: dict keyword argument, stores command line args + Returns ------- None """ - kwargs["run_id"] = kwargs.pop("ocid") - if not kwargs.get("backend"): - kwargs["backend"] = _get_backend_from_run_id(kwargs["run_id"]) + kwargs["backend"] = _get_backend_from_ocid(kwargs["ocid"]) + if ( + DataScienceResourceRun.JOB_RUN in kwargs["ocid"] + or DataScienceResourceRun.DATAFLOW_RUN in kwargs["ocid"] + or DataScienceResourceRun.PIPELINE_RUN in kwargs["ocid"] + ): + 
kwargs["run_id"] = kwargs.pop("ocid") + elif ( + DataScienceResource.JOB in kwargs["ocid"] + ): + kwargs["id"] = kwargs.pop("ocid") + else: + raise ValueError(f"{kwargs['ocid']} is invalid or not supported.") p = ConfigProcessor().step(ConfigMerger, **kwargs) return _BackendFactory(p.config).backend.cancel() diff --git a/ads/opctl/distributed/cli.py b/ads/opctl/distributed/cli.py index cee55287a..58507985a 100644 --- a/ads/opctl/distributed/cli.py +++ b/ads/opctl/distributed/cli.py @@ -30,7 +30,14 @@ def commands(): "-f", help="Distributed training framework type", type=click.Choice( - ["dask", "horovod-tensorflow", "horovod-pytorch", "pytorch", "tensorflow"] + [ + "dask", + "horovod-tensorflow", + "horovod-pytorch", + "pytorch", + "tensorflow", + "ray", + ] ), default=None, required=True, diff --git a/ads/pipeline/ads_pipeline_step.py b/ads/pipeline/ads_pipeline_step.py index 2098274e5..187ecec10 100644 --- a/ads/pipeline/ads_pipeline_step.py +++ b/ads/pipeline/ads_pipeline_step.py @@ -519,6 +519,19 @@ def to_dict(self) -> dict: dict_details["spec"][self.CONST_JOB_ID] = self.job_id if self.description: dict_details["spec"][self.CONST_DESCRIPTION] = self.description + if self.kind == "ML_JOB": + if self.environment_variable: + dict_details["spec"][self.CONST_ENVIRONMENT_VARIABLES] = ( + self.environment_variable + ) + if self.argument: + dict_details["spec"][self.CONST_COMMAND_LINE_ARGUMENTS] = ( + self.argument + ) + if self.maximum_runtime_in_minutes: + dict_details["spec"][self.CONST_MAXIMUM_RUNTIME_IN_MINUTES] = ( + self.maximum_runtime_in_minutes + ) dict_details["spec"].pop(self.CONST_DEPENDS_ON, None) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 04e1b9465..bbdcdb5e8 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,12 +2,33 @@ Release Notes ============= +2.8.11 +------ +Release date: October 18, 2023 + +* Added support to mount file systems in Data Science notebook sessions and jobs. +* Added support to cancel all job runs in the ADS ``api`` and ``opctl`` commands. +* Updated ``ads.set_auth()`` to use both ``config`` and ``signer`` when provided. +* Fixed a bug when initializing distributed training artifacts with "Ray" framework. + +2.8.10 +------ +Release date: September 27, 2023 + +* Improved the ``LargeArtifactUploader`` class to understand OCI paths to upload model artifacts to the model catalog by reference. +* Removed ``ADSDataset`` runtime dependency on ``geopandas``. +* Fixed a bug in the progress bar during model registration. +* Fixed a bug where session variable could be referenced before assignment. +* Fixed a bug with model artifact save. +* Fixed a bug with pipelines step. + 2.8.9 ----- Release date: September 5, 2023 * Upgraded the ``scikit-learn`` dependency to ``>=1.0``. * Upgraded the ``pandas`` dependency to ``>1.2.1,<2.1`` to allow you to use ADS with pandas 2.0. +* Implemented multi-part upload in the ``ArtifactUploader`` to upload model artifacts to the model catalog. * Fixed the "Attribute not found" error, when ``deploy()`` called twice in ``GenericModel``. * Fixed the fetch of the security token, when the relative path for the ``security_token_file`` is provided (used in session token-bases authentication). 
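The cancel changes above route a job OCID to `Job.cancel()` (cancelling every run under the job) and a job run OCID to `DataScienceJobRun.cancel()`. A minimal sketch of the equivalent Python API calls (the OCIDs are placeholders):

    import ads
    from ads.jobs import Job
    from ads.jobs.builders.infrastructure.dsc_job import DataScienceJobRun

    ads.set_auth("resource_principal")

    # Cancel a single run by its run OCID.
    DataScienceJobRun.from_ocid("ocid1.datasciencejobrun.oc1.<region>.<unique_id>").cancel(
        wait_for_completion=True
    )

    # Cancel all runs under a job by the job OCID (added in 2.8.11).
    Job.from_datascience_job("ocid1.datasciencejob.oc1.<region>.<unique_id>").cancel(
        wait_for_completion=True
    )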
diff --git a/docs/source/user_guide/jobs/infra_and_runtime.rst b/docs/source/user_guide/jobs/infra_and_runtime.rst index 58ab60052..4d9dbac1b 100644 --- a/docs/source/user_guide/jobs/infra_and_runtime.rst +++ b/docs/source/user_guide/jobs/infra_and_runtime.rst @@ -152,6 +152,61 @@ see also `ADS Logging <../logging/logging.html>`_. With logging configured, you can call :py:meth:`~ads.jobs.DataScienceJobRun.watch` method to stream the logs. + +Mounting File Systems +--------------------- + +Data Science Job supports mounting multiple types of file systems, +see `Data Science Job Mounting File Systems `_. A maximum number of 5 file systems are +allowed to be mounted for each Data Science Job. You can specify a list of file systems to be mounted +by calling :py:meth:`~ads.jobs.DataScienceJob.with_storage_mount()`. For each file system to be mounted, +you need to pass a dictionary with ``src`` and ``dest`` as keys. For example, you can pass +``@`` as the value for ``src`` to mount OCI File Storage and you can also +pass ``oci://@/`` to mount OCI Object Storage. The value of +``dest`` indicates the path and directory to which you want to mount the file system and must be in the +format as ``/``. The ```` is required +while the ```` is optional. The ```` must start with character ``/`` if provided. +If not, the file systems will be mounted to ``/mnt/`` by default. + + +.. tabs:: + + .. code-tab:: python + :caption: Python + + from ads.jobs import DataScienceJob + + infrastructure = ( + DataScienceJob() + .with_log_group_id("") + .with_log_id("") + .with_storage_mount( + { + "src" : "@", + "dest" : "/" + }, # mount oci file storage to path "/" + { + "src" : "oci://@/", + "dest" : "" + } # mount oci object storage to path "/mnt/" + ) + ) + + .. code-tab:: yaml + :caption: YAML + + kind: infrastructure + type: dataScienceJob + spec: + logGroupId: + logId: + storageMount: + - src: @ + dest: / + - src: oci://@/ + dest: + + Runtime ======= diff --git a/docs/source/user_guide/jobs/tabs/infra_config.rst b/docs/source/user_guide/jobs/tabs/infra_config.rst index 683ef7a8b..0151d1c7f 100644 --- a/docs/source/user_guide/jobs/tabs/infra_config.rst +++ b/docs/source/user_guide/jobs/tabs/infra_config.rst @@ -22,6 +22,17 @@ .with_shape_config_details(memory_in_gbs=16, ocpus=1) # Minimum/Default block storage size is 50 (GB). .with_block_storage_size(50) + # A maximum number of 5 file systems are allowed to be mounted for a job. + .with_storage_mount( + { + "src" : "@", + "dest" : "/" + }, # mount oci file storage to path "/" + { + "src" : "oci://@/", + "dest" : "" + } # mount oci object storage to path "/mnt/" + ) ) .. code-tab:: yaml @@ -40,3 +51,8 @@ ocpus: 1 shapeName: VM.Standard.E3.Flex subnetId: + storageMount: + - src: @ + dest: / + - src: oci://@/ + dest: diff --git a/docs/source/user_guide/jobs/tabs/llama2_full.rst b/docs/source/user_guide/jobs/tabs/llama2_full.rst index 270f9386e..15a0222cf 100644 --- a/docs/source/user_guide/jobs/tabs/llama2_full.rst +++ b/docs/source/user_guide/jobs/tabs/llama2_full.rst @@ -14,42 +14,32 @@ .with_compartment_id("") .with_project_id("") .with_subnet_id("") - .with_shape_name("VM.GPU.A10.1") + .with_shape_name("VM.GPU.A10.2") .with_block_storage_size(256) ) .with_runtime( PyTorchDistributedRuntime() # Specify the service conda environment by slug name. 
- .with_service_conda("pytorch20_p39_gpu_v1") + .with_service_conda("pytorch20_p39_gpu_v2") .with_git( url="https://github.com/facebookresearch/llama-recipes.git", - commit="03faba661f079ee1ecaeb66deaa6bdec920a7bab" + commit="1aecd00924738239f8d86f342b36bacad180d2b3" ) .with_dependency( pip_pkg=" ".join([ - "'accelerate>=0.21.0'", - "appdirs", - "loralib", - "bitsandbytes==0.39.1", - "black", - "'black[jupyter]'", - "datasets", - "fire", - "'git+https://github.com/huggingface/peft.git'", - "'transformers>=4.31.0'", - "sentencepiece", - "py7zr", - "scipy", - "optimum" + "--extra-index-url https://download.pytorch.org/whl/cu118 torch==2.1.0", + "git+https://github.com/huggingface/peft.git@15a013af5ff5660b9377af24d3eee358213d72d4" + "appdirs==1.4.4", + "llama-recipes==0.0.1", + "py7zr==0.20.6", ]) ) .with_output("/home/datascience/outputs", "oci://bucket@namespace/outputs/$JOB_RUN_OCID") .with_command(" ".join([ - "torchrun llama_finetuning.py", + "torchrun examples/finetuning.py", "--enable_fsdp", "--pure_bf16", "--batch_size_training 1", - "--micro_batch_size 1", "--model_name $MODEL_NAME", "--dist_checkpoint_root_folder /home/datascience/outputs", "--dist_checkpoint_folder fine-tuned" @@ -87,36 +77,26 @@ spec: git: url: https://github.com/facebookresearch/llama-recipes.git - commit: 03faba661f079ee1ecaeb66deaa6bdec920a7bab + commit: 1aecd00924738239f8d86f342b36bacad180d2b3 command: >- torchrun llama_finetuning.py --enable_fsdp --pure_bf16 --batch_size_training 1 - --micro_batch_size 1 --model_name $MODEL_NAME --dist_checkpoint_root_folder /home/datascience/outputs --dist_checkpoint_folder fine-tuned replicas: 2 conda: type: service - slug: pytorch20_p39_gpu_v1 + slug: pytorch20_p39_gpu_v2 dependencies: pipPackages: >- - 'accelerate>=0.21.0' - appdirs - loralib - bitsandbytes==0.39.1 - black - 'black[jupyter]' - datasets - fire - 'git+https://github.com/huggingface/peft.git' - 'transformers>=4.31.0' - sentencepiece - py7zr - scipy - optimum + --extra-index-url https://download.pytorch.org/whl/cu118 torch==2.1.0 + git+https://github.com/huggingface/peft.git@15a013af5ff5660b9377af24d3eee358213d72d4 + llama-recipes==0.0.1 + appdirs==1.4.4 + py7zr==0.20.6 outputDir: /home/datascience/outputs outputUri: oci://bucket@namespace/outputs/$JOB_RUN_OCID env: diff --git a/docs/source/user_guide/jobs/tabs/quick_start_job.rst b/docs/source/user_guide/jobs/tabs/quick_start_job.rst index 82839c925..f515fff13 100644 --- a/docs/source/user_guide/jobs/tabs/quick_start_job.rst +++ b/docs/source/user_guide/jobs/tabs/quick_start_job.rst @@ -24,6 +24,17 @@ .with_shape_config_details(memory_in_gbs=16, ocpus=1) # Minimum/Default block storage size is 50 (GB). .with_block_storage_size(50) + # A maximum number of 5 file systems are allowed to be mounted for a job. 
+ .with_storage_mount( + { + "src" : "@", + "dest" : "/" + }, # mount oci file storage to path "/" + { + "src" : "oci://@/", + "dest" : "" + } # mount oci object storage to path "/mnt/" + ) ) .with_runtime( PythonRuntime() @@ -59,6 +70,11 @@ ocpus: 1 shapeName: VM.Standard.E3.Flex subnetId: + storageMount: + - src: @ + dest: / + - src: oci://@/ + dest: runtime: kind: runtime type: python diff --git a/docs/source/user_guide/model_registration/_template/deploy.rst b/docs/source/user_guide/model_registration/_template/deploy.rst index d2ec91ed7..f5c5d4eee 100644 --- a/docs/source/user_guide/model_registration/_template/deploy.rst +++ b/docs/source/user_guide/model_registration/_template/deploy.rst @@ -7,12 +7,11 @@ You can use the ``.deploy()`` method to deploy a model. You must first save the The ``.deploy()`` method returns a ``ModelDeployment`` object. Specify deployment attributes such as display name, instance type, number of instances, maximum router bandwidth, and logging groups. The API takes the following parameters: -See `API documentation <../../ads.model.html#id1>`__ for more details about the parameters. - +See :py:meth:`~ads.model.GenericModel.deploy` for more details about the parameters. .. admonition:: Tips :class: note * Providing ``deployment_access_log_id`` and ``deployment_predict_log_id`` helps in debugging your model inference setup. - * Default Load Balancer configuration has bandwidth of 10 Mbps. `Refer service document to help you choose the right setup. `_ - * Check for supported instance shapes `here `_ . + * Default Load Balancer configuration has bandwidth of 10 Mbps. `Refer service document to help you choose the right setup. `_ + * Check for supported instance shapes `here `_ . diff --git a/docs/source/user_guide/model_registration/_template/prepare_save_deploy.rst b/docs/source/user_guide/model_registration/_template/prepare_save_deploy.rst index 5e44c48d5..7b818d1ec 100644 --- a/docs/source/user_guide/model_registration/_template/prepare_save_deploy.rst +++ b/docs/source/user_guide/model_registration/_template/prepare_save_deploy.rst @@ -27,7 +27,7 @@ The ``.prepare_save_deploy()`` method is a shortcut for the functions ``.prepare Namespace of region. Use this parameter to identify the service pack region when you pass a slug to ``inference_conda_env`` and ``training_conda_env``. * ``use_case_type``: str - The use case type of the model. Assign a value using the``UseCaseType`` class or provide a string in ``UseCaseType``. For + The use case type of the model. Assign a value using the ``UseCaseType`` class or provide a string in ``UseCaseType``. For example, use_case_type=UseCaseType.BINARY_CLASSIFICATION or use_case_type="binary_classification". Check the ``UseCaseType`` class to see supported types. * ``X_sample``: Union[list, tuple, pd.Series, np.ndarray, pd.DataFrame]. Defaults to None. @@ -60,7 +60,7 @@ The ``.prepare_save_deploy()`` method is a shortcut for the functions ``.prepare The name of the model. * ``description``: (str, optional). Defaults to None. The description of the model. -* ``deployment_instance_shape``: (str, optional). Default to ``VM.Standard2.1``. +* ``deployment_instance_shape``: (str, optional). The shape of the instance used for deployment. * ``deployment_instance_count``: (int, optional). Defaults to 1. The number of instances used for deployment. 
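Because the templates above no longer document a default instance shape, the shape should be passed explicitly when deploying. A minimal sketch of a `.deploy()` call using the parameters these templates describe (`model` is assumed to be an already prepared and saved model instance; the shape and OCIDs are placeholders):

    # `model` is assumed to be a prepared and saved ads.model.GenericModel (or framework subclass).
    deployment = model.deploy(
        deployment_instance_shape="<shape_name>",   # no documented default; set explicitly
        deployment_instance_count=1,
        deployment_bandwidth_mbps=10,               # default load balancer bandwidth
        deployment_log_group_id="ocid1.loggroup.oc1.<region>.<unique_id>",
        deployment_access_log_id="ocid1.log.oc1.<region>.<unique_id>",
        deployment_predict_log_id="ocid1.log.oc1.<region>.<unique_id>",
        description="Demo deployment for the registered model.",
    )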
diff --git a/docs/source/user_guide/model_registration/introduction.rst b/docs/source/user_guide/model_registration/introduction.rst index 356724f40..2a8465c8d 100644 --- a/docs/source/user_guide/model_registration/introduction.rst +++ b/docs/source/user_guide/model_registration/introduction.rst @@ -40,6 +40,16 @@ If you make changes to the ``score.py`` file, call the ``.verify()`` method to c The ``.save()`` method is then used to store the model in the model catalog. A call to the ``.deploy()`` method creates a load balancer and the instances needed to have an HTTPS access point to perform inference on the model. Using the ``.predict()`` method, you can send data to the model deployment endpoint and it will return the predictions. + +LLMs +---- + +.. toctree:: + :maxdepth: 1 + + large_language_model + + Register -------- @@ -84,7 +94,3 @@ Frameworks :maxdepth: 1 framework_specific_instruction - - - - diff --git a/docs/source/user_guide/model_registration/large_language_model.rst b/docs/source/user_guide/model_registration/large_language_model.rst new file mode 100644 index 000000000..592d753e5 --- /dev/null +++ b/docs/source/user_guide/model_registration/large_language_model.rst @@ -0,0 +1,170 @@ +==================== +Large Language Model +==================== + +Oracle ADS (Accelerated Data Science) opens the gateway to harnessing the full potential of the Large Language models +within Oracle Cloud Infrastructure (OCI). `Meta `_'s +latest offering, `Llama 2 `_, introduces a collection of pre-trained and +fine-tuned generative text models, ranging from 7 to 70 billion parameters. These models represent a significant leap +forward, being trained on 40% more tokens and boasting an extended context length of 4,000 tokens. + +Throughout this documentation, we showcase two essential inference frameworks: + +- `Text Generation Inference (TGI) `_. A purpose-built solution for deploying and serving LLMs from Hugging Face, which we extend to meet the interface requirements of model deployment resources. + +- `vLLM `_. An open-source, high-throughput, and memory-efficient inference and serving engine for LLMs from UC Berkeley. + + +While our primary focus is on the Llama 2 family, the methodology presented here can be applied to other LLMs as well. + + +**Sample Code** + +For your convenience, we provide sample code and a complete walkthrough, available in the `Oracle +GitHub samples repository `_. + +**Prerequisites** + +Using the Llama 2 model requires user agreement acceptance on `Meta's website `_. Downloading the model +from `Hugging Face `_ necessitates an account and agreement to the service terms. Ensure that the model's license permits +usage for your intended purposes. + +**Recommended Hardware** + +We recommend specific OCI shapes based on Nvidia A10 GPUs for deploying models. These shapes +cater to both the 7-billion and 13-billion parameter models, with the latter utilizing quantization techniques to +optimize GPU memory usage. OCI offers `a variety of GPU options `_ to suit your needs. + +**Deployment Approaches** + +You can use the following methods to deploy an LLM with OCI Data Science: + +- Online Method. This approach involves downloading the LLM directly from the hosting repository into the `Data Science Model Deployment `_. It minimizes data copying, making it suitable for large models. However, it lacks governance and may not be ideal for production environments or fine-tuning scenarios. + +- Offline Method. 
In this method, you download the LLM model from the host repository and save it in the `Data Science Model Catalog `_. Deployment then occurs directly from the catalog, allowing for better control and governance of the model. + +**Inference Container** + +We explore two inference options: Hugging Face's Text Generation Inference (TGI) and vLLM from UC Berkeley. These +containers are crucial for effective model deployment and are optimized to align with OCI Data Science model deployment requirements. +You can find both the TGI and vLLM Docker files in `our samples repository `_. + +**Creating the Model Deployment** + +The final step involves deploying the model and the inference container by creating a model deployment. Once deployed, +the model is accessible via a predict URL, allowing HTTP-based model invocation. + +**Testing the Model** + +To validate your deployed model, a Gradio Chat app can be configured to use the predict URL. This app provides +parameters such as ``max_tokens``, ``temperature``, and ``top_p`` for fine-tuning model responses. Check our `blog `_ to +learn more about this. + + +Train Model +----------- + +Check `Training Large Language Model <../model_training/training_llm.rst>`_ to see how to train your large language model +by Oracle Cloud Infrastructure (OCI) `Data Science Jobs (Jobs) `_. + + +Register Model +-------------- + +Once you've trained your LLM, we guide you through the process of registering it within OCI, enabling seamless access and management. + +Zip all items of the folder using zip/tar utility, preferrably using below command to avoid creating another hierarchy of folder structure inside zipped file. + +.. code-block:: bash + + zip my_large_model.zip * -0 + +Upload the zipped artifact created in an object storage bucket in your tenancy. Tools like `rclone `_, +can help speed this upload. Using rclone with OCI can be referred from `here `_. + +Example of using ``oci-cli``: + +.. code-block:: bash + + oci os object put -ns -bn --name /my_large_model.zip --file my_large_model.zip + +Next step is to create a model catalog item. Use :py:class:`~ads.model.DataScienceModel` to register the large model to Model Catalog. + +.. versionadd:: 2.8.10 + +.. code-block:: python + + import ads + from ads.model import DataScienceModel + + ads.set_auth("resource_principal") + + MODEL_DISPLAY_NAME = "My Large Model" + ARTIFACT_PATH = "oci://@//my_large_model.zip" + + model = (DataScienceModel() + .with_display_name(MODEL_DISPLAY_NAME) + .with_artifact(ARTIFACT_PATH) + .create( + remove_existing_artifact=False + )) + model_id = model.id + +Deploy Model +------------ + +The final step involves deploying your registered LLM for real-world applications. We walk you through deploying it in a +`custom containers (Bring Your Own Container) `_ within the OCI Data +Science Service, leveraging advanced technologies for optimal performance. + +You can define the model deployment with `ADS Python APIs <../model_registration/model_deploy_byoc.rst>`_ or YAML. In the +examples below, you will need to change with the OCIDs of the resources required for the deployment, like ``project ID``, +``compartment ID`` etc. All of the configurations with ```` should be replaces with your corresponding ID from +your tenancy, the resources we created in the previous steps. + + +Online Deployment +^^^^^^^^^^^^^^^^^ + +**Prerequisites** + +Check on `GitHub Sample repository `_ to see how to complete the Prerequisites before actual deployment. 
+ +- Zips your Hugging Face user access token and registers it into Model Catalog by following the instruction on ``Register Model`` in this page. +- Creates logging in the `OCI Logging Service `_ for the model deployment (if you have to already created, you can skip this step). +- Creates a subnet in `Virtual Cloud Network `_ for the model deployment. +- Executes container build and push process to `Oracle Cloud Container Registry `_. +- You can now use the Bring Your Own Container Deployment in OCI Data Science to the deploy the Llama2 model. + +.. include:: ../model_registration/tabs/env-var-online.rst + +.. include:: ../model_registration/tabs/ads-md-deploy-online.rst + +Offline Deployment +^^^^^^^^^^^^^^^^^^ + +Check on `GitHub Sample repository `_ to see how to complete the Prerequisites before actual deployment. + +- Registers the zipped artifact into Model Catalog by following the instruction on ``Register Model`` in this page. +- Creates logging in the `OCI Logging Service `_ for the model deployment (if you have to already created, you can skip this step). +- Executes container build and push process to `Oracle Cloud Container Registry `_. +- You can now use the Bring Your Own Container Deployment in OCI Data Science to the deploy the Llama2 model. + +.. include:: ../model_registration/tabs/env-var-offline.rst + +.. include:: ../model_registration/tabs/ads-md-deploy-offline.rst + +You can deploy the model through API call or ADS CLI. + +Make sure that you've also created and setup your `API Auth Token `_ to execute the commands below. + +.. include:: ../model_registration/tabs/run_md.rst + + +Inference Model +--------------- + +Once the model is deployed and shown as Active you can execute inference against it. You can run inference against +the deployed model with oci-cli from your OCI Data Science Notebook or you local environment. + +.. include:: ../model_registration/tabs/run_predict.rst diff --git a/docs/source/user_guide/model_registration/model_load.rst b/docs/source/user_guide/model_registration/model_load.rst index 44fb3876e..16c958935 100644 --- a/docs/source/user_guide/model_registration/model_load.rst +++ b/docs/source/user_guide/model_registration/model_load.rst @@ -1,3 +1,4 @@ +===================== Load Registered Model ===================== @@ -43,7 +44,7 @@ Alternatively the ``.from_id()`` method can be used to load a model. In future r Load Deployed Model -=================== +------------------- Load and recreate :doc:`framework specific wrapper ` objects using the ``ocid`` value of your OCI Model Deployment instance. @@ -82,7 +83,7 @@ Alternatively the ``.from_id()`` method can be used to load a model from the Mod ) Load Model From Object Storage -============================== +------------------------------ Load and recreate :doc:`framework specific wrapper ` objects from the existing model artifact archive. @@ -107,7 +108,7 @@ A model loaded from an artifact archive can be registered and deployed. Large Model Artifacts -===================== +--------------------- .. versionadded:: 2.6.4 diff --git a/docs/source/user_guide/model_registration/tabs/ads-md-deploy-offline.rst b/docs/source/user_guide/model_registration/tabs/ads-md-deploy-offline.rst new file mode 100644 index 000000000..ccc392a02 --- /dev/null +++ b/docs/source/user_guide/model_registration/tabs/ads-md-deploy-offline.rst @@ -0,0 +1,130 @@ +Creates Model Deployment: + +.. tabs:: + + .. 
code-tab:: Python3 + :caption: Python + + from ads.model.deployment import ModelDeployment, ModelDeploymentInfrastructure, ModelDeploymentContainerRuntime + + # configure model deployment infrastructure + infrastructure = ( + ModelDeploymentInfrastructure() + .with_project_id("ocid1.datascienceproject.oc1.") + .with_compartment_id("ocid1.compartment.oc1..") + .with_shape_name("VM.GPU3.2") + .with_bandwidth_mbps(10) + .with_web_concurrency(10) + .with_access_log( + log_group_id="ocid1.loggroup.oc1.", + log_id="ocid1.log.oc1." + ) + .with_predict_log( + log_group_id="ocid1.loggroup.oc1.", + log_id="ocid1.log.oc1." + ) + ) + + # configure model deployment runtime + container_runtime = ( + ModelDeploymentContainerRuntime() + .with_image("iad.ocir.io//:") + .with_server_port(5001) + .with_health_check_port(5001) + .with_env(env_var) + .with_deployment_mode("HTTPS_ONLY") + .with_model_uri("ocid1.datasciencemodel.oc1.") + .with_region("us-ashburn-1") + .with_overwrite_existing_artifact(True) + .with_remove_existing_artifact(True) + .with_timeout(100) + ) + + # configure model deployment + deployment = ( + ModelDeployment() + .with_display_name("Model Deployment Demo using ADS") + .with_description("The model deployment description.") + .with_freeform_tags({"key1":"value1"}) + .with_infrastructure(infrastructure) + .with_runtime(container_runtime) + ) + + .. code-tab:: yaml + :caption: TGI-YAML + + kind: deployment + spec: + displayName: LLama2-7b model deployment - tgi + infrastructure: + kind: infrastructure + type: datascienceModelDeployment + spec: + compartmentId: ocid1.compartment.oc1.. + projectId: ocid1.datascienceproject.oc1. + accessLog: + logGroupId: ocid1.loggroup.oc1. + logId: ocid1.log.oc1. + predictLog: + logGroupId: ocid1.loggroup.oc1. + logId: ocid1.log.oc1. + shapeName: VM.GPU.A10.2 + replica: 1 + bandWidthMbps: 10 + webConcurrency: 10 + subnetId: ocid1.subnet.oc1. + runtime: + kind: runtime + type: container + spec: + modelUri: ocid1.datasciencemodel.oc1. + image: + serverPort: 5001 + healthCheckPort: 5001 + env: + MODEL_DEPLOY_PREDICT_ENDPOINT: "/generate" + PARAMS: "--model /opt/ds/model/deployed_model --max-batch-prefill-tokens 1024" + region: us-ashburn-1 + overwriteExistingArtifact: True + removeExistingArtifact: True + timeout: 100 + deploymentMode: HTTPS_ONLY + + .. code-tab:: yaml + :caption: vllm-YAML + + kind: deployment + spec: + displayName: LLama2-7b model deployment - vllm + infrastructure: + kind: infrastructure + type: datascienceModelDeployment + spec: + compartmentId: ocid1.compartment.oc1.. + projectId: ocid1.datascienceproject.oc1. + accessLog: + logGroupId: ocid1.loggroup.oc1. + logId: ocid1.log.oc1. + predictLog: + logGroupId: ocid1.loggroup.oc1. + logId: ocid1.log.oc1. + shapeName: VM.GPU.A10.2 + replica: 1 + bandWidthMbps: 10 + webConcurrency: 10 + runtime: + kind: runtime + type: container + spec: + modelUri: ocid1.datasciencemodel.oc1. + image: + serverPort: 5001 + healthCheckPort: 5001 + env: + PARAMS: "--model /opt/ds/model/deployed_model" + TENSOR_PARALLELISM: 2 + region: us-ashburn-1 + overwriteExistingArtifact: True + removeExistingArtifact: True + timeout: 100 + deploymentMode: HTTPS_ONLY diff --git a/docs/source/user_guide/model_registration/tabs/ads-md-deploy-online.rst b/docs/source/user_guide/model_registration/tabs/ads-md-deploy-online.rst new file mode 100644 index 000000000..72f4ff907 --- /dev/null +++ b/docs/source/user_guide/model_registration/tabs/ads-md-deploy-online.rst @@ -0,0 +1,133 @@ +.. tabs:: + + .. 
code-tab:: Python3 + :caption: Python + + from ads.model.deployment import ModelDeployment, ModelDeploymentInfrastructure, ModelDeploymentContainerRuntime + + # configure model deployment infrastructure + infrastructure = ( + ModelDeploymentInfrastructure() + .with_project_id("ocid1.datascienceproject.oc1.") + .with_compartment_id("ocid1.compartment.oc1..") + .with_shape_name("VM.GPU.A10.2") + .with_bandwidth_mbps(10) + .with_web_concurrency(10) + .with_access_log( + log_group_id="ocid1.loggroup.oc1.", + log_id="ocid1.log.oc1." + ) + .with_predict_log( + log_group_id="ocid1.loggroup.oc1.", + log_id="ocid1.log.oc1." + ) + .with_subnet_id("ocid1.subnet.oc1.") + ) + + # configure model deployment runtime + container_runtime = ( + ModelDeploymentContainerRuntime() + .with_image("iad.ocir.io//:") + .with_server_port(5001) + .with_health_check_port(5001) + .with_env(env_var) + .with_deployment_mode("HTTPS_ONLY") + .with_model_uri("ocid1.datasciencemodel.oc1.") + .with_region("us-ashburn-1") + .with_overwrite_existing_artifact(True) + .with_remove_existing_artifact(True) + .with_timeout(100) + ) + + # configure model deployment + deployment = ( + ModelDeployment() + .with_display_name("Model Deployment Demo using ADS") + .with_description("The model deployment description") + .with_freeform_tags({"key1":"value1"}) + .with_infrastructure(infrastructure) + .with_runtime(container_runtime) + ) + + .. code-tab:: yaml + :caption: TGI-YAML + + kind: deployment + spec: + displayName: LLama2-7b model deployment - tgi + infrastructure: + kind: infrastructure + type: datascienceModelDeployment + spec: + compartmentId: ocid1.compartment.oc1.. + projectId: ocid1.datascienceproject.oc1. + accessLog: + logGroupId: ocid1.loggroup.oc1. + logId: ocid1.log.oc1. + predictLog: + logGroupId: ocid1.loggroup.oc1. + logId: ocid1.log.oc1. + shapeName: VM.GPU.A10.2 + replica: 1 + bandWidthMbps: 10 + webConcurrency: 10 + subnetId: ocid1.subnet.oc1. + runtime: + kind: runtime + type: container + spec: + modelUri: ocid1.datasciencemodel.oc1. + image: + serverPort: 5001 + healthCheckPort: 5001 + env: + TOKEN: "/opt/ds/model/deployed_model/token" + PARAMS: "--model-id meta-llama/Llama-2-7b-chat-hf --max-batch-prefill-tokens 1024" + region: us-ashburn-1 + overwriteExistingArtifact: True + removeExistingArtifact: True + timeout: 100 + deploymentMode: HTTPS_ONLY + + .. code-tab:: yaml + :caption: vllm-YAML + + kind: deployment + spec: + displayName: LLama2-7b model deployment - vllm + infrastructure: + kind: infrastructure + type: datascienceModelDeployment + spec: + compartmentId: ocid1.compartment.oc1.. + projectId: ocid1.datascienceproject.oc1. + accessLog: + logGroupId: ocid1.loggroup.oc1. + logId: ocid1.log.oc1. + predictLog: + logGroupId: ocid1.loggroup.oc1. + logId: ocid1.log.oc1. + shapeName: VM.GPU.A10.2 + replica: 1 + bandWidthMbps: 10 + webConcurrency: 10 + subnetId: ocid1.subnet.oc1. + runtime: + kind: runtime + type: container + spec: + modelUri: ocid1.datasciencemodel.oc1. 
+ image: + serverPort: 5001 + healthCheckPort: 5001 + env: + PARAMS: "--model meta-llama/Llama-2-7b-chat-hf" + HUGGINGFACE_HUB_CACHE: "/home/datascience/.cache" + TOKEN_FILE: /opt/ds/model/deployed_model/token + STORAGE_SIZE_IN_GB: "950" + WEB_CONCURRENCY: 1 + region: us-ashburn-1 + overwriteExistingArtifact: True + removeExistingArtifact: True + timeout: 100 + deploymentMode: HTTPS_ONLY diff --git a/docs/source/user_guide/model_registration/tabs/env-var-offline.rst b/docs/source/user_guide/model_registration/tabs/env-var-offline.rst new file mode 100644 index 000000000..2b81283a2 --- /dev/null +++ b/docs/source/user_guide/model_registration/tabs/env-var-offline.rst @@ -0,0 +1,39 @@ +Set custom environment variables: + +.. tabs:: + + .. code-tab:: Python3 + :caption: 7b llama2 - vllm + + env_var = { + "PARAMS": "--model /opt/ds/model/deployed_model", + } + + .. code-tab:: Python3 + :caption: 13b llama2 - vllm + + env_var = { + "PARAMS": "--model /opt/ds/model/deployed_model", + "TENSOR_PARALLELISM": 2, + } + + .. code-tab:: Python3 + :caption: 7b llama2 - TGI + + env_var = { + "MODEL_DEPLOY_PREDICT_ENDPOINT": "/generate", + "PARAMS": "--model /opt/ds/model/deployed_model --max-batch-prefill-tokens 1024" + } + + .. code-tab:: Python3 + :caption: 13b llama2 - TGI + + env_var = { + "MODEL_DEPLOY_PREDICT_ENDPOINT": "/generate", + "PARAMS" : "--model /opt/ds/model/deployed_model --max-batch-prefill-tokens 1024 --quantize bitsandbytes --max-batch-total-tokens 4096" + } + + +You can override more vllm/TGI bootstrapping configuration using ``PARAMS`` environment configuration. +For details of configurations, please refer the official `vLLM doc `_ and +`TGI doc `_. diff --git a/docs/source/user_guide/model_registration/tabs/env-var-online.rst b/docs/source/user_guide/model_registration/tabs/env-var-online.rst new file mode 100644 index 000000000..7e9599035 --- /dev/null +++ b/docs/source/user_guide/model_registration/tabs/env-var-online.rst @@ -0,0 +1,32 @@ +Set custom environment variables: + +.. tabs:: + + .. code-tab:: Python3 + :caption: 7b llama2 - vllm + + env_var = { + "TOKEN_FILE": "/opt/ds/model/deployed_model/token", + "PARAMS": "--model meta-llama/Llama-2-7b-chat-hf", + } + + .. code-tab:: Python3 + :caption: 7b llama2 - TGI + + env_var = { + "TOKEN_FILE": "/opt/ds/model/deployed_model/token", + "PARAMS": "--model-id meta-llama/Llama-2-7b-chat-hf --max-batch-prefill-tokens 1024", + } + + .. code-tab:: Python3 + :caption: 13b llama2 - TGI + + env_var = { + "TOKEN_FILE": "/opt/ds/model/deployed_model/token", + "PARAMS" : "--model meta-llama/Llama-2-13b-chat-hf --max-batch-prefill-tokens 1024 --quantize bitsandbytes --max-batch-total-tokens 4096" + } + + +You can override more vllm/TGI bootstrapping configuration using ``PARAMS`` environment configuration. +For details of configurations, please refer the official `vLLM doc `_ and +`TGI doc `_. diff --git a/docs/source/user_guide/model_registration/tabs/run_md.rst b/docs/source/user_guide/model_registration/tabs/run_md.rst new file mode 100644 index 000000000..3f45415ff --- /dev/null +++ b/docs/source/user_guide/model_registration/tabs/run_md.rst @@ -0,0 +1,15 @@ +To create a model deployment: + +.. tabs:: + + .. code-tab:: python + :caption: Python + + # Deploy model on container runtime + deployment.deploy() + + .. 
code-tab:: bash + :caption: YAML + + # Use the following command to deploy model + ads opctl run -f ads-md-deploy-.yaml diff --git a/docs/source/user_guide/model_registration/tabs/run_predict.rst b/docs/source/user_guide/model_registration/tabs/run_predict.rst new file mode 100644 index 000000000..a18c86dd6 --- /dev/null +++ b/docs/source/user_guide/model_registration/tabs/run_predict.rst @@ -0,0 +1,52 @@ +Run inference against the deployed model: + +.. tabs:: + + .. code-tab:: python + :caption: Python + + # For TGI + data = { + "inputs": "Write a python program to randomly select item from a predefined list?", + "parameters": { + "max_new_tokens": 200 + } + } + + # For vLLM + data = { + "prompt": "are you smart?", + "use_beam_search": true, + "n": 4, + "temperature": 0 + } + + deployment.predict(data=data) + + .. code-tab:: bash + :caption: TGI Inference by OCI CLI + + oci raw-request \ + --http-method POST \ + --target-uri "" \ + --request-body '{ + "inputs": "Write a python program to randomly select item from a predefined list?", + "parameters": { + "max_new_tokens": 200 + } + }' \ + --auth resource_principal + + .. code-tab:: bash + :caption: vLLM Inference by OCI CLI + + oci raw-request \ + --http-method POST \ + --target-uri "" \ + --request-body '{ + "prompt": "are you smart?", + "use_beam_search": true, + "n": 4, + "temperature": 0 + }' \ + --auth resource_principal diff --git a/docs/source/user_guide/model_serialization/_template/prepare_save_deploy.rst b/docs/source/user_guide/model_serialization/_template/prepare_save_deploy.rst index a8cf22e8e..6eb2cc8a9 100644 --- a/docs/source/user_guide/model_serialization/_template/prepare_save_deploy.rst +++ b/docs/source/user_guide/model_serialization/_template/prepare_save_deploy.rst @@ -27,7 +27,7 @@ The ``.prepare_save_deploy()`` method is a shortcut for the functions ``.prepare Namespace of region. Use this parameter to identify the service pack region when you pass a slug to ``inference_conda_env`` and ``training_conda_env``. * ``use_case_type``: str - The use case type of the model. Assign a value using the``UseCaseType`` class or provide a string in ``UseCaseType``. For + The use case type of the model. Assign a value using the ``UseCaseType`` class or provide a string in ``UseCaseType``. For example, use_case_type=UseCaseType.BINARY_CLASSIFICATION or use_case_type="binary_classification". Check the ``UseCaseType`` class to see supported types. * ``X_sample``: Union[list, tuple, pd.Series, np.ndarray, pd.DataFrame]. Defaults to None. @@ -60,7 +60,7 @@ The ``.prepare_save_deploy()`` method is a shortcut for the functions ``.prepare The name of the model. * ``description``: (str, optional). Defaults to None. The description of the model. -* ``deployment_instance_shape``: (str, optional). Default to ``VM.Standard2.1``. +* ``deployment_instance_shape``: (str, optional). The shape of the instance used for deployment. * ``deployment_instance_count``: (int, optional). Defaults to 1. The number of instances used for deployment. diff --git a/docs/source/user_guide/model_serialization/boilerplate/deploy.rst b/docs/source/user_guide/model_serialization/boilerplate/deploy.rst index e5366b9e7..c4c3ed51a 100644 --- a/docs/source/user_guide/model_serialization/boilerplate/deploy.rst +++ b/docs/source/user_guide/model_serialization/boilerplate/deploy.rst @@ -1,11 +1,11 @@ -You can use the ``.deploy()`` method to deploy a model. You must first save the model to the model catalog, and then deploy it. 
+You can use the ``.deploy()`` method to deploy a model. You must first save the model to the model catalog, and then deploy it. The ``.deploy()`` method returns a ``ModelDeployment`` object. Specify deployment attributes such as display name, instance type, number of instances, maximum router bandwidth, and logging groups. The API takes the following parameters: - ``deployment_access_log_id: (str, optional)``: Defaults to ``None``. The access log OCID for the access logs, see `logging `_. - ``deployment_bandwidth_mbps: (int, optional)``: Defaults to 10. The bandwidth limit on the load balancer in Mbps. - ``deployment_instance_count: (int, optional)``: Defaults to 1. The number of instances used for deployment. -- ``deployment_instance_shape: (str, optional)``: Default to VM.Standard2.1. The shape of the instance used for deployment. +- ``deployment_instance_shape: (str, optional)``: The shape of the instance used for deployment. - ``deployment_log_group_id: (str, optional)``: Defaults to ``None``. The OCI logging group OCID. The access log and predict log share the same log group. - ``deployment_predict_log_id: (str, optional)``: Defaults to ``None``. The predict log OCID for the predict logs, see `logging `_. - ``description: (str, optional)``: Defaults to ``None``. The description of the model. @@ -16,4 +16,3 @@ The ``.deploy()`` method returns a ``ModelDeployment`` object. Specify deployme - ``max_wait_time : (int, optional)``: Defaults to 1200 seconds. The maximum amount of time to wait in seconds. A negative value implies an infinite wait time. - ``poll_interval : (int, optional)``: Defaults to 60 seconds. Poll interval in seconds. - ``project_id: (str, optional)``: Project OCID. If not specified, the value is taken from the environment variables. - diff --git a/docs/source/user_guide/model_serialization/boilerplate/initialize.rst b/docs/source/user_guide/model_serialization/boilerplate/initialize.rst index c3a936094..ea3235492 100644 --- a/docs/source/user_guide/model_serialization/boilerplate/initialize.rst +++ b/docs/source/user_guide/model_serialization/boilerplate/initialize.rst @@ -19,7 +19,7 @@ The ``properties`` is an instance of the ``ModelProperties`` class and has the f By default, ``properties`` is populated from the appropriate environment variables if it's not specified. For example, in a notebook session, the environment variables for project id and compartment id are preset and stored in ``PROJECT_OCID`` and -``NB_SESSION_COMPARTMENT_OCID`` by default. So ``properties`` populates these variables +``NB_SESSION_COMPARTMENT_OCID`` by default. So ``properties`` populates these variables from the environment variables and uses the values in methods such as ``.save()`` and ``.deploy()``. However, you can explicitly pass in values to overwrite the defaults. When you use a method that includes an instance of ``properties``, then ``properties`` records the values that you pass in. diff --git a/docs/source/user_guide/model_training/training_llm.rst b/docs/source/user_guide/model_training/training_llm.rst index 16e9ab3d2..bedd79116 100644 --- a/docs/source/user_guide/model_training/training_llm.rst +++ b/docs/source/user_guide/model_training/training_llm.rst @@ -16,9 +16,9 @@ This page shows an example of fine-tuning the `Llama 2 `_ repository contains example code to fine-tune llama2 model. 
-The example `fine-tuning script `_ support full parameter fine-tuning +The example `fine-tuning script `_ supports both full parameter fine-tuning and `Parameter-Efficient Fine-Tuning (PEFT) `_. -With ADS, you can start the training job by taking the source code directly from Github. +With ADS, you can start the training job by taking the source code directly from Github with no code change. Access the Pre-Trained Model ============================ @@ -49,13 +49,16 @@ The job run will: Note that in the training command, there is no need specify the number of nodes, or the number of GPUs. ADS will automatically configure that base on the ``replica`` and ``shape`` you specified. -The fine-tuning runs on the `samsum `_ dataset by default. You can also `add your custom datasets `_. +The fine-tuning runs on the `samsum `_ dataset by default. You can also `add your custom datasets `_. -The same training script also support Parameter-Efficient Fine-Tuning (PEFT). You can change the ``command`` to the following for PEFT with `LoRA `_ +Once the fine-tuning is finished, the checkpoints will be saved into OCI object storage bucket as specified. +You can `load the FSDP checkpoints for inferencing `_. + +The same training script also support Parameter-Efficient Fine-Tuning (PEFT). You can change the ``command`` to the following for PEFT with `LoRA `_. Note that for PEFT, the fine-tuned weights are stored in the location specified by ``--output_dir``, while for full parameter fine-tuning, the checkpoints are stored in the location specified by ``--dist_checkpoint_root_folder`` and ``--dist_checkpoint_folder`` .. code-block:: bash torchrun llama_finetuning.py --enable_fsdp --use_peft --peft_method lora \ - --pure_bf16 --batch_size_training 1 --micro_batch_size 1 \ - --model_name /home/datascience/llama --output_dir /home/datascience/outputs + --pure_bf16 --batch_size_training 1 \ + --model_name meta-llama/Llama-2-7b-hf --output_dir /home/datascience/outputs diff --git a/pyproject.toml b/pyproject.toml index b0fd11696..82fe8146a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -57,12 +57,12 @@ dependencies = [ "asteval>=0.9.25", "cerberus>=1.3.4", "cloudpickle>=1.6.0", - "fsspec>=0.8.7", + "fsspec>=0.8.7,<2023.9.1", # v2.9.1 introduced issues, revealed by unit tests "gitpython>=3.1.2", "jinja2>=2.11.2", "matplotlib>=3.1.3", "numpy>=1.19.2", - "oci>=2.104.3", + "oci>=2.113.0", "ocifs>=1.1.3", "pandas>1.2.1,<2.1", "psutil>=5.7.2", @@ -109,7 +109,7 @@ onnx = [ "lightgbm==3.3.1", "onnx>=1.12.0", "onnxmltools>=1.10.0", - "onnxruntime>=1.10.0", + "onnxruntime>=1.10.0,<1.16", # v1.16 introduced issues https://github.com/microsoft/onnxruntime/issues/17631, revealedd by unit tests "oracle_ads[viz]", "protobuf<=3.20", "skl2onnx>=1.10.4", diff --git a/tests/integration/feature_store/test_data/credit_score_batch_1.csv b/tests/integration/feature_store/test_data/credit_score_batch_1.csv new file mode 100644 index 000000000..475f4b9c6 --- /dev/null +++ b/tests/integration/feature_store/test_data/credit_score_batch_1.csv @@ -0,0 +1,4 @@ +user_id,date,credit_score +c123006815,01/01/22,568 +c123006815,01/01/22,568 +c123006850,05/02/22,740 \ No newline at end of file diff --git a/tests/integration/feature_store/test_data/credit_score_batch_2.csv b/tests/integration/feature_store/test_data/credit_score_batch_2.csv new file mode 100644 index 000000000..b18137693 --- /dev/null +++ b/tests/integration/feature_store/test_data/credit_score_batch_2.csv @@ -0,0 +1,8 @@ +user_id,date,credit_score +c123006818,04/01/22,571 
+c123006847,02/02/22,800 +c123006820,06/01/22,573 +c123006857,12/02/22,850 +c123006822,08/01/22,575 +c123006823,09/01/22,300 +c123006824,10/01/22,577 diff --git a/tests/integration/feature_store/test_streaming_dataframe_feature_group.py b/tests/integration/feature_store/test_streaming_dataframe_feature_group.py new file mode 100644 index 000000000..a5adf4d42 --- /dev/null +++ b/tests/integration/feature_store/test_streaming_dataframe_feature_group.py @@ -0,0 +1,185 @@ +import time + +from delta import configure_spark_with_delta_pip +from great_expectations.core import ExpectationSuite, ExpectationConfiguration +from pyspark.sql import SparkSession +from pyspark.sql.types import StructType + +from ads.feature_store.common.enums import TransformationMode, ExpectationType +from ads.feature_store.statistics_config import StatisticsConfig +from tests.integration.feature_store.test_base import FeatureStoreTestCase + + +def get_streaming_df(): + spark_builder = ( + SparkSession.builder.appName("FeatureStore") + .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") + .config( + "spark.sql.catalog.spark_catalog", + "org.apache.spark.sql.delta.catalog.DeltaCatalog", + ) + .enableHiveSupport() + ) + + spark = configure_spark_with_delta_pip( + spark_builder + ).getOrCreate() + + # Define the schema for the streaming data frame + credit_score_schema = StructType() \ + .add("user_id", "string") \ + .add("date", "string") \ + .add("credit_score", "string") + + credit_score_streaming_df = spark.readStream \ + .option("sep", ",") \ + .option("header", "true")\ + .schema(credit_score_schema) \ + .csv("test_data/") + + return credit_score_streaming_df + + +def credit_score_transformation(credit_score): + import pyspark.sql.functions as F + + # Create a new Spark DataFrame that contains the transformed credit score. + transformed_credit_score = credit_score.select( + "user_id", + "date", + F.when(F.col("credit_score").cast("int") > 500, 1).otherwise(0).alias("credit_score") + ) + + # Return the new Spark DataFrame. 
+ return transformed_credit_score + + +class TestFeatureGroupWithStreamingDataFrame(FeatureStoreTestCase): + """Contains integration tests for Feature Group Kwargs supported transformation.""" + + def create_transformation_resource_stream(self, feature_store) -> "Transformation": + transformation = feature_store.create_transformation( + source_code_func=credit_score_transformation, + display_name="credit_score_transformation", + transformation_mode=TransformationMode.SPARK, + ) + return transformation + + + def test_feature_group_materialization_with_streaming_data_frame(self): + fs = self.define_feature_store_resource().create() + assert fs.oci_fs.id + + entity = self.create_entity_resource(fs) + assert entity.oci_fs_entity.id + + transformation = self.create_transformation_resource_stream(fs) + streaming_df = get_streaming_df() + + stats_config = StatisticsConfig().with_is_enabled(False) + fg = entity.create_feature_group( + primary_keys=["User_id"], + schema_details_dataframe=streaming_df, + statistics_config=stats_config, + name=self.get_name("streaming_fg_1"), + transformation_id=transformation.id + ) + assert fg.oci_feature_group.id + + query = fg.materialise_stream(input_dataframe=streaming_df, + checkpoint_dir=f"test_data/checkpoint/{fg.name}") + + assert query + time.sleep(10) + query.stop() + + assert fg.select().read().count() == 10 + + self.clean_up_feature_group(fg) + self.clean_up_transformation(transformation) + self.clean_up_entity(entity) + self.clean_up_feature_store(fs) + + def test_feature_group_materialization_with_streaming_data_frame_and_expectation(self): + fs = self.define_feature_store_resource().create() + assert fs.oci_fs.id + + entity = self.create_entity_resource(fs) + assert entity.oci_fs_entity.id + + transformation = self.create_transformation_resource_stream(fs) + streaming_df = get_streaming_df() + + stats_config = StatisticsConfig().with_is_enabled(False) + # Initialize Expectation Suite + expectation_suite_trans = ExpectationSuite(expectation_suite_name="feature_definition") + expectation_suite_trans.add_expectation( + ExpectationConfiguration( + expectation_type="EXPECT_COLUMN_VALUES_TO_BE_NULL", kwargs={"column": "date"} + ) + ) + expectation_suite_trans.add_expectation( + ExpectationConfiguration( + expectation_type="EXPECT_COLUMN_VALUES_TO_NOT_BE_NULL", + kwargs={"column": "date"}, + ) + ) + + fg = entity.create_feature_group( + primary_keys=["User_id"], + schema_details_dataframe=streaming_df, + statistics_config=stats_config, + expectation_suite=expectation_suite_trans, + expectation_type=ExpectationType.LENIENT, + name=self.get_name("streaming_fg_2"), + transformation_id=transformation.id + ) + assert fg.oci_feature_group.id + + query = fg.materialise_stream(input_dataframe=streaming_df, + checkpoint_dir=f"test_data/checkpoint/{fg.name}") + + assert query + time.sleep(10) + query.stop() + + assert fg.select().read().count() == 10 + assert fg.get_validation_output().to_pandas() is None + + self.clean_up_feature_group(fg) + self.clean_up_transformation(transformation) + self.clean_up_entity(entity) + self.clean_up_feature_store(fs) + + def test_feature_group_materialization_with_streaming_data_frame_and_stats(self): + fs = self.define_feature_store_resource().create() + assert fs.oci_fs.id + + entity = self.create_entity_resource(fs) + assert entity.oci_fs_entity.id + + transformation = self.create_transformation_resource_stream(fs) + streaming_df = get_streaming_df() + + fg = entity.create_feature_group( + primary_keys=["User_id"], + 
schema_details_dataframe=streaming_df, + name=self.get_name("streaming_fg_3"), + transformation_id=transformation.id + ) + assert fg.oci_feature_group.id + + query = fg.materialise_stream(input_dataframe=streaming_df, + checkpoint_dir=f"test_data/checkpoint/{fg.name}") + + assert query + time.sleep(10) + query.stop() + + assert fg.select().read().count() == 10 + assert fg.get_statistics().to_pandas() is None + + self.clean_up_feature_group(fg) + self.clean_up_transformation(transformation) + self.clean_up_entity(entity) + self.clean_up_feature_store(fs) diff --git a/tests/unitary/default_setup/jobs/test_jobs_base.py b/tests/unitary/default_setup/jobs/test_jobs_base.py index e0b8feb64..4b2f51709 100644 --- a/tests/unitary/default_setup/jobs/test_jobs_base.py +++ b/tests/unitary/default_setup/jobs/test_jobs_base.py @@ -20,6 +20,7 @@ ScriptRuntime, NotebookRuntime, ) +from ads.jobs.builders.infrastructure.dsc_job import DataScienceJobRun from ads.jobs.builders.infrastructure.dsc_job_runtime import ( CondaRuntimeHandler, ScriptRuntimeHandler, @@ -603,3 +604,25 @@ def test_run_details_link_fail(self, mock_extract_region): test_run_instance = RunInstance() test_result = test_run_instance.run_details_link assert test_result == "" + + +class DataScienceJobMethodTest(DataScienceJobPayloadTest): + + @patch("ads.jobs.builders.infrastructure.dsc_job.DataScienceJobRun.cancel") + @patch("ads.jobs.ads_job.Job.run_list") + def test_job_cancel(self, mock_run_list, mock_cancel): + mock_run_list.return_value = [ + DataScienceJobRun( + lifecycle_state="CANCELED" + ) + ] * 3 + + job = ( + Job(name="test") + .with_infrastructure(infrastructure.DataScienceJob()) + .with_runtime(ScriptRuntime().with_script(self.SCRIPT_URI)) + ) + + job.cancel() + mock_run_list.assert_called() + mock_cancel.assert_called_with(wait_for_completion=False) diff --git a/tests/unitary/default_setup/jobs/test_jobs_mount_file_system.py b/tests/unitary/default_setup/jobs/test_jobs_mount_file_system.py index d014331fc..fe7aaf80b 100644 --- a/tests/unitary/default_setup/jobs/test_jobs_mount_file_system.py +++ b/tests/unitary/default_setup/jobs/test_jobs_mount_file_system.py @@ -15,7 +15,6 @@ from ads.jobs.builders.runtimes.python_runtime import PythonRuntime try: - from oci.data_science.models import JobStorageMountConfigurationDetails from oci.data_science.models import FileStorageMountConfigurationDetails from oci.data_science.models import ObjectStorageMountConfigurationDetails except (ImportError, AttributeError) as e: @@ -50,6 +49,7 @@ FileStorageMountConfigurationDetails( **{ "destination_directory_name": "test_destination_directory_name_from_dsc", + "destination_path": "/test_destination_path", "export_id": "export_id_from_dsc", "mount_target_id": "mount_target_id_from_dsc", "storage_type": "FILE_STORAGE", @@ -58,6 +58,7 @@ FileStorageMountConfigurationDetails( **{ "destination_directory_name": "test_destination_directory_name_from_dsc", + "destination_path": "/test_destination_path", "export_id": "export_id_from_dsc", "mount_target_id": "mount_target_id_from_dsc", "storage_type": "FILE_STORAGE", @@ -80,15 +81,15 @@ .with_storage_mount( { "src" : "1.1.1.1:test_export_path_one", - "dest" : "test_mount_one", + "dest" : "/test_path_one/test_mount_one", }, { "src" : "2.2.2.2:test_export_path_two", - "dest" : "test_mount_two", + "dest" : "/test_path_two/test_mount_two", }, { "src" : "oci://bucket_name@namespace/synthetic/", - "dest" : "test_mount_three", + "dest" : "/test_path_three/test_mount_three", } ) ) @@ -114,11 +115,11 @@ shapeName: 
VM.Standard.E3.Flex storageMount: - src: 1.1.1.1:test_export_path_one - dest: test_mount_one + dest: /test_path_one/test_mount_one - src: 2.2.2.2:test_export_path_two - dest: test_mount_two + dest: /test_path_two/test_mount_two - src: oci://bucket_name@namespace/synthetic/ - dest: test_mount_three + dest: /test_path_three/test_mount_three subnetId: ocid1.subnet.oc1.iad.xxxx type: dataScienceJob name: My Job @@ -142,17 +143,17 @@ def test_data_science_job_initialize(self): dsc_file_storage_one = job.infrastructure.storage_mount[0] assert isinstance(dsc_file_storage_one, dict) assert dsc_file_storage_one["src"] == "1.1.1.1:test_export_path_one" - assert dsc_file_storage_one["dest"] == "test_mount_one" + assert dsc_file_storage_one["dest"] == "/test_path_one/test_mount_one" dsc_file_storage_two = job.infrastructure.storage_mount[1] assert isinstance(dsc_file_storage_two, dict) assert dsc_file_storage_two["src"] == "2.2.2.2:test_export_path_two" - assert dsc_file_storage_two["dest"] == "test_mount_two" + assert dsc_file_storage_two["dest"] == "/test_path_two/test_mount_two" dsc_object_storage = job.infrastructure.storage_mount[2] assert isinstance(dsc_object_storage, dict) assert dsc_object_storage["src"] == "oci://bucket_name@namespace/synthetic/" - assert dsc_object_storage["dest"] == "test_mount_three" + assert dsc_object_storage["dest"] == "/test_path_three/test_mount_three" def test_data_science_job_from_yaml(self): job_from_yaml = Job.from_yaml(job_yaml_string) @@ -161,17 +162,17 @@ def test_data_science_job_from_yaml(self): dsc_file_storage_one = job_from_yaml.infrastructure.storage_mount[0] assert isinstance(dsc_file_storage_one, dict) assert dsc_file_storage_one["src"] == "1.1.1.1:test_export_path_one" - assert dsc_file_storage_one["dest"] == "test_mount_one" + assert dsc_file_storage_one["dest"] == "/test_path_one/test_mount_one" dsc_file_storage_two = job.infrastructure.storage_mount[1] assert isinstance(dsc_file_storage_two, dict) assert dsc_file_storage_two["src"] == "2.2.2.2:test_export_path_two" - assert dsc_file_storage_two["dest"] == "test_mount_two" + assert dsc_file_storage_two["dest"] == "/test_path_two/test_mount_two" dsc_object_storage = job.infrastructure.storage_mount[2] assert isinstance(dsc_object_storage, dict) assert dsc_object_storage["src"] == "oci://bucket_name@namespace/synthetic/" - assert dsc_object_storage["dest"] == "test_mount_three" + assert dsc_object_storage["dest"] == "/test_path_three/test_mount_three" def test_data_science_job_to_dict(self): assert job.to_dict() == { @@ -201,15 +202,15 @@ def test_data_science_job_to_dict(self): "storageMount": [ { "src" : "1.1.1.1:test_export_path_one", - "dest" : "test_mount_one", + "dest" : "/test_path_one/test_mount_one", }, { "src" : "2.2.2.2:test_export_path_two", - "dest" : "test_mount_two", + "dest" : "/test_path_two/test_mount_two", }, { "src" : "oci://bucket_name@namespace/synthetic/", - "dest" : "test_mount_three", + "dest" : "/test_path_three/test_mount_three", } ], }, @@ -260,11 +261,11 @@ def test_update_storage_mount_from_dsc_model( assert isinstance(infrastructure.storage_mount[1], dict) assert infrastructure.storage_mount[0] == { "src" : "mount_target_id_from_dsc:export_id_from_dsc", - "dest" : "test_destination_directory_name_from_dsc" + "dest" : "/test_destination_path/test_destination_directory_name_from_dsc" } assert infrastructure.storage_mount[1] == { "src" : "mount_target_id_from_dsc:export_id_from_dsc", - "dest" : "test_destination_directory_name_from_dsc" + "dest" : 
"/test_destination_path/test_destination_directory_name_from_dsc" } @patch.object(OCIFileStorage, "update_to_dsc_model") @@ -276,6 +277,7 @@ def test_update_job_infra( mock_update_to_dsc_model.return_value = { "destinationDirectoryName": "test_destination_directory_name_from_dsc", + "destination_path": "/test_destination_path", "exportId": "test_export_id_one", "mountTargetId": "test_mount_target_id_one", "storageType": "FILE_STORAGE", @@ -293,6 +295,7 @@ def test_update_job_infra( 0 ] == { "destinationDirectoryName": "test_destination_directory_name_from_dsc", + "destination_path": "/test_destination_path", "exportId": "test_export_id_one", "mountTargetId": "test_mount_target_id_one", "storageType": "FILE_STORAGE", @@ -358,7 +361,7 @@ def test_file_manager_process_data_error(self): def test_dsc_object_storage(self): object_storage = OCIObjectStorage( src="oci://bucket@namespace/prefix", - dest="test_dest", + dest="/test_path/test_dest", ) result = object_storage.update_to_dsc_model() @@ -367,10 +370,12 @@ def test_dsc_object_storage(self): assert result["prefix"] == "prefix" assert result["storageType"] == "OBJECT_STORAGE" assert result["destinationDirectoryName"] == "test_dest" + assert result["destinationPath"] == "/test_path" dsc_model = ObjectStorageMountConfigurationDetails( **{ "destination_directory_name": "test_destination_directory_name_from_dsc", + "destination_path": "/test_destination_path", "storage_type": "OBJECT_STORAGE", "bucket": "bucket", "namespace": "namespace", @@ -380,17 +385,18 @@ def test_dsc_object_storage(self): result = OCIObjectStorage.update_from_dsc_model(dsc_model) assert result["src"] == "oci://bucket@namespace/prefix" - assert result["dest"] == "test_destination_directory_name_from_dsc" + assert result["dest"] == "/test_destination_path/test_destination_directory_name_from_dsc" def test_dsc_object_storage_error(self): error_messages = { "namespace" : "Missing parameter `namespace` from service. Check service log to see the error.", "bucket" : "Missing parameter `bucket` from service. Check service log to see the error.", - "destination_directory_name" : "Missing parameter `destination_directory_name` from service. Check service log to see the error." + "destination_directory_name" : "Missing parameter `destination_directory_name` from service. 
Check service log to see the error.", } dsc_model_dict = { "destination_directory_name": "test_destination_directory_name_from_dsc", + "destination_path": "/test_path", "storage_type": "OBJECT_STORAGE", "bucket": "bucket", "namespace": "namespace", @@ -412,11 +418,12 @@ def test_dsc_object_storage_error(self): def test_dsc_file_storage(self, mock_search_resources): file_storage = OCIFileStorage( src="ocid1.mounttarget.oc1.iad.xxxx:ocid1.export.oc1.iad.xxxx", - dest="test_dest", + dest="/test_path/test_dest", ) file_storage = file_storage.update_to_dsc_model() assert file_storage == { "destinationDirectoryName" : "test_dest", + "destinationPath" : "/test_path", "exportId" : "ocid1.export.oc1.iad.xxxx", "mountTargetId" : "ocid1.mounttarget.oc1.iad.xxxx", "storageType" : "FILE_STORAGE" @@ -424,7 +431,7 @@ def test_dsc_file_storage(self, mock_search_resources): file_storage = OCIFileStorage( src="1.1.1.1:/test_export", - dest="test_dest", + dest="/test_path/test_dest", ) items = [ @@ -477,6 +484,7 @@ def test_dsc_file_storage(self, mock_search_resources): file_storage = file_storage.update_to_dsc_model() assert file_storage == { "destinationDirectoryName" : "test_dest", + "destinationPath" : "/test_path", "exportId" : "ocid1.export.oc1.iad.xxxx", "mountTargetId" : "ocid1.mounttarget.oc1.iad.xxxx", "storageType" : "FILE_STORAGE" @@ -485,6 +493,7 @@ def test_dsc_file_storage(self, mock_search_resources): dsc_model = FileStorageMountConfigurationDetails( **{ "destination_directory_name": "test_dest", + "destination_path" : "/test_path", "storage_type": "FILE_STORAGE", "export_id": "ocid1.export.oc1.iad.xxxx", "mount_target_id": "ocid1.mounttarget.oc1.iad.xxxx" @@ -492,17 +501,18 @@ def test_dsc_file_storage(self, mock_search_resources): ) result = OCIFileStorage.update_from_dsc_model(dsc_model) assert result["src"] == "ocid1.mounttarget.oc1.iad.xxxx:ocid1.export.oc1.iad.xxxx" - assert result["dest"] == "test_dest" + assert result["dest"] == "/test_path/test_dest" def test_dsc_file_storage_error(self): error_messages = { "mount_target_id" : "Missing parameter `mount_target_id` from service. Check service log to see the error.", "export_id" : "Missing parameter `export_id` from service. Check service log to see the error.", - "destination_directory_name" : "Missing parameter `destination_directory_name` from service. Check service log to see the error." + "destination_directory_name" : "Missing parameter `destination_directory_name` from service. 
Check service log to see the error.", } dsc_model_dict = { "destination_directory_name": "test_destination_directory_name_from_dsc", + "destination_path": "/test_path", "storage_type": "FILE_STORAGE", "mount_target_id": "ocid1.mounttarget.oc1.iad.xxxx", "export_id": "ocid1.export.oc1.iad.xxxx", @@ -517,4 +527,25 @@ def test_dsc_file_storage_error(self): dsc_model_copy.pop(error) OCIFileStorage.update_from_dsc_model( FileStorageMountConfigurationDetails(**dsc_model_copy) - ) \ No newline at end of file + ) + + def test_get_destination_path_and_name(self): + path, directory = OCIFileStorage.get_destination_path_and_name("abc") + + assert path == None + assert directory == "abc" + + path, directory = OCIFileStorage.get_destination_path_and_name("/abc") + + assert path == "/" + assert directory == "abc" + + path, directory = OCIFileStorage.get_destination_path_and_name("/abc/def") + + assert path == "/abc" + assert directory == "def" + + path, directory = OCIFileStorage.get_destination_path_and_name("/abc/def/ghi") + + assert path == "/abc/def" + assert directory == "ghi" diff --git a/tests/unitary/default_setup/model/test_artifact_uploader.py b/tests/unitary/default_setup/model/test_artifact_uploader.py index bc9daeabf..ef9372728 100644 --- a/tests/unitary/default_setup/model/test_artifact_uploader.py +++ b/tests/unitary/default_setup/model/test_artifact_uploader.py @@ -29,6 +29,7 @@ def setup_class(cls): cls.mock_artifact_zip_path = os.path.join( cls.curr_dir, "test_files/model_artifacts.zip" ) + cls.mock_oci_artifact_path = "oci://bucket-name@namespace/model_artifacts.zip" def teardown_class(cls): # if os.path.exists(cls.mock_artifact_path): @@ -73,6 +74,32 @@ def test__init__(self): overwrite_existing_artifact=False, remove_existing_artifact=False, ) + + invalid_artifact_path = "oci://my-bucket@my-tenancy/mymodel" + with pytest.raises(ValueError): + lg_artifact_uploader = LargeArtifactUploader( + dsc_model=self.mock_dsc_model, + artifact_path=invalid_artifact_path, + auth=self.mock_auth, + region=self.mock_region, + overwrite_existing_artifact=False, + remove_existing_artifact=False, + ) + + with patch("ads.common.utils.is_path_exists", return_value=False): + with pytest.raises( + ValueError, + match=f"The `{self.mock_oci_artifact_path}` does not exist", + ): + lg_artifact_uploader = LargeArtifactUploader( + dsc_model=self.mock_dsc_model, + artifact_path=self.mock_oci_artifact_path, + auth=self.mock_auth, + region=self.mock_region, + overwrite_existing_artifact=False, + remove_existing_artifact=False, + ) + auth = default_signer() lg_artifact_uploader = LargeArtifactUploader( dsc_model=self.mock_dsc_model, @@ -97,14 +124,25 @@ def test__init__(self): == DEFAULT_PARALLEL_PROCESS_COUNT ) - def test_prepare_artiact_tmp_zip(self): + with patch("ads.common.utils.is_path_exists", return_value=True): + uploader = LargeArtifactUploader( + dsc_model=self.mock_dsc_model, + artifact_path=self.mock_oci_artifact_path, + overwrite_existing_artifact=False, + remove_existing_artifact=False, + ) + assert uploader.artifact_path == self.mock_oci_artifact_path + assert uploader.bucket_uri == self.mock_oci_artifact_path + assert uploader.artifact_zip_path == None + + def test_prepare_artifact_tmp_zip(self): # Tests case when a folder provided as artifacts location with patch("ads.model.common.utils.zip_artifact") as mock_zip_artifact: mock_zip_artifact.return_value = "test_artifact.zip" artifact_uploader = SmallArtifactUploader( dsc_model=self.mock_dsc_model, artifact_path=self.mock_artifact_path ) - 
test_result = artifact_uploader._prepare_artiact_tmp_zip() + test_result = artifact_uploader._prepare_artifact_tmp_zip() assert test_result == "test_artifact.zip" # Tests case when a zip file provided as artifacts location @@ -114,17 +152,26 @@ def test_prepare_artiact_tmp_zip(self): dsc_model=self.mock_dsc_model, artifact_path=self.mock_artifact_path + ".zip", ) - test_result = artifact_uploader._prepare_artiact_tmp_zip() + test_result = artifact_uploader._prepare_artifact_tmp_zip() assert test_result == self.mock_artifact_path + ".zip" - def test_remove_artiact_tmp_zip(self): + # Tests case when a zip file provided as object storage path + with patch("ads.common.utils.is_path_exists", return_value=True): + artifact_uploader = LargeArtifactUploader( + dsc_model=self.mock_dsc_model, + artifact_path=self.mock_oci_artifact_path, + ) + test_result = artifact_uploader._prepare_artifact_tmp_zip() + assert test_result == self.mock_oci_artifact_path + + def test_remove_artifact_tmp_zip(self): artifact_uploader = SmallArtifactUploader( dsc_model=self.mock_dsc_model, artifact_path=self.mock_artifact_path ) with patch("shutil.rmtree") as mock_rmtree: # Tests case when tmp artifact needs to be removed artifact_uploader.artifact_zip_path = "artifacts.zip" - artifact_uploader._remove_artiact_tmp_zip() + artifact_uploader._remove_artifact_tmp_zip() mock_rmtree.assert_called_with("artifacts.zip", ignore_errors=True) with patch("os.path.exists", return_value=True): @@ -135,41 +182,44 @@ def test_remove_artiact_tmp_zip(self): # Tests case when tmp artifact shouldn't be removed artifact_uploader.artifact_zip_path = "artifacts.zip" artifact_uploader.artifact_path = "artifacts.zip" - artifact_uploader._remove_artiact_tmp_zip() + artifact_uploader._remove_artifact_tmp_zip() mock_rmtree.assert_not_called() @patch.object(SmallArtifactUploader, "_upload") - @patch.object(SmallArtifactUploader, "_prepare_artiact_tmp_zip") - @patch.object(SmallArtifactUploader, "_remove_artiact_tmp_zip") + @patch.object(SmallArtifactUploader, "_prepare_artifact_tmp_zip") + @patch.object(SmallArtifactUploader, "_remove_artifact_tmp_zip") def test_upload( - self, mock__remove_artiact_tmp_zip, mock__prepare_artiact_tmp_zip, mock__upload + self, + mock__remove_artifact_tmp_zip, + mock__prepare_artifact_tmp_zip, + mock__upload, ): artifact_uploader = SmallArtifactUploader( dsc_model=self.mock_dsc_model, artifact_path=self.mock_artifact_path ) artifact_uploader.upload() - mock__remove_artiact_tmp_zip.assert_called() - mock__prepare_artiact_tmp_zip.assert_called() + mock__remove_artifact_tmp_zip.assert_called() + mock__prepare_artifact_tmp_zip.assert_called() mock__upload.assert_called() def test_upload_small_artifact(self): with open(self.mock_artifact_zip_path, "rb") as file_data: with patch.object( SmallArtifactUploader, - "_prepare_artiact_tmp_zip", + "_prepare_artifact_tmp_zip", return_value=self.mock_artifact_zip_path, - ) as mock_prepare_artiact_tmp_zip: + ) as mock_prepare_artifact_tmp_zip: with patch.object( - SmallArtifactUploader, "_remove_artiact_tmp_zip" - ) as mock_remove_artiact_tmp_zip: + SmallArtifactUploader, "_remove_artifact_tmp_zip" + ) as mock_remove_artifact_tmp_zip: artifact_uploader = SmallArtifactUploader( dsc_model=self.mock_dsc_model, artifact_path=self.mock_artifact_path, ) artifact_uploader.artifact_zip_path = self.mock_artifact_zip_path artifact_uploader.upload() - mock_prepare_artiact_tmp_zip.assert_called() - mock_remove_artiact_tmp_zip.assert_called() + mock_prepare_artifact_tmp_zip.assert_called() + 
mock_remove_artifact_tmp_zip.assert_called() self.mock_dsc_model.create_model_artifact.assert_called() @patch("ads.common.utils.is_path_exists") @@ -215,6 +265,24 @@ def test_upload_large_artifact(self, mock_upload, mock_is_path_exists): ) artifact_uploader.upload() + @patch("ads.common.utils.is_path_exists", return_value=True) + @patch("ads.common.utils.upload_to_os") + def test_skip_upload(self, mock_upload, mock_is_path_exists): + """Tests case when provided artifact is object storage path.""" + artifact_uploader = LargeArtifactUploader( + dsc_model=self.mock_dsc_model, + artifact_path=self.mock_oci_artifact_path, + auth=default_signer(), + region=self.mock_region, + overwrite_existing_artifact=False, + remove_existing_artifact=False, + ) + artifact_uploader.upload() + mock_upload.assert_not_called() + self.mock_dsc_model.export_model_artifact.assert_called_with( + bucket_uri=self.mock_oci_artifact_path, region=self.mock_region + ) + def test_zip_artifact_fail(self): with pytest.raises(ValueError, match="The `artifact_dir` must be provided."): zip_artifact(None) diff --git a/tests/unitary/default_setup/model_deployment/test_model_deployment.py b/tests/unitary/default_setup/model_deployment/test_model_deployment.py index 21bd5dc74..9018e7729 100644 --- a/tests/unitary/default_setup/model_deployment/test_model_deployment.py +++ b/tests/unitary/default_setup/model_deployment/test_model_deployment.py @@ -14,13 +14,17 @@ ModelDeployment, ModelDeploymentProperties, ) +from ads.model.deployment.model_deployment_infrastructure import ModelDeploymentInfrastructure +from ads.model.deployment.model_deployment_runtime import ModelDeploymentCondaRuntime class ModelDeploymentTestCase(unittest.TestCase): MODEL_ID = "" with patch.object(oci_client, "OCIClientFactory"): test_model_deployment = ModelDeployment( - model_deployment_id="test_model_deployment_id", properties={} + model_deployment_id="test_model_deployment_id", properties={}, + infrastructure=ModelDeploymentInfrastructure(), + runtime=ModelDeploymentCondaRuntime() ) @patch("requests.post") diff --git a/tests/unitary/default_setup/pipeline/test_pipeline.py b/tests/unitary/default_setup/pipeline/test_pipeline.py index 49c8a4348..302dfe065 100644 --- a/tests/unitary/default_setup/pipeline/test_pipeline.py +++ b/tests/unitary/default_setup/pipeline/test_pipeline.py @@ -333,6 +333,11 @@ def test_pipeline_define(self): "name": "TestPipelineStepOne", "jobId": "TestJobIdOne", "description": "Test description one", + "commandLineArguments": "ARGUMENT --KEY VALUE", + "environmentVariables": { + "ENV": "VALUE" + }, + "maximumRuntimeInMinutes": 20 }, }, { @@ -1060,6 +1065,11 @@ def test_pipeline_to_dict(self): "name": "TestPipelineStepOne", "jobId": "TestJobIdOne", "description": "Test description one", + "commandLineArguments": "ARGUMENT --KEY VALUE", + "environmentVariables": { + "ENV": "VALUE" + }, + "maximumRuntimeInMinutes": 20 }, }, { diff --git a/tests/unitary/default_setup/pipeline/test_pipeline_step.py b/tests/unitary/default_setup/pipeline/test_pipeline_step.py index 403e55917..d334bc27b 100644 --- a/tests/unitary/default_setup/pipeline/test_pipeline_step.py +++ b/tests/unitary/default_setup/pipeline/test_pipeline_step.py @@ -38,6 +38,9 @@ class DataSciencePipelineStepBaseTest(unittest.TestCase): PipelineStep("TestUpstreamPipelineStepOne") .with_description("Test upstream pipeline step description one") .with_job_id("TestJobIdOne") + .with_environment_variable(**{"test_key": "test_value"}) + .with_maximum_runtime_in_minutes(20) + 
.with_argument("--key val path/to/file") ) upstream_pipeline_step_two = ( @@ -85,6 +88,11 @@ def test_pipeline_step_to_dict(self): "name": "TestUpstreamPipelineStepOne", "jobId": "TestJobIdOne", "description": "Test upstream pipeline step description one", + "environmentVariables": { + "test_key": "test_value" + }, + "commandLineArguments": "--key val path/to/file", + "maximumRuntimeInMinutes": 20 }, } diff --git a/tests/unitary/with_extras/model/test_artifact.py b/tests/unitary/with_extras/model/test_artifact.py index e55f7c36c..e8f824894 100644 --- a/tests/unitary/with_extras/model/test_artifact.py +++ b/tests/unitary/with_extras/model/test_artifact.py @@ -138,10 +138,23 @@ def test_prepare_runtime_yaml_inference_training( (TrainingEnvInfo, mlcpu_path_cust, None, None, training_info_cust), ], ) + @patch("ads.model.runtime.env_info.get_service_packs") def test__populate_env_info_inference( - self, env_info_class, conda_pack, bucketname, namespace, expected_env_info + self, mock_get_service_packs, env_info_class, conda_pack, bucketname, namespace, expected_env_info ): """test _populate_env_info.""" + env_path = ( + expected_env_info.inference_env_path if isinstance(expected_env_info, InferenceEnvInfo) + else expected_env_info.training_env_path + ) + mock_get_service_packs.return_value = ( + { + env_path : ("mlcpuv1", "3.6") + }, + { + "mlcpuv1" : (env_path, "3.6") + } + ) env_info = self.artifact._populate_env_info( env_info_class, conda_pack=conda_pack, diff --git a/tests/unitary/with_extras/model/test_env_info.py b/tests/unitary/with_extras/model/test_env_info.py index 8f3ac8ade..cf555f68c 100644 --- a/tests/unitary/with_extras/model/test_env_info.py +++ b/tests/unitary/with_extras/model/test_env_info.py @@ -177,7 +177,16 @@ def test_from_slug_prod_sp(self): info.training_env_type = "service_pack" info.training_python_version = "3.6" - def test_from_slug_not_exist(self): + @patch("ads.model.runtime.env_info.get_service_packs") + def test_from_slug_not_exist(self, mock_get_service_packs): + mock_get_service_packs.return_value = ( + { + "test_path" : ("mlcpuv1", "3.6"), + }, + { + "mlcpuv1" : ("test_path", "3.6"), + } + ) with pytest.warns(UserWarning, match="not a service pack"): TrainingEnvInfo.from_slug( "not_exist", namespace="ociodscdev", bucketname="service-conda-packs" @@ -256,7 +265,16 @@ def test_from_slug_prod_sp(self): info.inference_env_type = "service_pack" info.inference_python_version = "3.6" - def test_from_slug_not_exist(self): + @patch("ads.model.runtime.env_info.get_service_packs") + def test_from_slug_not_exist(self, mock_get_service_packs): + mock_get_service_packs.return_value = ( + { + "test_path" : ("mlcpuv1", "3.6"), + }, + { + "mlcpuv1" : ("test_path", "3.6"), + } + ) with pytest.warns(UserWarning, match="not a service pack"): InferenceEnvInfo.from_slug( "not_exist", namespace="ociodscdev", bucketname="service-conda-packs" diff --git a/tests/unitary/with_extras/model/test_generic_model.py b/tests/unitary/with_extras/model/test_generic_model.py index 7773b8a84..afe1f9f8a 100644 --- a/tests/unitary/with_extras/model/test_generic_model.py +++ b/tests/unitary/with_extras/model/test_generic_model.py @@ -279,14 +279,29 @@ def test_prepare_fail(self, mock_handle_model_file_name): "oci://service-conda-packs@ociodscdev/service_pack/cpu/General_Machine_Learning_for_CPUs/1.0/mlcpuv1" ) + @patch("ads.model.runtime.env_info.get_service_packs") @patch("ads.common.auth.default_signer") - def test_prepare_both_conda_env(self, mock_signer): + def test_prepare_both_conda_env(self, 
mock_signer, mock_get_service_packs): """prepare a model by only providing inference conda env.""" + inference_conda_env="oci://service-conda-packs@ociodscdev/service_pack/cpu/General_Machine_Learning_for_CPUs/1.0/mlcpuv1" + inference_python_version="3.6" + training_conda_env="oci://service-conda-packs@ociodscdev/service_pack/cpu/Oracle_Database_for_CPU_Python_3.7/1.0/database_p37_cpu_v1" + training_python_version="3.7" + mock_get_service_packs.return_value = ( + { + inference_conda_env : ("mlcpuv1", inference_python_version), + training_conda_env : ("database_p37_cpu_v1", training_python_version) + }, + { + "mlcpuv1" : (inference_conda_env, inference_python_version), + "database_p37_cpu_v1" : (training_conda_env, training_python_version) + } + ) self.generic_model.prepare( - inference_conda_env="oci://service-conda-packs@ociodscdev/service_pack/cpu/General_Machine_Learning_for_CPUs/1.0/mlcpuv1", - inference_python_version="3.6", - training_conda_env="oci://service-conda-packs@ociodscdev/service_pack/cpu/Oracle_Database_for_CPU_Python_3.7/1.0/database_p37_cpu_v1", - training_python_version="3.7", + inference_conda_env=inference_conda_env, + inference_python_version=inference_python_version, + training_conda_env=training_conda_env, + training_python_version=training_python_version, model_file_name="fake_model_name", force_overwrite=True, ) @@ -349,8 +364,19 @@ def test_reload(self): @patch.object(GenericModel, "_random_display_name", return_value="test_name") @patch.object(DataScienceModel, "create") - def test_save(self, mock_dsc_model_create, mock__random_display_name): + @patch("ads.model.runtime.env_info.get_service_packs") + def test_save(self, mock_get_service_packs, mock_dsc_model_create, mock__random_display_name): """test saving a model to artifact.""" + inference_conda_env="oci://service-conda-packs@ociodscdev/service_pack/cpu/Data_Exploration_and_Manipulation_for_CPU_Python_3.7/3.0/dataexpl_p37_cpu_v3" + inference_python_version="3.7" + mock_get_service_packs.return_value = ( + { + inference_conda_env : ("dataexpl_p37_cpu_v3", inference_python_version), + }, + { + "dataexpl_p37_cpu_v3" : (inference_conda_env, inference_python_version), + } + ) mock_dsc_model_create.return_value = MagicMock(id="fake_id") self.generic_model.prepare( inference_conda_env="dataexpl_p37_cpu_v3", @@ -360,7 +386,7 @@ def test_save(self, mock_dsc_model_create, mock__random_display_name): force_overwrite=True, training_id=None, ) - self.generic_model.save() + self.generic_model.save(ignore_introspection=True) assert self.generic_model.model_id is not None and isinstance( self.generic_model.model_id, str ) @@ -371,8 +397,19 @@ def test_save(self, mock_dsc_model_create, mock__random_display_name): parallel_process_count=utils.DEFAULT_PARALLEL_PROCESS_COUNT, ) - def test_save_not_implemented_error(self): + @patch("ads.model.runtime.env_info.get_service_packs") + def test_save_not_implemented_error(self, mock_get_service_packs): """test saving a model to artifact.""" + inference_conda_env="oci://service-conda-packs@ociodscdev/service_pack/cpu/Data_Exploration_and_Manipulation_for_CPU_Python_3.7/3.0/dataexpl_p37_cpu_v3" + inference_python_version="3.7" + mock_get_service_packs.return_value = ( + { + inference_conda_env : ("dataexpl_p37_cpu_v3", inference_python_version), + }, + { + "dataexpl_p37_cpu_v3" : (inference_conda_env, inference_python_version), + } + ) self.generic_model._serialize = False self.generic_model.prepare( inference_conda_env="dataexpl_p37_cpu_v3", diff --git 
a/tests/unitary/with_extras/model/test_model_deployment_details.py b/tests/unitary/with_extras/model/test_model_deployment_details.py index e6122cb3f..eeb2a9de4 100644 --- a/tests/unitary/with_extras/model/test_model_deployment_details.py +++ b/tests/unitary/with_extras/model/test_model_deployment_details.py @@ -5,6 +5,7 @@ import os from unittest import TestCase +from unittest.mock import patch import yaml from ads.model.runtime.model_deployment_details import ModelDeploymentDetails @@ -27,7 +28,18 @@ def setUpClass(cls): with open(os.path.join(curr_dir, "runtime_fail.yaml"), encoding="utf-8") as rt: cls.runtime_dict_fail = yaml.load(rt, loader) - def test_from_dict(self): + @patch("ads.model.runtime.env_info.get_service_packs") + def test_from_dict(self, mock_get_service_packs): + inference_conda_env="oci://service-conda-packs@ociodscdev/service_pack/cpu/General_Machine_Learning_for_CPUs/1.0/mlcpuv1" + inference_python_version="3.6" + mock_get_service_packs.return_value = ( + { + inference_conda_env : ("mlcpuv1", inference_python_version), + }, + { + "mlcpuv1" : (inference_conda_env, inference_python_version), + } + ) model_deployment = ModelDeploymentDetails.from_dict( self.runtime_dict["MODEL_DEPLOYMENT"] ) diff --git a/tests/unitary/with_extras/model/test_model_metadata_mixin.py b/tests/unitary/with_extras/model/test_model_metadata_mixin.py index 520aab631..ad9b41f7e 100644 --- a/tests/unitary/with_extras/model/test_model_metadata_mixin.py +++ b/tests/unitary/with_extras/model/test_model_metadata_mixin.py @@ -3,6 +3,7 @@ # Copyright (c) 2022, 2023 Oracle and/or its affiliates. # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ +from unittest.mock import patch import numpy as np import pytest from sklearn import datasets, linear_model @@ -104,7 +105,19 @@ def test_metadata_generic_model_container_runtime(self): ) assert model.metadata_provenance.training_id is None - def test_metadata_sklearn_model(self): + @patch("ads.model.runtime.env_info.get_service_packs") + def test_metadata_sklearn_model(self, mock_get_service_packs): + conda_env="oci://service-conda-packs@ociodscdev/service_pack/cpu/Data_Exploration_and_Manipulation_for_CPU_Python_3.7/3.0/dataexpl_p37_cpu_v3" + python_version="3.7" + mock_get_service_packs.return_value = ( + { + conda_env : ("dataexpl_p37_cpu_v3", python_version), + }, + { + "dataexpl_p37_cpu_v3" : (conda_env, python_version), + + } + ) model = SklearnModel(self.rgr, artifact_dir="./test_sklearn") model.prepare( inference_conda_env="dataexpl_p37_cpu_v3", @@ -146,7 +159,19 @@ def test_metadata_sklearn_model(self): ) assert model.metadata_provenance.training_id is None - def test_metadata_xgboost_model(self): + @patch("ads.model.runtime.env_info.get_service_packs") + def test_metadata_xgboost_model(self, mock_get_service_packs): + conda_env="oci://service-conda-packs@ociodscdev/service_pack/cpu/Data_Exploration_and_Manipulation_for_CPU_Python_3.7/3.0/dataexpl_p37_cpu_v3" + python_version="3.7" + mock_get_service_packs.return_value = ( + { + conda_env : ("dataexpl_p37_cpu_v3", python_version), + }, + { + "dataexpl_p37_cpu_v3" : (conda_env, python_version), + + } + ) model = XGBoostModel(self.xgb_rgr, artifact_dir="./test_xgboost") model.prepare( inference_conda_env="dataexpl_p37_cpu_v3", @@ -192,7 +217,7 @@ def test_metadata_xgboost_model(self): assert model.metadata_provenance.training_id is None assert ( model.runtime_info.model_deployment.inference_conda_env.inference_env_path - == 
"oci://service-conda-packs@id19sfcrra6z/service_pack/cpu/Data_Exploration_and_Manipulation_for_CPU_Python_3.7/3.0/dataexpl_p37_cpu_v3" + == "oci://service-conda-packs@ociodscdev/service_pack/cpu/Data_Exploration_and_Manipulation_for_CPU_Python_3.7/3.0/dataexpl_p37_cpu_v3" ) def teardown_method(self): diff --git a/tests/unitary/with_extras/model/test_model_provenance_details.py b/tests/unitary/with_extras/model/test_model_provenance_details.py index 299443410..499be7c19 100644 --- a/tests/unitary/with_extras/model/test_model_provenance_details.py +++ b/tests/unitary/with_extras/model/test_model_provenance_details.py @@ -4,6 +4,7 @@ # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/ import os +from unittest.mock import patch import yaml from ads.model.runtime.model_provenance_details import ( @@ -57,8 +58,19 @@ def setup_class(cls): with open(os.path.join(curr_dir, "runtime_fail.yaml"), encoding="utf-8") as rt: cls.runtime_dict_fail = yaml.load(rt, loader) - def test_from_dict(self): - + @patch("ads.model.runtime.env_info.get_service_packs") + def test_from_dict(self, mock_get_service_packs): + conda_env="oci://service_conda_packs@ociodscdev/service_pack/cpu/General_Machine_Learning_for_CPUs/1.0/mlcpuv1" + python_version="3.6" + mock_get_service_packs.return_value = ( + { + conda_env : ("mlcpuv1", python_version), + }, + { + "mlcpuv1" : (conda_env, python_version), + + } + ) model_provenance = ModelProvenanceDetails.from_dict( self.runtime_dict["MODEL_PROVENANCE"] ) diff --git a/tests/unitary/with_extras/model/test_runtime_info.py b/tests/unitary/with_extras/model/test_runtime_info.py index c4da7f6be..2297a1f2b 100644 --- a/tests/unitary/with_extras/model/test_runtime_info.py +++ b/tests/unitary/with_extras/model/test_runtime_info.py @@ -36,7 +36,19 @@ def test__validate_dict_fail(self): with pytest.raises(AssertionError): RuntimeInfo._validate_dict(self.runtime_dict_fail) - def test_from_yaml(self): + @patch("ads.model.runtime.env_info.get_service_packs") + def test_from_yaml(self, mock_get_service_packs): + conda_env="oci://service_conda_packs@ociodscdev/service_pack/cpu/General_Machine_Learning_for_CPUs/1.0/mlcpuv1" + python_version="3.6" + mock_get_service_packs.return_value = ( + { + conda_env : ("mlcpuv1", python_version), + }, + { + "mlcpuv1" : (conda_env, python_version), + + } + ) runtime_info = RuntimeInfo.from_yaml( uri=os.path.join(self.curr_dir, "runtime.yaml") ) @@ -95,7 +107,19 @@ def test_from_yaml(self): @patch.object(InferenceEnvInfo, "_validate", side_effect=DocumentError) @patch.object(TrainingEnvInfo, "_validate", side_effect=DocumentError) - def test_from_yaml_fail(self, mock_inference, mock_training): + @patch("ads.model.runtime.env_info.get_service_packs") + def test_from_yaml_fail(self, mock_get_service_packs, mock_inference, mock_training): + conda_env="oci://service_conda_packs@ociodscdev/service_pack/cpu/General_Machine_Learning_for_CPUs/1.0/mlcpuv1" + python_version="3.6" + mock_get_service_packs.return_value = ( + { + conda_env : ("mlcpuv1", python_version), + }, + { + "mlcpuv1" : (conda_env, python_version), + + } + ) with pytest.raises(DocumentError): RuntimeInfo.from_yaml(uri=os.path.join(self.curr_dir, "runtime_fail.yaml")) @@ -119,7 +143,19 @@ def test_from_yaml_wrong_format(self, mock_provenance, mock_deployment): with pytest.raises(FileNotFoundError): RuntimeInfo.from_yaml(uri=os.path.join(self.curr_dir, "fake.yaml")) - def test_from_and_to_yaml_file(self): + 
@patch("ads.model.runtime.env_info.get_service_packs") + def test_from_and_to_yaml_file(self, mock_get_service_packs): + conda_env="oci://service_conda_packs@ociodscdev/service_pack/cpu/General_Machine_Learning_for_CPUs/1.0/mlcpuv1" + python_version="3.6" + mock_get_service_packs.return_value = ( + { + conda_env : ("mlcpuv1", python_version), + }, + { + "mlcpuv1" : (conda_env, python_version), + + } + ) runtime = RuntimeInfo.from_yaml(uri=os.path.join(self.curr_dir, "runtime.yaml")) runtime.to_yaml(uri=os.path.join(self.curr_dir, "runtime_copy.yaml")) runtime_copy = RuntimeInfo.from_yaml( diff --git a/tests/unitary/with_extras/opctl/test_opctl_cmds.py b/tests/unitary/with_extras/opctl/test_opctl_cmds.py index cb5841b34..3c614594a 100644 --- a/tests/unitary/with_extras/opctl/test_opctl_cmds.py +++ b/tests/unitary/with_extras/opctl/test_opctl_cmds.py @@ -181,8 +181,9 @@ def test_cancel(self, job_cancel_func, pipeline_cancel_func, monkeypatch): monkeypatch.delenv("NB_SESSION_OCID", raising=False) cancel(ocid="...datasciencejobrun...") job_cancel_func.assert_called() - with pytest.raises(ValueError): - cancel(ocid="....datasciencejob....") + + cancel(ocid="....datasciencejob....") + job_cancel_func.assert_called() cancel(ocid="...datasciencepipelinerun...") pipeline_cancel_func.assert_called()