Provide 24.10 version bumps (#406)
* Add support for XGBoost UBJSON
* Bump package versions in Conda envs
* Replace `convert_sklearn` script to use latest Treelite API
* Update `generate_example_model.py` to use latest XGBoost API
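
For context, the XGBoost API update replaces the deprecated `gpu_hist` tree method and `gpu_predictor` predictor with `tree_method="hist"` plus an explicit `device` argument. A minimal sketch of the new-style parameters, mirroring the pattern used in `generate_example_model.py` (illustrative only, not copied verbatim from the scripts):

```python
# Sketch of the XGBoost >= 2.0 parameter style adopted here:
# tree_method="gpu_hist" / predictor="gpu_predictor" are deprecated in favor of
# tree_method="hist" with an explicit device="cuda".
import xgboost as xgb

model = xgb.XGBClassifier(
    objective="binary:logistic",
    tree_method="hist",  # was tree_method="gpu_hist"
    device="cuda",       # was predictor="gpu_predictor"
    max_depth=25,
    n_estimators=100,
)
```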
hcho3 authored Oct 9, 2024
1 parent 25e6845 commit 87e48ba
Showing 18 changed files with 191 additions and 92 deletions.
2 changes: 2 additions & 0 deletions conda/environments/rapids_triton_dev.yml
@@ -6,4 +6,6 @@ dependencies:
- ccache
- cmake>=3.26.4,!=3.30.0
- ninja
# TODO(hcho3): Remove the pin when
# https://github.com/triton-inference-server/common/pull/114 is merged
- rapidjson>=1.1.0,<1.1.0.post*
10 changes: 5 additions & 5 deletions conda/environments/triton_benchmark.yml
@@ -5,14 +5,14 @@ channels:
- rapidsai
dependencies:
- cuda-version=11.8
- cudf=23.12
- libcusolver<=11.4.1.48
- libcusparse<=12.0
- cudf=24.08
- libcusolver
- libcusparse
- matplotlib
- pip
- python
- scipy
- pip:
- tritonclient[all]
- protobuf==3.20.1
- git+https://github.com/rapidsai/rapids-triton.git@branch-23.12#subdirectory=python
- protobuf
- git+https://github.com/rapidsai/rapids-triton.git@branch-24.04#subdirectory=python
16 changes: 8 additions & 8 deletions conda/environments/triton_test.yml
@@ -7,19 +7,19 @@ dependencies:
- aws-sdk-cpp
- clang-tools=11.1.0
- cuda-version=11.8
- cudf=23.12
- cuml=23.12
- cudf=24.08
- cuml=24.08
- flake8
- hypothesis<6.46.8
- hypothesis
- lightgbm
- matplotlib
- pip
- pytest
- python
- rapidsai::xgboost>=1.7
- scikit-learn=1.2.0
- treelite
- rapidsai::xgboost>=2.1
- scikit-learn>=1.5
- treelite>=4.3
- pip:
- tritonclient[all]
- protobuf==3.20.1
- git+https://github.com/rapidsai/rapids-triton.git@branch-23.12#subdirectory=python
- protobuf
- git+https://github.com/rapidsai/rapids-triton.git@branch-24.04#subdirectory=python
12 changes: 6 additions & 6 deletions conda/environments/triton_test_no_client.yml
@@ -7,15 +7,15 @@ dependencies:
- aws-sdk-cpp
- clang-tools=11.1.0
- cuda-version=11.8
- cudf=23.12
- cuml=23.12
- cudf=24.08
- cuml=24.08
- flake8
- hypothesis<6.46.8
- hypothesis
- lightgbm
- pip
- pytest
- python
- python-rapidjson
- rapidsai::xgboost>=1.7
- scikit-learn=1.2.0
- treelite
- rapidsai::xgboost>=2.1
- scikit-learn>=1.5
- treelite>=4.3
13 changes: 7 additions & 6 deletions docs/model_config.md
@@ -70,7 +70,7 @@ instance_group [{ kind: KIND_AUTO }]
parameters [
{
key: "model_type"
value: { string_value: "xgboost_json" }
value: { string_value: "xgboost_ubj" }
},
{
key: "output_class"
@@ -185,23 +185,24 @@ Treelite's checkpoint format. For more information, see [Model
Support](model_support.md).

The `model_type` option is used to indicate which of these serialization
formats your model uses: `xgboost` for XGBoost binary, `xgboost_json` for
XGBoost JSON, `lightgbm` for LightGBM, or `treelite_checkpoint` for
Treelite:
formats your model uses: `xgboost_ubj` for XGBoost UBJSON, `xgboost_json` for
XGBoost JSON, `xgboost` for XGBoost binary (legacy), `lightgbm` for LightGBM,
or `treelite_checkpoint` for Treelite:

```
parameters [
{
key: "model_type"
value: { string_value: "xgboost_json" }
value: { string_value: "xgboost_ubj" }
}
]
```
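
For reference, producing a file that matches the new `xgboost_ubj` model type is just a matter of the extension passed to `save_model`; a minimal sketch (assuming XGBoost >= 2.1, with illustrative data and estimator names):

```python
# Illustrative sketch: XGBoost picks the serialization format from the file
# extension, so ".ubj" yields UBJSON and ".json" yields JSON.
import numpy as np
import xgboost as xgb

X = np.random.rand(100, 4)            # tiny synthetic problem so the snippet runs
y = (X[:, 0] > 0.5).astype(np.int32)

clf = xgb.XGBClassifier(tree_method="hist", n_estimators=10)
clf.fit(X, y)

clf.save_model("xgboost.ubj")   # UBJSON -> model_type "xgboost_ubj"
clf.save_model("xgboost.json")  # JSON   -> model_type "xgboost_json"
```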

#### Model Filenames
For each model type, Triton expects a particular default filename:
- `xgboost.model` for XGBoost Binary
- `xgboost.ubj` for XGBoost UBJSON
- `xgboost.json` for XGBoost JSON
- `xgboost.model` for XGBoost Binary (Legacy)
- `model.txt` for LightGBM
- `checkpoint.tl` for Treelite
It is recommended that you use these filenames, but custom filenames can be
4 changes: 2 additions & 2 deletions docs/sklearn_and_cuml.md
@@ -48,7 +48,7 @@ model framework in Triton simply by exporting to the binary checkpoint format.
The FIL backend repo includes scripts for easy conversion from
pickle-serialized cuML or Scikit-Learn models to Treelite checkpoints. You can
download the relevant script for Scikit-Learn
[here](https://raw.githubusercontent.com/triton-inference-server/fil_backend/main/scripts/convert_sklearn)
[here](https://raw.githubusercontent.com/triton-inference-server/fil_backend/main/scripts/convert_sklearn.py)
and for cuML
[here](https://raw.githubusercontent.com/triton-inference-server/fil_backend/main/scripts/convert_cuml.py).
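
The Scikit-Learn script essentially imports the pickled model with Treelite's scikit-learn importer and serializes it as a checkpoint. A rough sketch of that conversion (assuming treelite>=4.0; the shipped script may handle paths, output names, and validation differently):

```python
# Rough sketch of the Scikit-Learn conversion path (assumption: treelite>=4.0;
# convert_sklearn.py's actual internals may differ).
import pickle
import treelite

with open("model_repository/fil/1/model.pkl", "rb") as f:
    skl_model = pickle.load(f)

tl_model = treelite.sklearn.import_model(skl_model)          # scikit-learn -> Treelite
tl_model.serialize("model_repository/fil/1/checkpoint.tl")   # write Treelite checkpoint
```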

@@ -89,7 +89,7 @@ model_repository/

Then perform the conversion by running either:
```bash
./convert_sklearn model_repository/fil/1/model.pkl
./convert_sklearn.py model_repository/fil/1/model.pkl
```
for Scikit-Learn models or
```bash
2 changes: 1 addition & 1 deletion notebooks/faq/FAQs.ipynb
@@ -593,7 +593,7 @@
" archival_path = os.path.join(VERSIONED_DIR, 'model.pkl')\n",
" shutil.copy(MODEL_PATH, archival_path)\n",
" \n",
" !../../scripts/convert_sklearn {archival_path}"
" !../../scripts/convert_sklearn.py {archival_path}"
]
},
{
4 changes: 2 additions & 2 deletions ops/Dockerfile
@@ -3,7 +3,7 @@
# Arguments for controlling build details
###########################################################################################
# Version of Triton to use
ARG TRITON_VERSION=24.08
ARG TRITON_VERSION=24.09
# Base container image
ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3
# Whether or not to enable GPU build
@@ -56,7 +56,7 @@ RUN conda run --no-capture-output -n triton_test \

FROM wheel-install-${USE_CLIENT_WHEEL} as conda-test
RUN conda run --no-capture-output -n triton_test \
pip install git+https://github.com/rapidsai/rapids-triton.git@branch-21.12#subdirectory=python
pip install git+https://github.com/rapidsai/rapids-triton.git@branch-24.04#subdirectory=python
RUN conda-pack --ignore-missing-files -n triton_test -o /tmp/env.tar \
&& mkdir /conda/test/ \
&& cd /conda/test/ \
28 changes: 19 additions & 9 deletions qa/L0_e2e/generate_example_model.py
@@ -72,11 +72,10 @@ def train_xgboost_classifier(data, labels, depth=25, trees=100):
training_params = {
"eval_metric": "error",
"objective": "binary:logistic",
"tree_method": "gpu_hist",
"tree_method": "hist",
"device": "cuda",
"max_depth": depth,
"n_estimators": trees,
"use_label_encoder": False,
"predictor": "gpu_predictor",
}
model = xgb.XGBClassifier(**training_params)

@@ -192,10 +191,10 @@ def train_xgboost_regressor(data, targets, depth=25, trees=100):

training_params = {
"objective": "reg:squarederror",
"tree_method": "gpu_hist",
"tree_method": "hist",
"device": "cuda",
"max_depth": depth,
"n_estimators": trees,
"predictor": "gpu_predictor",
}
model = xgb.XGBRegressor(**training_params)

@@ -307,13 +306,19 @@ def generate_model(

def serialize_model(model, directory, output_format="xgboost"):
if output_format == "xgboost":
model_path = os.path.join(directory, "xgboost.model")
model_path = os.path.join(directory, "xgboost.deprecated")
model.save_model(model_path)
return model_path
new_model_path = os.path.join(directory, "xgboost.model")
os.rename(model_path, new_model_path)
return new_model_path
if output_format == "xgboost_json":
model_path = os.path.join(directory, "xgboost.json")
model.save_model(model_path)
return model_path
if output_format == "xgboost_ubj":
model_path = os.path.join(directory, "xgboost.ubj")
model.save_model(model_path)
return model_path
if output_format == "lightgbm":
model_path = os.path.join(directory, "model.txt")
model.save_model(model_path)
@@ -462,6 +467,8 @@ def build_model(

if output_format is None:
if model_type == "xgboost":
# TODO(hcho3): Update to "xgboost_ubj" when XGBoost removes support
# for legacy binary format
output_format = "xgboost"
elif model_type == "lightgbm":
output_format = "lightgbm"
@@ -471,7 +478,10 @@
raise RuntimeError('Unknown model type "{}"'.format(model_type))

if (
(model_type == "xgboost" and output_format not in {"xgboost", "xgboost_json"})
(
model_type == "xgboost"
and output_format not in {"xgboost", "xgboost_json", "xgboost_ubj"}
)
or (model_type == "lightgbm" and output_format not in {"lightgbm"})
or (model_type == "sklearn" and output_format not in {"pickle"})
or (model_type == "cuml" and output_format not in {"pickle"})
@@ -545,7 +555,7 @@ def parse_args():
)
parser.add_argument(
"--format",
choices=("xgboost", "xgboost_json", "lightgbm", "pickle"),
choices=("xgboost", "xgboost_json", "xgboost_ubj", "lightgbm", "pickle"),
default=None,
help="serialization format for model",
)
69 changes: 51 additions & 18 deletions qa/L0_e2e/test_model.py
@@ -36,6 +36,7 @@
"xgboost",
"xgboost_shap",
"xgboost_json",
"xgboost_ubj",
"lightgbm",
"lightgbm_rf",
"regression",
@@ -66,6 +67,17 @@ def valid_shm_modes():
return tuple(modes)


# TODO(hcho3): Remove once we fix the flakiness of CUDA shared mem
# See https://github.com/triton-inference-server/server/issues/7688
def shared_mem_parametrize():
params = [None]
if "cuda" in valid_shm_modes():
params.append(
pytest.param("cuda", marks=pytest.mark.xfail(reason="shared mem is flaky")),
)
return params


@pytest.fixture(scope="session")
def client():
"""A RAPIDS-Triton client for submitting inference requests"""
@@ -98,27 +110,46 @@ class GTILModel:
"""A compatibility wrapper for executing models with GTIL"""

def __init__(self, model_path, model_format, output_class):
if model_format == "treelite_checkpoint":
if model_format == "xgboost":
self.tl_model = treelite.frontend.load_xgboost_model_legacy_binary(
model_path
)
elif model_format == "xgboost_json":
self.tl_model = treelite.frontend.load_xgboost_model(
model_path, format_choice="json"
)
elif model_format == "xgboost_ubj":
self.tl_model = treelite.frontend.load_xgboost_model(
model_path, format_choice="ubjson"
)
elif model_format == "lightgbm":
self.tl_model = treelite.frontend.load_lightgbm_model(model_path)
elif model_format == "treelite_checkpoint":
self.tl_model = treelite.Model.deserialize(model_path)
else:
self.tl_model = treelite.Model.load(model_path, model_format)
self.output_class = output_class

def _predict(self, arr):
return treelite.gtil.predict(self.tl_model, arr)
result = treelite.gtil.predict(self.tl_model, arr)
# GTIL always returns prediction result with dimensions
# (num_row, num_target, num_class)
assert len(result.shape) == 3
# We don't test multi-target models
# TODO(hcho3): Add coverage for multi-target models
assert result.shape[1] == 1
return result[:, 0, :]

def predict_proba(self, arr):
result = self._predict(arr)
if len(result.shape) > 1:
if result.shape[1] > 1:
return result
else:
return np.transpose(np.vstack((1 - result, result)))
return np.hstack((1 - result, result))

def predict(self, arr):
if self.output_class:
return np.argmax(self.predict_proba(arr), axis=1)
else:
return self._predict(arr)
return self._predict(arr).squeeze()


class GroundTruthModel:
Expand All @@ -144,6 +175,8 @@ def __init__(
model_path = os.path.join(model_dir, "xgboost.model")
elif model_format == "xgboost_json":
model_path = os.path.join(model_dir, "xgboost.json")
elif model_format == "xgboost_ubj":
model_path = os.path.join(model_dir, "xgboost.ubj")
elif model_format == "lightgbm":
model_path = os.path.join(model_dir, "model.txt")
elif model_format == "treelite_checkpoint":
@@ -220,12 +253,13 @@ def model_data(request, client, model_repo):
)


@pytest.mark.parametrize("shared_mem", shared_mem_parametrize())
@given(hypothesis_data=st.data())
@settings(
deadline=None,
suppress_health_check=(HealthCheck.too_slow, HealthCheck.filter_too_much),
)
def test_small(client, model_data, hypothesis_data):
def test_small(shared_mem, client, model_data, hypothesis_data):
"""Test Triton-served model on many small Hypothesis-generated examples"""
all_model_inputs = defaultdict(list)
total_output_sizes = {}
@@ -251,9 +285,6 @@ def test_small(client, model_data, hypothesis_data):
model_output_sizes = {
name: size for name, size in model_data.output_sizes.items()
}
shared_mem = hypothesis_data.draw(
st.one_of(st.just(mode) for mode in valid_shm_modes())
)
result = client.predict(
model_data.name,
model_inputs,
@@ -298,11 +329,11 @@
)

# Test entire batch of Hypothesis-generated inputs at once
shared_mem = hypothesis_data.draw(
st.one_of(st.just(mode) for mode in valid_shm_modes())
)
all_triton_outputs = client.predict(
model_data.name, all_model_inputs, total_output_sizes, shared_mem=shared_mem
model_data.name,
all_model_inputs,
total_output_sizes,
shared_mem=shared_mem,
)

for output_name in sorted(ground_truth.keys()):
@@ -324,7 +355,7 @@
)


@pytest.mark.parametrize("shared_mem", valid_shm_modes())
@pytest.mark.parametrize("shared_mem", shared_mem_parametrize())
def test_max_batch(client, model_data, shared_mem):
"""Test processing of a single maximum-sized batch"""
max_inputs = {
@@ -335,9 +366,11 @@
name: size * model_data.max_batch_size
for name, size in model_data.output_sizes.items()
}
shared_mem = valid_shm_modes()[0]
result = client.predict(
model_data.name, max_inputs, model_output_sizes, shared_mem=shared_mem
model_data.name,
max_inputs,
model_output_sizes,
shared_mem=shared_mem,
)

ground_truth = model_data.ground_truth_model.predict(max_inputs)