feat: Update Triton model support (#485)
#### Motivation
Triton introduced [support for more model
frameworks](https://developer.nvidia.com/blog/real-time-serving-for-xgboost-scikit-learn-randomforest-lightgbm-and-more/)
through its FIL backend and can now serve XGBoost, LightGBM, scikit-learn tree models, and more. This PR
adds examples and docs to advertise that support.

#### Modifications
- Add the newly supported model formats to the Triton runtime config, setting
`autoSelect: false` so the runtime must be requested explicitly via the ISVC `runtime` field.
- Add an example ISVC config for a Triton-served XGBoost model.
- Update the example-models doc to reflect the example models added in
kserve/modelmesh-minio-examples#7.
- Update the model-formats README to reflect framework support, and the
framework-specific docs to show example ISVCs using Triton.
- Add FVTs for LightGBM and XGBoost deployment on the Triton runtime.

#### Result
Closes #185

---------

Signed-off-by: Rafael Vasquez <raf.vasquez@ibm.com>
Signed-off-by: Rafael Vasquez <rafvasq21@gmail.com>
rafvasq authored Mar 13, 2024
1 parent 1682254 commit 3e755a9
Showing 11 changed files with 354 additions and 13 deletions.
28 changes: 28 additions & 0 deletions config/example-isvcs/example-triton-xgboost-isvc.yaml
@@ -0,0 +1,28 @@
# Copyright 2022 IBM Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: example-xgboost-mushroom-fil
  annotations:
    serving.kserve.io/deploymentMode: ModelMesh
spec:
  predictor:
    model:
      modelFormat:
        name: xgboost
      runtime: triton-2.x
      storage:
        key: localMinIO
        path: xgboost/mushroom-fil
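
To try the example out, this ISVC can be applied to a ModelMesh-enabled namespace once the Triton runtime and the `localMinIO` storage secret are in place; a usage sketch (the `modelmesh-serving` namespace name is an assumption):

```shell
# Deploy the example ISVC (assumes a ModelMesh-enabled namespace named "modelmesh-serving")
kubectl apply -n modelmesh-serving -f config/example-isvcs/example-triton-xgboost-isvc.yaml

# Watch the predictor until it reports Ready
kubectl get isvc example-xgboost-mushroom-fil -n modelmesh-serving -w
```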
9 changes: 9 additions & 0 deletions config/runtimes/triton-2.x.yaml
@@ -39,6 +39,15 @@ spec:
    - name: tensorrt
      version: "7" # 7.2.1
      autoSelect: true
    - name: sklearn
      version: "0" # v0.23.1
      autoSelect: false
    - name: xgboost
      version: "1" # v1.1.1
      autoSelect: false
    - name: lightgbm
      version: "3" # v3.2.1
      autoSelect: false

  protocolVersions:
    - grpc-v2
39 changes: 39 additions & 0 deletions docs/example-models.md
@@ -28,6 +28,10 @@ s3://modelmesh-example-models/
│ └── mnist.h5
├── lightgbm
│   ├── mushroom.bst
│   └── mushroom-fil
│       ├── 1
│       │   └── model.txt
│       └── config.pbtxt
├── onnx
│ └── mnist.onnx
├── pytorch
@@ -45,6 +49,10 @@ s3://modelmesh-example-models/
│ └── variables.index
└── xgboost
    ├── mushroom.json
    └── mushroom-fil
        ├── 1
        │   └── xgboost.json
        └── config.pbtxt
```
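
Each `mushroom-fil` directory follows Triton's model repository layout: a numbered version directory holding the serialized model, plus a `config.pbtxt` that tells the [FIL backend](https://github.com/triton-inference-server/fil_backend) how to load it. A minimal sketch of such a `config.pbtxt` for the XGBoost case (illustrative values; the exact file in the example bucket may differ):

```
backend: "fil"
max_batch_size: 32768
input [
  {
    name: "input__0"
    data_type: TYPE_FP32
    dims: [ 126 ]  # one-hot encoded mushroom features
  }
]
output [
  {
    name: "output__0"
    data_type: TYPE_FP32
    dims: [ 1 ]
  }
]
instance_group [{ kind: KIND_CPU }]
parameters [
  {
    key: "model_type"
    value: { string_value: "xgboost_json" }  # "lightgbm" for the LightGBM variant
  }
]
```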

### Example Inference Requests
@@ -277,3 +285,34 @@ Response:
]
}
```

#### XGBoost (Triton FIL):

This is a sample inference request to an XGBoost model trained on a [mushroom dataset](https://archive.ics.uci.edu/ml/datasets/Mushroom) and served using the [FIL backend for Triton](https://github.com/triton-inference-server/fil_backend):

```shell
MODEL_NAME=example-xgboost-mushroom-fil
grpcurl \
  -plaintext \
  -proto fvt/proto/kfs_inference_v2.proto \
  -d '{ "model_name": "'"${MODEL_NAME}"'", "inputs": [{ "name": "input__0", "shape": [1, 126], "datatype": "FP32", "contents": { "fp32_contents": [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0] }}]}' \
  localhost:8033 \
  inference.GRPCInferenceService.ModelInfer
```

Response:

```json
{
"modelName": "example-xgboost-mushroom-fil__isvc-ffe6a3f20b",
"modelVersion": "1",
"outputs": [
{
"name": "output__0",
"datatype": "FP32",
"shape": ["1"]
}
],
"rawOutputContents": ["B1xLPA=="]
}
```
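
The FIL backend returns the output tensor base64-encoded in `rawOutputContents` rather than as typed contents, so decoding the single FP32 value takes one extra step; a quick sketch with standard tooling (GNU coreutils assumed):

```shell
# decode rawOutputContents[0] from the response above into its FP32 value
echo 'B1xLPA==' | base64 --decode | od -An -t f4
# -> ~0.0124, the model's score for this sample (close to class 0)
```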
22 changes: 11 additions & 11 deletions docs/model-formats/README.md
@@ -11,16 +11,16 @@ By leveraging existing third-party model servers, we support a number of standard
- [TensorFlow](tensorflow.md)
- [XGBoost](xgboost.md)

| Model Type | Framework | Supported via ServingRuntime |
| ----------- | ---------------- | ------------------------------- |
| keras | TensorFlow | Triton (C++) |
| lightgbm | LightGBM | MLServer (python), Triton (C++) |
| onnx | ONNX | Triton (C++), OVMS (C++) |
| openvino_ir | Intel OpenVINO\* | OVMS (C++) |
| pytorch | PyTorch | Triton (C++) |
| sklearn | scikit-learn | MLServer (python), Triton (C++) |
| tensorflow | TensorFlow | Triton (C++) |
| xgboost | XGBoost | MLServer (python), Triton (C++) |
| any | Custom | [Custom](../runtimes) (any) |

(\*) Models from many ML frameworks, such as Caffe, TensorFlow, MXNet, PaddlePaddle, and ONNX, can be converted to the OpenVINO IR format; see the [OpenVINO Model Server documentation](https://docs.openvino.ai/latest/ovms_what_is_openvino_model_server.html).
31 changes: 30 additions & 1 deletion docs/model-formats/lightgbm.md
@@ -32,11 +32,18 @@ The storage path can point directly to a serialized model

```
s3://modelmesh-example-models/
└── lightgbm
    ├── mushroom.bst
    └── mushroom-fil
        ├── 1
        │   └── model.txt
        └── config.pbtxt
```

**InferenceService**

For MLServer:

```yaml
kind: InferenceService
metadata:
@@ -54,3 +61,25 @@ spec:
        parameters:
          bucket: modelmesh-example-models
```

For Triton:

```yaml
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: lightgbm-example
  annotations:
    serving.kserve.io/deploymentMode: ModelMesh
spec:
  predictor:
    model:
      modelFormat:
        name: lightgbm
      runtime: triton-2.x
      storage:
        key: localMinIO
        path: lightgbm/mushroom-fil
        parameters:
          bucket: modelmesh-example-models
```
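
Once loaded, the model can be queried over the KServe v2 gRPC API just like the XGBoost FIL example in `docs/example-models.md`. A sketch, assuming the ISVC name above and a port-forward to the ModelMesh serving endpoint on localhost:8033 (the input is the same 126-element one-hot mushroom sample the FVTs use):

```shell
MODEL_NAME=lightgbm-example
grpcurl \
  -plaintext \
  -proto fvt/proto/kfs_inference_v2.proto \
  -d '{ "model_name": "'"${MODEL_NAME}"'", "inputs": [{ "name": "input__0", "shape": [1, 126], "datatype": "FP32", "contents": { "fp32_contents": [1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0] }}]}' \
  localhost:8033 \
  inference.GRPCInferenceService.ModelInfer
```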
31 changes: 30 additions & 1 deletion docs/model-formats/xgboost.md
@@ -33,11 +33,18 @@ The storage path can point directly to a serialized model

```
s3://modelmesh-example-models/
└── xgboost
    ├── mushroom.json
    └── mushroom-fil
        ├── 1
        │   └── xgboost.json
        └── config.pbtxt
```

**InferenceService**

For MLServer:

```yaml
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
@@ -56,3 +63,25 @@ spec:
        parameters:
          bucket: modelmesh-example-models
```

For Triton:

```yaml
apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: xgboost-example
  annotations:
    serving.kserve.io/deploymentMode: ModelMesh
spec:
  predictor:
    model:
      modelFormat:
        name: xgboost
      runtime: triton-2.x
      storage:
        key: localMinIO
        path: xgboost/mushroom-fil
        parameters:
          bucket: modelmesh-example-models
```
40 changes: 40 additions & 0 deletions fvt/inference.go
@@ -302,6 +302,26 @@ func ExpectSuccessfulInference_lightgbmMushroom(predictorName string) {
	Expect(math.Round(float64(inferResponse.Outputs[0].Contents.Fp64Contents[0])*10) / 10).To(BeEquivalentTo(0.0))
}

// LightGBM Mushroom via Triton
// COS path: fvt/lightgbm/mushroom-fil
func ExpectSuccessfulInference_lightgbmFILMushroom(predictorName string) {
	// build the grpc inference call
	inferInput := &inference.ModelInferRequest_InferInputTensor{
		Name:     "input__0",
		Shape:    []int64{1, 126},
		Datatype: "FP32",
		Contents: &inference.InferTensorContents{Fp32Contents: mushroomInputData},
	}
	inferRequest := &inference.ModelInferRequest{
		ModelName: predictorName,
		Inputs:    []*inference.ModelInferRequest_InferInputTensor{inferInput},
	}

	inferResponse, err := FVTClientInstance.RunKfsInference(inferRequest)
	Expect(err).ToNot(HaveOccurred())
	Expect(inferResponse).ToNot(BeNil())
}

// XGBoost Mushroom
// COS path: fvt/xgboost/mushroom
func ExpectSuccessfulInference_xgboostMushroom(predictorName string) {
@@ -324,6 +344,26 @@ func ExpectSuccessfulInference_xgboostMushroom(predictorName string) {
	Expect(math.Round(float64(inferResponse.Outputs[0].Contents.Fp32Contents[0])*10) / 10).To(BeEquivalentTo(0.0))
}

// XGBoost Mushroom via Triton
// COS path: fvt/xgboost/mushroom-fil
func ExpectSuccessfulInference_xgboostFILMushroom(predictorName string) {
	// build the grpc inference call
	inferInput := &inference.ModelInferRequest_InferInputTensor{
		Name:     "input__0",
		Shape:    []int64{1, 126},
		Datatype: "FP32",
		Contents: &inference.InferTensorContents{Fp32Contents: mushroomInputData},
	}
	inferRequest := &inference.ModelInferRequest{
		ModelName: predictorName,
		Inputs:    []*inference.ModelInferRequest_InferInputTensor{inferInput},
	}

	inferResponse, err := FVTClientInstance.RunKfsInference(inferRequest)
	Expect(err).ToNot(HaveOccurred())
	Expect(inferResponse).ToNot(BeNil())
}

// Helpers

var mushroomInputData []float32 = []float32{1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0}