From 05078e49e7e7a16632abb7a0c6b3bb8acce91d02 Mon Sep 17 00:00:00 2001 From: qinguoyi <1532979219@qq.com> Date: Sun, 3 Nov 2024 16:42:37 +0800 Subject: [PATCH] Support ollama support ollama --- README.md | 2 +- chart/templates/backends/ollama.yaml | 36 +++++++++++++++++++ docs/examples/README.md | 7 +++- docs/examples/ollama/model.yaml | 8 +++++ docs/examples/ollama/playground.yaml | 10 ++++++ docs/support-backends.md | 4 +++ pkg/controller_helper/backendruntime.go | 26 +++++++++----- pkg/controller_helper/backendruntime_test.go | 9 +++++ .../model_source/modelsource.go | 3 +- pkg/controller_helper/model_source/uri.go | 20 +++++++---- pkg/webhook/openmodel_webhook.go | 3 +- test/config/backends/ollama.yaml | 29 +++++++++++++++ .../controller/inference/playground_test.go | 28 +++++++++++++++ 13 files changed, 166 insertions(+), 19 deletions(-) create mode 100644 chart/templates/backends/ollama.yaml create mode 100644 docs/examples/ollama/model.yaml create mode 100644 docs/examples/ollama/playground.yaml create mode 100644 test/config/backends/ollama.yaml diff --git a/README.md b/README.md index ffa42ce..e9f1da2 100644 --- a/README.md +++ b/README.md @@ -27,7 +27,7 @@ Easy, advanced inference platform for large language models on Kubernetes ## Features Overview - **Easy of Use**: People can quick deploy a LLM service with minimal configurations. -- **Broad Backend Support**: llmaz supports a wide range of advanced inference backends for different scenarios, like [vLLM](https://github.com/vllm-project/vllm), [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp). Find the full list of supported backends [here](./docs/support-backends.md). +- **Broad Backend Support**: llmaz supports a wide range of advanced inference backends for different scenarios, like [vLLM](https://github.com/vllm-project/vllm), [Text-Generation-Inference](https://github.com/huggingface/text-generation-inference), [SGLang](https://github.com/sgl-project/sglang), [llama.cpp](https://github.com/ggerganov/llama.cpp), [ollama](https://github.com/ollama/ollama). Find the full list of supported backends [here](./docs/support-backends.md). - **Scaling Efficiency (WIP)**: llmaz works smoothly with autoscaling components like [Cluster-Autoscaler](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler) or [Karpenter](https://github.com/kubernetes-sigs/karpenter) to support elastic scenarios. - **Accelerator Fungibility (WIP)**: llmaz supports serving the same LLM with various accelerators to optimize cost and performance. - **SOTA Inference**: llmaz supports the latest cutting-edge researches like [Speculative Decoding](https://arxiv.org/abs/2211.17192) or [Splitwise](https://arxiv.org/abs/2311.18677)(WIP) to run on Kubernetes. diff --git a/chart/templates/backends/ollama.yaml b/chart/templates/backends/ollama.yaml new file mode 100644 index 0000000..e931d61 --- /dev/null +++ b/chart/templates/backends/ollama.yaml @@ -0,0 +1,36 @@ +{{- if .Values.backendRuntime.install -}} +apiVersion: inference.llmaz.io/v1alpha1 +kind: BackendRuntime +metadata: + labels: + app.kubernetes.io/name: backendruntime + app.kubernetes.io/part-of: llmaz + app.kubernetes.io/created-by: llmaz + name: ollama +spec: + commands: + - sh + - -c + image: ollama/ollama + version: latest + # Do not edit the preset argument name unless you know what you're doing. + # Free to add more arguments with your requirements. 
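+  # The default flags below start the ollama server in the background, poll
+  # `ollama list` until the server is reachable, then pull and run the requested
+  # model with `ollama run`, and finally keep the container alive.
+  # OLLAMA_HOST below rebinds the server to 0.0.0.0:8080.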
+  args:
+    - name: default
+      flags:
+        - "ollama serve &
+          while true; do output=$(ollama list 2>&1);
+          if ! echo $output | grep -q 'could not connect to ollama app' && echo $output | grep -q 'NAME';then echo 'ollama is running';break; else echo 'Waiting for the ollama to be running...';sleep 1;fi;done;
+          ollama run {{`{{ .ModelName }}`}};
+          while true;do sleep 60;done"
+  envs:
+    - name: OLLAMA_HOST
+      value: 0.0.0.0:8080
+  resources:
+    requests:
+      cpu: 2
+      memory: 4Gi
+    limits:
+      cpu: 2
+      memory: 4Gi
+{{- end }}
\ No newline at end of file
diff --git a/docs/examples/README.md b/docs/examples/README.md
index e011cbe..104d4ab 100644
--- a/docs/examples/README.md
+++ b/docs/examples/README.md
@@ -10,6 +10,7 @@ We provide a set of examples to help you serve large language models, by default
 - [Deploy models via SGLang](#deploy-models-via-sglang)
 - [Deploy models via llama.cpp](#deploy-models-via-llamacpp)
 - [Deploy models via text-generation-inference](#deploy-models-via-tgi)
+- [Deploy models via ollama](#deploy-models-via-ollama)
 - [Speculative Decoding with vLLM](#speculative-decoding-with-vllm)
 
 ### Deploy models from Huggingface
@@ -32,7 +33,7 @@ In theory, if we want to load the `Qwen2-7B` model, which occupies about 14.2 GB
 
   - Alibaba Cloud OSS, see [example](./objstore-oss/) here
 
-    > Note: you should set OSS_ACCESS_KEY_ID and OSS_ACCESS_kEY_SECRET first by running `kubectl create secret generic oss-access-secret --from-literal=OSS_ACCESS_KEY_ID= --from-literal=OSS_ACCESS_kEY_SECRET=`
+    > Note: you should set OSS_ACCESS_KEY_ID and OSS_ACCESS_kEY_SECRET first by running `kubectl create secret generic oss-access-secret --from-literal=OSS_ACCESS_KEY_ID= --from-literal=OSS_ACCESS_kEY_SECRET=`
 
 ### Deploy models via SGLang
 
@@ -46,6 +47,10 @@ By default, we use [vLLM](https://github.com/vllm-project/vllm) as the inference
 
 [text-generation-inference](https://github.com/huggingface/text-generation-inference) is used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint. see [example](./tgi/) here.
 
+### Deploy models via ollama
+
+[ollama](https://github.com/ollama/ollama), built on top of llama.cpp, aims at local model deployment. See [example](./ollama/) here.
+
 ### Speculative Decoding with vLLM
 
 [Speculative Decoding](https://arxiv.org/abs/2211.17192) can improve inference performance efficiently, see [example](./speculative-decoding/vllm/) here.
diff --git a/docs/examples/ollama/model.yaml b/docs/examples/ollama/model.yaml
new file mode 100644
index 0000000..ab7cf3b
--- /dev/null
+++ b/docs/examples/ollama/model.yaml
@@ -0,0 +1,8 @@
+apiVersion: llmaz.io/v1alpha1
+kind: OpenModel
+metadata:
+  name: qwen2-0--5b
+spec:
+  familyName: qwen2
+  source:
+    uri: OLLAMA://qwen2:0.5b
\ No newline at end of file
diff --git a/docs/examples/ollama/playground.yaml b/docs/examples/ollama/playground.yaml
new file mode 100644
index 0000000..f91949a
--- /dev/null
+++ b/docs/examples/ollama/playground.yaml
@@ -0,0 +1,10 @@
+apiVersion: inference.llmaz.io/v1alpha1
+kind: Playground
+metadata:
+  name: qwen2-0--5b
+spec:
+  replicas: 1
+  modelClaim:
+    modelName: qwen2-0--5b
+  backendRuntimeConfig:
+    name: ollama
diff --git a/docs/support-backends.md b/docs/support-backends.md
index 2a1a27a..7a067f7 100644
--- a/docs/support-backends.md
+++ b/docs/support-backends.md
@@ -14,6 +14,10 @@ If you want to integrate more backends into llmaz, please refer to this [PR](htt
 
 [text-generation-inference](https://github.com/huggingface/text-generation-inference) is a Rust, Python and gRPC server for text generation inference. Used in production at Hugging Face to power Hugging Chat, the Inference API and Inference Endpoint.
 
+## ollama
+
+[ollama](https://github.com/ollama/ollama) runs Llama 3.2, Mistral, Gemma 2, and other large language models locally. It is built on top of llama.cpp and aims at local deployment.
+
 ## vLLM
 
 [vLLM](https://github.com/vllm-project/vllm) is a high-throughput and memory-efficient inference and serving engine for LLMs
diff --git a/pkg/controller_helper/backendruntime.go b/pkg/controller_helper/backendruntime.go
index ded7c24..c798108 100644
--- a/pkg/controller_helper/backendruntime.go
+++ b/pkg/controller_helper/backendruntime.go
@@ -19,6 +19,7 @@ package helper
 import (
 	"fmt"
 	"regexp"
+	"strings"
 
 	corev1 "k8s.io/api/core/v1"
 
@@ -94,18 +95,25 @@ func (p *BackendRuntimeParser) Resources() inferenceapi.ResourceRequirements {
 func renderFlags(flags []string, modelInfo map[string]string) ([]string, error) {
 	// Capture the word.
 	re := regexp.MustCompile(`\{\{\s*\.(\w+)\s*\}\}`)
+
 	res := []string{}
-	var value string
 
 	for _, flag := range flags {
-		value = flag
-		match := re.FindStringSubmatch(flag)
-		if len(match) > 1 {
-			// Return the matched word.
-			value = modelInfo[match[1]]
-
-			if value == "" {
-				return nil, fmt.Errorf("missing flag or the flag has format error: %s", flag)
+		value := flag
+		matches := re.FindAllStringSubmatch(flag, -1)
+
+		if len(matches) > 0 {
+			for _, match := range matches {
+				if len(match) > 1 {
+					// get key
+					key := match[1]
+					replacement, exists := modelInfo[key]
+					if !exists {
+						return nil, fmt.Errorf("missing flag or the flag has format error: %s", flag)
+					}
+					// replace
+					value = strings.Replace(value, match[0], replacement, -1)
+				}
 			}
 		}
 
diff --git a/pkg/controller_helper/backendruntime_test.go b/pkg/controller_helper/backendruntime_test.go
index ed311a6..7bd7a8d 100644
--- a/pkg/controller_helper/backendruntime_test.go
+++ b/pkg/controller_helper/backendruntime_test.go
@@ -30,6 +30,15 @@ func TestRenderFlags(t *testing.T) {
 		wantFlags []string
 		wantError bool
 	}{
+		{
+			name:  "normal parse long args",
+			flags: []string{"run {{ .ModelPath }};sleep 5", "--host", "0.0.0.0"},
+			modelInfo: map[string]string{
+				"ModelPath": "path/to/model",
+				"ModelName": "foo",
+			},
+			wantFlags: []string{"run path/to/model;sleep 5", "--host", "0.0.0.0"},
+		},
 		{
 			name:  "normal parse",
 			flags: []string{"-m", "{{ .ModelPath }}", "--served-model-name", "{{ .ModelName }}", "--host", "0.0.0.0"},
diff --git a/pkg/controller_helper/model_source/modelsource.go b/pkg/controller_helper/model_source/modelsource.go
index 281779d..18bd640 100644
--- a/pkg/controller_helper/model_source/modelsource.go
+++ b/pkg/controller_helper/model_source/modelsource.go
@@ -72,11 +72,12 @@ func NewModelSourceProvider(model *coreapi.OpenModel) ModelSourceProvider {
 	if model.Spec.Source.URI != nil {
 		// We'll validate the format in the webhook, so generally no error should happen here.
 		protocol, address, _ := util.ParseURI(string(*model.Spec.Source.URI))
-		provider := &URIProvider{modelName: model.Name, protocol: protocol}
+		provider := &URIProvider{modelName: model.Name, protocol: protocol, modelAddress: address}
 
 		switch protocol {
 		case OSS:
 			provider.endpoint, provider.bucket, provider.modelPath, _ = util.ParseOSS(address)
+		case OLLAMA:
 		default:
 			// This should be validated at webhooks.
panic("protocol not supported") diff --git a/pkg/controller_helper/model_source/uri.go b/pkg/controller_helper/model_source/uri.go index 092fd1a..fb8a049 100644 --- a/pkg/controller_helper/model_source/uri.go +++ b/pkg/controller_helper/model_source/uri.go @@ -26,18 +26,23 @@ import ( var _ ModelSourceProvider = &URIProvider{} const ( - OSS = "OSS" + OSS = "OSS" + OLLAMA = "OLLAMA" ) type URIProvider struct { - modelName string - protocol string - bucket string - endpoint string - modelPath string + modelName string + protocol string + bucket string + endpoint string + modelPath string + modelAddress string } func (p *URIProvider) ModelName() string { + if p.protocol == OLLAMA { + return p.modelAddress + } return p.modelName } @@ -58,6 +63,9 @@ func (p *URIProvider) ModelPath() string { } func (p *URIProvider) InjectModelLoader(template *corev1.PodTemplateSpec, index int) { + if p.protocol == OLLAMA { + return + } initContainerName := MODEL_LOADER_CONTAINER_NAME if index != 0 { initContainerName += "-" + strconv.Itoa(index) diff --git a/pkg/webhook/openmodel_webhook.go b/pkg/webhook/openmodel_webhook.go index 2d97c7c..931f8a4 100644 --- a/pkg/webhook/openmodel_webhook.go +++ b/pkg/webhook/openmodel_webhook.go @@ -47,7 +47,8 @@ func SetupOpenModelWebhook(mgr ctrl.Manager) error { var _ webhook.CustomDefaulter = &OpenModelWebhook{} var SUPPORTED_OBJ_STORES = map[string]struct{}{ - modelSource.OSS: {}, + modelSource.OSS: {}, + modelSource.OLLAMA: {}, } // Default implements webhook.Defaulter so a webhook will be registered for the type diff --git a/test/config/backends/ollama.yaml b/test/config/backends/ollama.yaml new file mode 100644 index 0000000..23ed462 --- /dev/null +++ b/test/config/backends/ollama.yaml @@ -0,0 +1,29 @@ +apiVersion: inference.llmaz.io/v1alpha1 +kind: BackendRuntime +metadata: + labels: + app.kubernetes.io/name: backendruntime + app.kubernetes.io/part-of: llmaz + app.kubernetes.io/created-by: llmaz + name: ollama +spec: + commands: + - sh + - -c + image: ollama/ollama + version: latest + args: + - name: default + flags: + - "ollama serve & + while true; do output=$(ollama list 2>&1); + if ! echo $output | grep -q 'could not connect to ollama app' && echo $output | grep -q 'NAME';then echo 'ollama is running';break; else echo 'Waiting for the ollama to be running...';sleep 1;fi;done; + ollama run {{`{{ .ModelName }}`}}; + while true;do sleep 60;done" + resources: + requests: + cpu: 2 + memory: 4Gi + limits: + cpu: 2 + memory: 4Gi \ No newline at end of file diff --git a/test/integration/controller/inference/playground_test.go b/test/integration/controller/inference/playground_test.go index 15a6d87..8f47e03 100644 --- a/test/integration/controller/inference/playground_test.go +++ b/test/integration/controller/inference/playground_test.go @@ -264,6 +264,34 @@ var _ = ginkgo.Describe("playground controller test", func() { }, }, }), + ginkgo.Entry("advance configured Playground with ollama", &testValidatingCase{ + makePlayground: func() *inferenceapi.Playground { + return wrapper.MakePlayground("playground", ns.Name).ModelClaim(model.Name).Label(coreapi.ModelNameLabelKey, model.Name). + BackendRuntime("ollama").BackendRuntimeVersion("main").BackendRuntimeArgs([]string{"--foo", "bar"}).BackendRuntimeEnv("FOO", "BAR"). + BackendRuntimeRequest("cpu", "1").BackendRuntimeLimit("cpu", "10"). 
+ Obj() + }, + updates: []*update{ + { + updateFunc: func(playground *inferenceapi.Playground) { + gomega.Expect(k8sClient.Create(ctx, playground)).To(gomega.Succeed()) + }, + checkFunc: func(ctx context.Context, k8sClient client.Client, playground *inferenceapi.Playground) { + validation.ValidatePlayground(ctx, k8sClient, playground) + validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundProgressing, "Pending", metav1.ConditionTrue) + }, + }, + { + updateFunc: func(playground *inferenceapi.Playground) { + util.UpdateLwsToReady(ctx, k8sClient, playground.Name, playground.Namespace) + }, + checkFunc: func(ctx context.Context, k8sClient client.Client, playground *inferenceapi.Playground) { + validation.ValidatePlayground(ctx, k8sClient, playground) + validation.ValidatePlaygroundStatusEqualTo(ctx, k8sClient, playground, inferenceapi.PlaygroundAvailable, "PlaygroundReady", metav1.ConditionTrue) + }, + }, + }, + }), ginkgo.Entry("playground is created when service exists with the same name", &testValidatingCase{ makePlayground: func() *inferenceapi.Playground { return util.MockASamplePlayground(ns.Name)