Add chatqna wrapper for multiple model selection (#1144)
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
Co-authored-by: Ying Hu <ying.hu@intel.com>
Co-authored-by: chen, suyue <suyue.chen@intel.com>
3 people authored Nov 18, 2024
1 parent b1bb6db commit fb514bb
Showing 6 changed files with 262 additions and 53 deletions.
32 changes: 32 additions & 0 deletions ChatQnA/Dockerfile.wrapper
@@ -0,0 +1,32 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
libgl1-mesa-glx \
libjemalloc-dev \
git

RUN useradd -m -s /bin/bash user && \
mkdir -p /home/user && \
chown -R user /home/user/

WORKDIR /home/user/
RUN git clone https://github.com/opea-project/GenAIComps.git

WORKDIR /home/user/GenAIComps
RUN pip install --no-cache-dir --upgrade pip && \
pip install --no-cache-dir -r /home/user/GenAIComps/requirements.txt

COPY ./chatqna_wrapper.py /home/user/chatqna.py

ENV PYTHONPATH=$PYTHONPATH:/home/user/GenAIComps

USER user

WORKDIR /home/user

RUN echo 'ulimit -S -n 999999' >> ~/.bashrc

ENTRYPOINT ["python", "chatqna.py"]
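
One caveat worth noting: the ulimit line is appended to ~/.bashrc, which only interactive bash shells read; the "python chatqna.py" ENTRYPOINT never sources it. If the higher open-file limit matters for the gateway process itself, a minimal alternative (an assumption on my part, not something this commit does) is to raise the soft limit from inside Python at startup:

# Illustrative alternative to the .bashrc ulimit line, which does not
# apply to the ENTRYPOINT process. Raises this process's soft open-file
# limit as close to 999999 as the hard limit allows.
import resource

soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
target = 999999 if hard == resource.RLIM_INFINITY else min(999999, hard)
resource.setrlimit(resource.RLIMIT_NOFILE, (target, hard))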
68 changes: 68 additions & 0 deletions ChatQnA/chatqna_wrapper.py
@@ -0,0 +1,68 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

from comps import ChatQnAGateway, MicroService, ServiceOrchestrator, ServiceType

MEGA_SERVICE_HOST_IP = os.getenv("MEGA_SERVICE_HOST_IP", "0.0.0.0")
MEGA_SERVICE_PORT = int(os.getenv("MEGA_SERVICE_PORT", 8888))
EMBEDDING_SERVICE_HOST_IP = os.getenv("EMBEDDING_SERVICE_HOST_IP", "0.0.0.0")
EMBEDDING_SERVICE_PORT = int(os.getenv("EMBEDDING_SERVICE_PORT", 6000))
RETRIEVER_SERVICE_HOST_IP = os.getenv("RETRIEVER_SERVICE_HOST_IP", "0.0.0.0")
RETRIEVER_SERVICE_PORT = int(os.getenv("RETRIEVER_SERVICE_PORT", 7000))
RERANK_SERVICE_HOST_IP = os.getenv("RERANK_SERVICE_HOST_IP", "0.0.0.0")
RERANK_SERVICE_PORT = int(os.getenv("RERANK_SERVICE_PORT", 8000))
LLM_SERVICE_HOST_IP = os.getenv("LLM_SERVICE_HOST_IP", "0.0.0.0")
LLM_SERVICE_PORT = int(os.getenv("LLM_SERVICE_PORT", 9000))


class ChatQnAService:
def __init__(self, host="0.0.0.0", port=8000):
self.host = host
self.port = port
self.megaservice = ServiceOrchestrator()

def add_remote_service(self):
embedding = MicroService(
name="embedding",
host=EMBEDDING_SERVICE_HOST_IP,
port=EMBEDDING_SERVICE_PORT,
endpoint="/v1/embeddings",
use_remote_service=True,
service_type=ServiceType.EMBEDDING,
)
retriever = MicroService(
name="retriever",
host=RETRIEVER_SERVICE_HOST_IP,
port=RETRIEVER_SERVICE_PORT,
endpoint="/v1/retrieval",
use_remote_service=True,
service_type=ServiceType.RETRIEVER,
)
rerank = MicroService(
name="rerank",
host=RERANK_SERVICE_HOST_IP,
port=RERANK_SERVICE_PORT,
endpoint="/v1/reranking",
use_remote_service=True,
service_type=ServiceType.RERANK,
)
llm = MicroService(
name="llm",
host=LLM_SERVICE_HOST_IP,
port=LLM_SERVICE_PORT,
endpoint="/v1/chat/completions",
use_remote_service=True,
service_type=ServiceType.LLM,
)
self.megaservice.add(embedding).add(retriever).add(rerank).add(llm)
self.megaservice.flow_to(embedding, retriever)
self.megaservice.flow_to(retriever, rerank)
self.megaservice.flow_to(rerank, llm)
self.gateway = ChatQnAGateway(megaservice=self.megaservice, host="0.0.0.0", port=self.port)


if __name__ == "__main__":
chatqna = ChatQnAService(host=MEGA_SERVICE_HOST_IP, port=MEGA_SERVICE_PORT)
chatqna.add_remote_service()
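
For readers new to GenAIComps: the add()/flow_to() calls above declare a linear pipeline (embedding -> retriever -> rerank -> llm) that the orchestrator walks per request, and ChatQnAGateway exposes that pipeline over HTTP on MEGA_SERVICE_PORT. As a rough mental model only — a toy sketch, not the real ServiceOrchestrator, whose actual implementation is asynchronous and streaming-aware — the pattern looks like this:

# Toy sketch of the add()/flow_to() pattern above. Hypothetical class;
# NOT the GenAIComps API.
class ToyOrchestrator:
    def __init__(self):
        self.services = {}  # name -> callable stage
        self.edges = {}     # name -> list of downstream stage names

    def add(self, name, fn):
        self.services[name] = fn
        self.edges.setdefault(name, [])
        return self  # chainable, like megaservice.add(...).add(...)

    def flow_to(self, src, dst):
        self.edges[src].append(dst)

    def run(self, entry, payload):
        # Walk the linear DAG, feeding each stage's output to the next.
        node, out = entry, payload
        while node is not None:
            out = self.services[node](out)
            downstream = self.edges[node]
            node = downstream[0] if downstream else None
        return out

pipeline = ToyOrchestrator()
pipeline.add("embedding", lambda q: {"query": q, "vector": [0.1, 0.2]}) \
    .add("retriever", lambda x: {**x, "docs": ["doc-1", "doc-2"]}) \
    .add("rerank", lambda x: {**x, "docs": x["docs"][:1]}) \
    .add("llm", lambda x: "answer grounded in " + x["docs"][0])
pipeline.flow_to("embedding", "retriever")
pipeline.flow_to("retriever", "rerank")
pipeline.flow_to("rerank", "llm")
print(pipeline.run("embedding", "What is OPEA?"))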
6 changes: 6 additions & 0 deletions ChatQnA/docker_image_build/build.yaml
@@ -11,6 +11,12 @@ services:
context: ../
dockerfile: ./Dockerfile
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
+chatqna-wrapper:
+build:
+context: ../
+dockerfile: ./Dockerfile.wrapper
+extends: chatqna
+image: ${REGISTRY:-opea}/chatqna-wrapper:${TAG:-latest}
chatqna-guardrails:
build:
context: ../
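
The new chatqna-wrapper entry reuses the chatqna service definition via compose's extends keyword, overriding only the Dockerfile and image name. A rough approximation of how that merge resolves — a sketch assuming PyYAML is installed and build.yaml is in the working directory; real compose performs a deeper, spec-defined merge:

import yaml  # assumption: PyYAML available

def resolve_extends(services, name):
    # Shallow approximation of compose "extends": start from the
    # extended service's mapping and let the extending service's
    # own keys win.
    svc = dict(services[name])
    base = svc.pop("extends", None)
    if base:
        svc = {**resolve_extends(services, base), **svc}
    return svc

with open("build.yaml") as f:
    services = yaml.safe_load(f)["services"]

# chatqna-wrapper inherits chatqna's settings, with its own
# build section and image name taking precedence.
print(resolve_extends(services, "chatqna-wrapper"))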
@@ -1184,13 +1184,8 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
-<<<<<<< HEAD
image: "ghcr.io/huggingface/tgi-gaudi:2.0.6"
imagePullPolicy: IfNotPresent
-=======
-image: "ghcr.io/huggingface/tgi-gaudi:2.0.5"
-imagePullPolicy: Always
->>>>>>> e3187be819ad088c24bf1b2cbb419255af0f2be3
volumeMounts:
- mountPath: /data
name: model-volume
113 changes: 65 additions & 48 deletions ChatQnA/kubernetes/intel/hpu/gaudi/manifest/chatqna-vllm.yaml
@@ -43,6 +43,7 @@ metadata:
app.kubernetes.io/managed-by: Helm
data:
TEI_EMBEDDING_ENDPOINT: "http://chatqna-tei"
+HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"
http_proxy: ""
https_proxy: ""
no_proxy: ""
@@ -70,9 +71,8 @@ data:
no_proxy: ""
LOGFLAG: ""
vLLM_ENDPOINT: "http://chatqna-vllm"
-HUGGINGFACEHUB_API_TOKEN: "insert-your-huggingface-token-here"
-LLM_MODEL: "meta-llama/Llama-3.1-70B-Instruct"
-MODEL_ID: "meta-llama/Llama-3.1-70B-Instruct"
+LLM_MODEL: "meta-llama/Meta-Llama-3-8B-Instruct"
+MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
---
# Source: chatqna/charts/reranking-usvc/templates/configmap.yaml
# Copyright (C) 2024 Intel Corporation
@@ -145,7 +145,6 @@ data:
NUMBA_CACHE_DIR: "/tmp"
TRANSFORMERS_CACHE: "/tmp/transformers_cache"
HF_HOME: "/tmp/.cache/huggingface"
-MAX_WARMUP_SEQUENCE_LENGTH: "512"
---
# Source: chatqna/charts/teirerank/templates/configmap.yaml
# Copyright (C) 2024 Intel Corporation
@@ -170,6 +169,7 @@ data:
NUMBA_CACHE_DIR: "/tmp"
TRANSFORMERS_CACHE: "/tmp/transformers_cache"
HF_HOME: "/tmp/.cache/huggingface"
+MAX_WARMUP_SEQUENCE_LENGTH: "512"
---
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
Expand All @@ -183,7 +183,7 @@ metadata:
app.kubernetes.io/instance: chatqna
app.kubernetes.io/version: "2.1.0"
data:
MODEL_ID: "meta-llama/Llama-3.1-70B-Instruct"
MODEL_ID: "meta-llama/Meta-Llama-3-8B-Instruct"
PORT: "2080"
HF_TOKEN: "insert-your-huggingface-token-here"
http_proxy: ""
@@ -194,6 +194,12 @@ data:
PT_HPU_ENABLE_LAZY_COLLECTIVES: "true"
OMPI_MCA_btl_vader_single_copy_mechanism: "none"
HF_HOME: "/tmp/.cache/huggingface"
+GPU_MEMORY_UTILIZATION: "0.5"
+DTYPE: "auto"
+TENSOR_PARALLEL_SIZE: "1"
+BLOCK_SIZE: "128"
+MAX_NUM_SEQS: "256"
+MAX_SEQ_LEN_TO_CAPTURE: "2048"
---
# Source: chatqna/templates/nginx-deployment.yaml
apiVersion: v1
@@ -649,7 +655,7 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "opea/dataprep-redis:v0.9"
image: "opea/dataprep-redis:latest"
imagePullPolicy: Always
ports:
- name: data-prep
@@ -1103,10 +1109,8 @@ spec:
- configMapRef:
name: chatqna-tei-config
securityContext:
-privileged: true
-capabilities:
-add: ["SYS_NICE"]
-image: "ghcr.io/huggingface/tei-gaudi:1.5.0"
+{}
+image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
imagePullPolicy: IfNotPresent
args:
- "--auto-truncate"
@@ -1140,16 +1144,8 @@ spec:
initialDelaySeconds: 5
periodSeconds: 5
resources:
-limits:
-habana.ai/gaudi: 1
-cpu: 10
-memory: 100Gi
-hugepages-2Mi: 9800Mi
-requests:
-habana.ai/gaudi: 1
-cpu: 10
-memory: 100Gi
-hugepages-2Mi: 9800Mi
+{}
+
volumes:
- name: model-volume # Replace with Persistent volume claim/ host directory
emptyDir: {}
@@ -1191,11 +1187,17 @@ spec:
- configMapRef:
name: chatqna-teirerank-config
securityContext:
-{}
-image: "ghcr.io/huggingface/text-embeddings-inference:cpu-1.5"
-imagePullPolicy: Always
-args:
-- "--auto-truncate"
+allowPrivilegeEscalation: false
+capabilities:
+drop:
+- ALL
+readOnlyRootFilesystem: false
+runAsNonRoot: true
+runAsUser: 1000
+seccompProfile:
+type: RuntimeDefault
+image: "ghcr.io/huggingface/tei-gaudi:1.5.0"
+imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /data
name: model-volume
@@ -1228,7 +1230,8 @@ spec:
initialDelaySeconds: 5
periodSeconds: 5
resources:
-{}
+limits:
+habana.ai/gaudi: 1
volumes:
- name: model-volume # Replace with Persistent volume claim/ host directory
emptyDir: {}
@@ -1242,6 +1245,7 @@ spec:
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0


apiVersion: apps/v1
kind: Deployment
metadata:
@@ -1271,16 +1275,36 @@ spec:
- configMapRef:
name: chatqna-vllm-config
securityContext:
-privileged: true
+allowPrivilegeEscalation: false
capabilities:
-add: ["SYS_NICE"]
-image: "opea/llm-vllm-hpu:latest"
-command:
-- /bin/bash
-- -c
-- |
-export VLLM_CPU_KVCACHE_SPACE=40 && \
-python3 -m vllm.entrypoints.openai.api_server --enforce-eager --gpu-memory-utilization 0.5 --dtype auto --model $MODEL_ID --port 2080 --tensor-parallel-size 8 --block-size 128 --max-num-seqs 256 --max-seq_len-to-capture 2048
+drop:
+- ALL
+readOnlyRootFilesystem: true
+runAsNonRoot: true
+runAsUser: 1000
+seccompProfile:
+type: RuntimeDefault
+image: "opea/vllm-hpu:latest"
+args:
+- "--enforce-eager"
+- "--model"
+- "$(MODEL_ID)"
+- "--tensor-parallel-size"
+- "1"
+- "--gpu-memory-utilization"
+- "$(GPU_MEMORY_UTILIZATION)"
+- "--dtype"
+- "$(DTYPE)"
+- "--max-num-seqs"
+- "$(MAX_NUM_SEQS)"
+- "--block-size"
+- "$(BLOCK_SIZE)"
+- "--max-seq-len-to-capture"
+- "$(MAX_SEQ_LEN_TO_CAPTURE)"
+- "--host"
+- "0.0.0.0"
+- "--port"
+- "$(PORT)"
imagePullPolicy: Always
volumeMounts:
- mountPath: /data
@@ -1293,20 +1317,13 @@ spec:
protocol: TCP
resources:
limits:
-habana.ai/gaudi: 8
-cpu: 40
-memory: 400Gi
-hugepages-2Mi: 9800Mi
-requests:
-habana.ai/gaudi: 8
-cpu: 40
-memory: 400Gi
-hugepages-2Mi: 9800Mi
+habana.ai/gaudi: 1
volumes:
- name: model-volume # Replace with Persistent volume claim/ host directory
emptyDir: {}
- name: tmp
emptyDir: {}

---
# Source: chatqna/templates/deployment.yaml
# Copyright (C) 2024 Intel Corporation
@@ -1350,8 +1367,8 @@ spec:
value: chatqna-retriever-usvc
- name: EMBEDDING_SERVICE_HOST_IP
value: chatqna-embedding-usvc
-- name: GUARDRAIL_SERVICE_HOST_IP
-value: chatqna-guardrails-usvc
+- name: MODEL_ID
+value: "meta-llama/Meta-Llama-3-8B-Instruct"
securityContext:
allowPrivilegeEscalation: false
capabilities:
@@ -1362,8 +1379,8 @@ spec:
runAsUser: 1000
seccompProfile:
type: RuntimeDefault
image: "opea/chatqna:latest"
imagePullPolicy: Always
image: "opea/chatqna-wrapper:latest"
imagePullPolicy: IfNotPresent
volumeMounts:
- mountPath: /tmp
name: tmp
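
One detail that makes the new args block work: Kubernetes expands $(VAR) references in container args against the container's environment, which here is populated from the chatqna-vllm-config ConfigMap. A small sketch of that expansion with the values added above — illustrative only, since the kubelet performs this substitution natively:

import re

# Values from the chatqna-vllm-config ConfigMap in this manifest.
env = {
    "MODEL_ID": "meta-llama/Meta-Llama-3-8B-Instruct",
    "GPU_MEMORY_UTILIZATION": "0.5",
    "DTYPE": "auto",
    "MAX_NUM_SEQS": "256",
    "BLOCK_SIZE": "128",
    "MAX_SEQ_LEN_TO_CAPTURE": "2048",
    "PORT": "2080",
}

args = [
    "--enforce-eager",
    "--model", "$(MODEL_ID)",
    "--tensor-parallel-size", "1",
    "--gpu-memory-utilization", "$(GPU_MEMORY_UTILIZATION)",
    "--dtype", "$(DTYPE)",
    "--max-num-seqs", "$(MAX_NUM_SEQS)",
    "--block-size", "$(BLOCK_SIZE)",
    "--max-seq-len-to-capture", "$(MAX_SEQ_LEN_TO_CAPTURE)",
    "--host", "0.0.0.0",
    "--port", "$(PORT)",
]

def expand(arg):
    # Mimic kubelet's $(VAR) substitution; unknown vars stay as-is.
    return re.sub(r"\$\((\w+)\)", lambda m: env.get(m.group(1), m.group(0)), arg)

print(" ".join(expand(a) for a in args))
# --enforce-eager --model meta-llama/Meta-Llama-3-8B-Instruct ... --port 2080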