Added: customize timeout and max_concurrency in services via env vars; cleanup legacy_trainer (#551)
konfuzio-ai · Nov 10, 2024 · 4d6f4e2 · 4d6f4e2
1 parent 72766e9
commit 4d6f4e2
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 9 deletions.
diff --git a/konfuzio_sdk/bento/categorization/categorizationai_service.py b/konfuzio_sdk/bento/categorization/categorizationai_service.py
@@ -28,11 +28,11 @@
 
 @bentoml.service(
     traffic={
-        'timeout': 3600,  # Hard limit for categorization calls is 1 hour
+        'timeout': int(os.environ.get('BENTO_SERVICE_TIMEOUT', '3600')),
         # Don't process more than 2 documents at a time. Will respond with 429 if more come.
         # Clients should implement a retry strategy for 429.
         # Servers should implement a scaling strategy and start multiple services when high load is present.
-        'max_concurrency': 2,
+        'max_concurrency': int(os.environ.get('BENTO_SERVICE_MAX_CONCURRENCY', '2')),
     }
 )
 @bentoml.mount_asgi_app(app, path='/v1')

diff --git a/konfuzio_sdk/bento/extraction/rfextractionai_service.py b/konfuzio_sdk/bento/extraction/rfextractionai_service.py
@@ -28,11 +28,11 @@
 
 @bentoml.service(
     traffic={
-        'timeout': 3600,  # Hard limit for extraction calls is 1 hour
+        'timeout': int(os.environ.get('BENTO_SERVICE_TIMEOUT', '3600')),
         # Don't process more than 2 documents at a time. Will respond with 429 if more come.
         # Clients should implement a retry strategy for 429.
         # Servers should implement a scaling strategy and start multiple services when high load is present.
-        'max_concurrency': 2,
+        'max_concurrency': int(os.environ.get('BENTO_SERVICE_MAX_CONCURRENCY', '2')),
     }
 )
 @bentoml.mount_asgi_app(app, path='/v1')

diff --git a/konfuzio_sdk/bento/extraction/rfextractionai_service_for_legacy_trainer.py b/konfuzio_sdk/bento/extraction/rfextractionai_service_for_legacy_trainer.py
@@ -4,8 +4,9 @@
 import json
 import os
 import typing as t
-import bentoml
 from concurrent.futures import ThreadPoolExecutor
+
+import bentoml
 from fastapi import Depends, FastAPI, HTTPException
 
 from .schemas import ExtractRequest20240117, ExtractResponseForLegacyTrainer20240912
@@ -16,7 +17,7 @@
 except (ImportError, ValueError):
     from base.utils import handle_exceptions
 
-from konfuzio_sdk.data import Project, Category
+from konfuzio_sdk.data import Category, Project
 
 # load ai model name from AI_MODEL_NAME file in parent directory
 ai_model_name_file = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'AI_MODEL_NAME')
@@ -25,7 +26,15 @@
 app = FastAPI()
 
 
-@bentoml.service
+@bentoml.service(
+    traffic={
+        'timeout': int(os.environ.get('BENTO_SERVICE_TIMEOUT', '3600')),
+        # Don't process more than 2 documents at a time. Will respond with 429 if more come.
+        # Clients should implement a retry strategy for 429.
+        # Servers should implement a scaling strategy and start multiple services when high load is present.
+        'max_concurrency': int(os.environ.get('BENTO_SERVICE_MAX_CONCURRENCY', '2')),
+    }
+)
 @bentoml.mount_asgi_app(app, path='/v1')
 class ExtractionService:
     model_ref = bentoml.models.get(ai_model_name)
@@ -86,10 +95,11 @@ async def extract(self, ctx: bentoml.Context, **request: t.Any) -> ExtractRespon
         for _page in request.pages:
             pages.append(dict(_page))
 
-        result = await asyncio.get_event_loop().run_in_executor(self.executor, extraction_model.extract, request.text, bboxes, pages)
+        result = await asyncio.get_event_loop().run_in_executor(
+            self.executor, extraction_model.extract, request.text, bboxes, pages
+        )
         json_result = process_response(result, schema=ExtractResponseForLegacyTrainer20240912)
 
-        project._documents = [d for d in project._documents if d.id_ != document.id_ and d.copy_of_id != document.id_]
         return json_result