
Add Slack classification and GHA for scheduled runs
markbotterill committed May 1, 2024
1 parent d8da706 commit f78ee1d
Showing 7 changed files with 239 additions and 15 deletions.
40 changes: 40 additions & 0 deletions .github/workflows/send-slack-metrics.yml
@@ -0,0 +1,40 @@
name: Send Slack Metrics

on:
  # Note: scheduled workflows always run against the repository's default branch;
  # the schedule trigger does not accept a branch filter.
  schedule:
    - cron: "*/5 * * * *"

jobs:
  deploy:
    runs-on: ubuntu-latest
    environment:
      name: production
    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Configure SSH
        run: |
          mkdir -p ~/.ssh/
          echo "$PROD_SSH_KEY" > ~/.ssh/hubgpt_prod.key
          chmod 600 ~/.ssh/hubgpt_prod.key
          cat >>~/.ssh/config <<END
          Host prod
          HostName $PROD_EC2_HOST_IP
          User $PROD_EC2_USER
          IdentityFile ~/.ssh/hubgpt_prod.key
          StrictHostKeyChecking no
          END
        env:
          PROD_EC2_USER: ${{ secrets.PROD_EC2_USER }}
          PROD_SSH_KEY: ${{ secrets.PROD_SSH_KEY }}
          PROD_EC2_HOST_IP: ${{ secrets.PROD_EC2_HOST_IP }}

      - name: Trigger metric dispatch
        run: |
          ssh prod << 'EOF'
          cd danswer
          make send-slack-metrics
          EOF
2 changes: 2 additions & 0 deletions Makefile
@@ -31,3 +31,5 @@ re-deploy:
	docker compose -p danswer-stack down && \
	git pull origin prod && \
	docker compose -p danswer-stack -f docker-compose.prod.yml up -d --build
send-slack-metrics:
	docker exec danswer-stack-background-1 /usr/local/bin/python /app/scripts/send_slack_report/send_slack_report.py
3 changes: 2 additions & 1 deletion backend/Dockerfile
@@ -40,6 +40,7 @@ RUN apt-get remove -y --allow-remove-essential perl-base xserver-common xvfb cma
# Set up application files
WORKDIR /app
COPY ./danswer /app/danswer
COPY ./scripts /app/scripts
COPY ./shared_configs /app/shared_configs
COPY ./alembic /app/alembic
COPY ./alembic.ini /app/alembic.ini
@@ -49,4 +50,4 @@ ENV PYTHONPATH /app

# Default command which does nothing
# This container is used by api server and background which specify their own CMD
CMD ["tail", "-f", "/dev/null"]
CMD ["tail", "-f", "/dev/null"]
128 changes: 128 additions & 0 deletions backend/scripts/send_slack_report/initial_query_classification.py
@@ -0,0 +1,128 @@
import re

PROJECTS = [
"Can you list all the projects in education that IDinsight has ever done?",
"Hello - can you give me a list of all the projects Karan Nagpal has worked on as a\
Director? Source answers from Slack channel project updates.",
"have we done any work in social protection",
"Is the Philippines' Health Promotion and Literacy Longitudinal Study considered an\
impact evaluation project",
"What was the most recent MLE project IDinsight completed?",
"project summary dobu",
"Give me a summary of what has happened on a care LP in the last 13 months",
]

POLICIES = [
"How do you sign up on surveystream",
"How do I use Unit4",
"Where do I submit my expenses",
"How do I install perimeter81?",
"What is the link for accessing timesheets?",
]

RESOURCES = [
"can you share with me the IDi doc templates link",
"Can you give me an example of IRB submission",
"Hello! Where can I find the user manual to be filled for a project on-boarding?",
"workplans for a mixed methods process evaluation",
"An example of a short proposal pitch for work IDinsight has done in Microfinance",
"can anyone share a MELA proposal that we've done recently in education? WNA is developing\
a proposal for a MELA engagement with the MEN in Cote d'Ivoire.",
]

TEAMS = [
"About Dinabandhu Bharti",
"Tell me about Mark Botterell",
"Who is Zia",
"who is amit kumar",
"tell me about dinabandhu bharti field manager at idinsight",
"Who is DS team director?",
]

MISCELLANEOUS = [
"hi",
"Test",
"Hi <!channel>",
"holiday calendar",
"what is 'hubgpt'",
"What are the list of holidays in India in 2024",
"Qualgpt",
"what was the temperature in Delhi on 25th February 2024?",
"mumbwa",
]

CONTEXT_PROMPT = f"""
You are a labeling bot. Your job is to give a label to a question asked by a\
user and return a one-word label.
The labels you can choose from are as follows: "PROJECTS", "POLICIES", "RESOURCES",\
"TEAMS", or "MISCELLANEOUS".
I have provided you with some demonstrations as to how you MUST classify a question:
Questions suitable for the "PROJECTS" label will involve past or current projects,\
services, work, interventions,
or experiences at IDinsight. Here are some examples: {PROJECTS}
Questions suitable for the "POLICIES" label are questions that involve eligibility\
and steps of all global or
regional policies, processes, benefits, and all queries related to Unit4. Here are\
some examples: {POLICIES}
The "RESOURCES" label applies to questions about guidelines, tools, products, and\
resources that support
organizational work, such as guidelines for project execution, service-related\
resources, or questions about
tools like the GDH Dashboard, HubGPT, KM processes, etc. Here are some examples:\
{RESOURCES}
Questions suitable for the "TEAMS" label involve IDinsight personnel-related\
questions around people's
designations, roles and responsibilities, regions, and who to contact for a\
specific task, as well as organizational
structure. Here are some examples: {TEAMS}
Questions suitable for the "MISCELLANEOUS" label include anything that doesn't\
clearly fall into the above
four categories, such as random questions about IDinsight, or general non-IDinsight\
questions that
can be asked to chatGPT. Here are some examples: {MISCELLANEOUS}
"""

TASK_PROMPT = """
Example usage:
Submitted question from user: "Who is the head of Human Resources?"
Answer: "TEAMS"
Consider the lists and labels, then read the user text in triple backticks below, then\
provide your one-word classification.
Submitted question from user: ```{question}```
What is the most accurate label for the submitted question from the user?
Output your response between the tags <Response> and </Response>, without any additional text.
"""


def insert_question(user_question):
    full_prompt = CONTEXT_PROMPT + TASK_PROMPT.format(question=user_question)
    return full_prompt


def label_question(user_question, client):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": insert_question(user_question),
            }
        ],
        model="gpt-3.5-turbo",
    )
    output = chat_completion.choices[0].message.content
    label = re.findall(r"<Response>\s*([\s\S]*?)\s*</Response>", output)[0]
    return label
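Note that label_question takes the first match from re.findall, so it will raise an IndexError if the model ever replies without the <Response> tags. A minimal ad-hoc usage sketch, not part of this commit, assuming GEN_AI_API_KEY is set and the script is run from the scripts/send_slack_report directory:

import os

from initial_query_classification import label_question
from openai import OpenAI

client = OpenAI(api_key=os.environ["GEN_AI_API_KEY"])
# Each call returns one of: PROJECTS, POLICIES, RESOURCES, TEAMS, MISCELLANEOUS.
for question in [
    "Where do I submit my expenses",
    "What was the most recent MLE project IDinsight completed?",
]:
    print(question, "->", label_question(question, client))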
10 changes: 10 additions & 0 deletions backend/scripts/send_slack_report/queries.py
@@ -22,3 +22,13 @@
INTERVAL '7 days')
AND user_id IS NOT NULL
"""
INITIAL_MESSAGES_QUERY = """
SELECT message as initial_query FROM (
    SELECT *,
        ROW_NUMBER() OVER (PARTITION BY chat_session_id ORDER BY time_sent ASC) as rn
    FROM chat_message
    WHERE (time_sent >= (NOW() AT TIME ZONE 'UTC') - INTERVAL '7 days')
    AND (message_type = 'USER')
) sub
WHERE sub.rn = 1
ORDER BY sub.time_sent ASC;"""
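The window function keeps only the first USER message per chat_session_id within the last 7 days. A small sketch for running the query ad hoc, not part of this commit; the connection defaults below are illustrative and assume the same POSTGRES_* variables used by send_slack_report.py, with the script run from the same directory:

import os

import pandas as pd
from queries import INITIAL_MESSAGES_QUERY
from sqlalchemy import create_engine

user = os.environ.get("POSTGRES_USER") or "postgres"
password = os.environ.get("POSTGRES_PASSWORD") or "password"  # illustrative default
host = os.environ.get("POSTGRES_HOST") or "localhost"
port = os.environ.get("POSTGRES_PORT") or "5432"
db = os.environ.get("POSTGRES_DB") or "postgres"

engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{db}")
with engine.connect() as connection:
    df = pd.read_sql_query(INITIAL_MESSAGES_QUERY, connection)
print(f"{len(df)} initial queries in the last 7 days")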
66 changes: 52 additions & 14 deletions backend/scripts/send_slack_report/send_slack_report.py
@@ -1,13 +1,21 @@
import json
import os

import yaml
import pandas as pd
from initial_query_classification import label_question
from openai import OpenAI
from queries import INITIAL_MESSAGES_QUERY
from queries import SLACK_MESSAGES_QUERY
from queries import WEB_MESSAGES_QUERY
from queries import WEB_USERS_QUERY
from slack_sdk import WebClient
from sqlalchemy import create_engine
from sqlalchemy import text

from danswer.utils.logger import setup_logger

logger = setup_logger()


def get_engine():
    POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres"
@@ -24,6 +32,7 @@ def get_engine():


def get_counts():
    logger.info("Connecting to SQL database")
    engine = get_engine()

    with engine.connect() as connection:
@@ -37,45 +46,74 @@ def get_counts():
    with engine.connect() as connection:
        unique_users = connection.execute(text(WEB_USERS_QUERY))
        web_users = unique_users.fetchone()[0]

    logger.info("Counts retrieved")
    return slack_messages, web_messages, web_users


def create_message(slack_messages, web_messages, web_users):
def classify_initial_queries():
    engine = get_engine()
    with engine.connect() as connection:
        df = pd.read_sql_query(INITIAL_MESSAGES_QUERY, connection)
    logger.info("Initial queries received")
    client = OpenAI(api_key=os.environ["GEN_AI_API_KEY"])
    label_series = df["initial_query"].map(lambda x: label_question(x, client))
    logger.info("Labelling complete")
    tally_json = json.loads(label_series.value_counts().to_json())
    classifications = ""
    total_initial_queries = sum(tally_json.values())
    for k, v in tally_json.items():
        classifications += f"""There were {v} queries (representing {v/total_initial_queries * 100}% \
of all initial queries) about {k} \n"""
    return classifications


def create_message(slack_messages, web_messages, web_users, classifications):
    message = (
        f"Hello Users!\n\n"
        f"Here are some updates from HubGPT regarding the last 7 days:\n"
        f"- {slack_messages}: Slack messages in the last 7 days.\n"
        f"- {web_messages}: Web App messages in the last 7 days.\n"
        f"- {web_users}: Unique users on the Web App."
        f"- {web_users}: Unique users on the Web App.\n"
        "Usage breakdown:\n"
        f"{classifications}"
    )
    return message


def send_message(user_id, message):
    # Get Slack token from yaml
    with open("secrets.yaml", "r") as file:
        secrets = yaml.safe_load(file)
    SLACK_TOKEN = os.environ.get("SLACK_BOT_TOKEN")
    if not SLACK_TOKEN:
        logger.debug(
            "Slack OAuth token not provided. Check env prod template for guidance"
        )
        return None
    logger.info("Initializing Slack client")

    SLACK_TOKEN = secrets["SLACK_BOT_TOKEN"]
    slack_client = WebClient(token=SLACK_TOKEN)

    print("Sending message")
    logger.info("Sending Slack message")
    # Send a message to the user
    slack_client.chat_postMessage(channel=user_id, text=message)
    print("Message sent")
    logger.info("Message sent")
    return None


def send_usage_report_to_slack(user_id):
    slack, web, web_users = get_counts()
    message = create_message(slack, web, web_users)
    classifications = classify_initial_queries()
    message = create_message(slack, web, web_users, classifications)
    send_message(user_id, message)

    return None


if __name__ == "__main__":
    USER_ID = "C05K8F6RXU3"
    print("Starting...")
    send_usage_report_to_slack(USER_ID)
    USER_ID = os.environ.get("METRICS_CHANNEL_ID")
    if not USER_ID:
        logger.debug(
            "Slack metrics channel ID not provided. \
Check env prod template for guidance"
        )
    else:
        logger.info("Starting Slack usage report")
        send_usage_report_to_slack(USER_ID)
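A dry-run sketch that builds the report text without posting to Slack can help when testing changes; it is not part of this commit and assumes it is run from /app/scripts/send_slack_report inside the background container, where the POSTGRES_* and GEN_AI_API_KEY variables are set:

from send_slack_report import classify_initial_queries, create_message, get_counts

slack_messages, web_messages, web_users = get_counts()
classifications = classify_initial_queries()
# Print the message that would otherwise be posted via chat_postMessage.
print(create_message(slack_messages, web_messages, web_users, classifications))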
5 changes: 5 additions & 0 deletions deployment/docker_compose/env.prod.template
@@ -18,6 +18,11 @@ GEN_AI_MODEL_VERSION=gpt-4
#GEN_AI_API_VERSION=


# Metrics Slack bot OAuth token (from the Slack API's OAuth & Permissions page, with the chat:write scope)
SLACK_BOT_TOKEN=
# Slack channel to post metrics into
METRICS_CHANNEL_ID=

# If you want to setup a slack bot to answer questions automatically in Slack
# channels it is added to, you must specify the two below.
# More information in the guide here: https://docs.danswer.dev/slack_bot_setup
