
Add Slack classification and GHA for scheduled runs
markbotterill committed May 1, 2024
1 parent d8da706 commit f78ee1d
Showing 7 changed files with 239 additions and 15 deletions.
40 changes: 40 additions & 0 deletions .github/workflows/send-slack-metrics.yml
@@ -0,0 +1,40 @@
name: Send Slack Metrics

on:
  # Note: scheduled workflows always run against the repository's default branch;
  # the schedule trigger does not accept a branch filter.
  schedule:
    - cron: "*/5 * * * *"

jobs:
  deploy:
    runs-on: ubuntu-latest
    environment:
      name: production
    steps:
      - name: Checkout code
        uses: actions/checkout@v2

      - name: Configure SSH
        run: |
          mkdir -p ~/.ssh/
          echo "$PROD_SSH_KEY" > ~/.ssh/hubgpt_prod.key
          chmod 600 ~/.ssh/hubgpt_prod.key
          cat >>~/.ssh/config <<END
          Host prod
          HostName $PROD_EC2_HOST_IP
          User $PROD_EC2_USER
          IdentityFile ~/.ssh/hubgpt_prod.key
          StrictHostKeyChecking no
          END
        env:
          PROD_EC2_USER: ${{ secrets.PROD_EC2_USER }}
          PROD_SSH_KEY: ${{ secrets.PROD_SSH_KEY }}
          PROD_EC2_HOST_IP: ${{ secrets.PROD_EC2_HOST_IP }}

      - name: Trigger metric dispatch
        run: |
          ssh prod << 'EOF'
          cd danswer
          make send-slack-metrics
          EOF
2 changes: 2 additions & 0 deletions Makefile
@@ -31,3 +31,5 @@ re-deploy:
	docker compose -p danswer-stack down && \
	git pull origin prod && \
	docker compose -p danswer-stack -f docker-compose.prod.yml up -d --build
send-slack-metrics:
	docker exec danswer-stack-background-1 /usr/local/bin/python /app/scripts/send_slack_report/send_slack_report.py
3 changes: 2 additions & 1 deletion backend/Dockerfile
@@ -40,6 +40,7 @@ RUN apt-get remove -y --allow-remove-essential perl-base xserver-common xvfb cma
# Set up application files
WORKDIR /app
COPY ./danswer /app/danswer
COPY ./scripts /app/scripts
COPY ./shared_configs /app/shared_configs
COPY ./alembic /app/alembic
COPY ./alembic.ini /app/alembic.ini
@@ -49,4 +50,4 @@ ENV PYTHONPATH /app

# Default command which does nothing
# This container is used by api server and background which specify their own CMD
CMD ["tail", "-f", "/dev/null"]
CMD ["tail", "-f", "/dev/null"]
128 changes: 128 additions & 0 deletions backend/scripts/send_slack_report/initial_query_classification.py
@@ -0,0 +1,128 @@
import re

PROJECTS = [
"Can you list all the projects in education that IDinsight has ever done?",
"Hello - can you give me a list of all the projects Karan Nagpal has worked on as a\
Director? Source answers from Slack channel project updates.",
"have we done any work in social protection",
"Is the Philippines' Health Promotion and Literacy Longitudinal Study considered an\
impact evaluation project",
"What was the most recent MLE project IDinsight completed?",
"project summary dobu",
"Give me a summary of what has happened on a care LP in the last 13 months",
]

POLICIES = [
"How do you sign up on surveystream",
"How do I use Unit4",
"Where do I submit my expenses",
"How do I install perimeter81?",
"What is the link for accessing timesheets?",
]

RESOURCES = [
"can you share with me the IDi doc templates link",
"Can you give me an example of IRB submission",
"Hello! Where can I find the user manual to be filled for a project on-boarding?",
"workplans for a mixed methods process evaluation",
"An example of a short proposal pitch for work IDinsight has done in Microfinance",
"can anyone share a MELA proposal that we've done recently in education? WNA is developing\
a proposal for a MELA engagement with the MEN in Cote d'Ivoire.",
]

TEAMS = [
"About Dinabandhu Bharti",
"Tell me about Mark Botterell",
"Who is Zia",
"who is amit kumar",
"tell me about dinabandhu bharti field manager at idinsight",
"Who is DS team director?",
]

MISCELLANEOUS = [
"hi",
"Test",
"Hi <!channel>",
"holiday calendar",
"what is 'hubgpt'",
"What are the list of holidays in India in 2024",
"Qualgpt",
"what was the temperature in Delhi on 25th February 2024?",
"mumbwa",
]

CONTEXT_PROMPT = f"""
You are a labeling bot. Your job is to give a label to a question asked by a\
user and return a one-word label.
The labels you can choose from are as follows: "PROJECTS", "POLICIES", "RESOURCES",\
"TEAMS", or "MISCELLANEOUS".
I have provided you with some demonstrations as to how you MUST classify a question:
Questions suitable for the "PROJECTS" label will involve past or current projects,\
services, work, interventions,
or experiences at IDinsight. Here are some examples: {PROJECTS}
Questions suitable for the "POLICIES" label are questions that involve eligibility\
and steps of all global or
regional policies, processes, benefits, and all queries related to Unit4. Here are\
some examples: {POLICIES}
The "RESOURCES" label applies to questions about guidelines, tools, products, and\
resources that support
organizational work, such as guidelines for project execution, service-related\
resources, or questions about
tools like the GDH Dashboard, HubGPT, KM processes, etc. Here are some examples:\
{RESOURCES}
Questions suitable for the "TEAMS" label involve IDinsight personnel-related\
questions around people's
designations, roles and responsibilities, regions, and who to contact for a\
specific task, as well as organizational
structure. Here are some examples: {TEAMS}
Questions suitable for the "MISCELLANEOUS" label include anything that doesn't\
clearly fall into the above
four categories, such as random questions about IDinsight, or general non-IDinsight\
questions that
can be asked to chatGPT. Here are some examples: {MISCELLANEOUS}
"""

TASK_PROMPT = """
Example usage:
Submitted question from user: "Who is the head of Human Resources?"
Answer: "TEAMS"
Consider the lists and labels, then read the user text in triple backticks below, then\
provide your one-word classification.
Submitted question from user: ```{question}```
What is the most accurate label for the submitted question from the user?
Output your response between the tags <Response> and </Response>, without any additional text.
"""


def insert_question(user_question):
    full_prompt = CONTEXT_PROMPT + TASK_PROMPT.format(question=user_question)
    return full_prompt


def label_question(user_question, client):
    chat_completion = client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": insert_question(user_question),
            }
        ],
        model="gpt-3.5-turbo",
    )
    output = chat_completion.choices[0].message.content
    label = re.findall(r"<Response>\s*([\s\S]*?)\s*</Response>", output)[0]
    return label
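Note that label_question takes the first match from re.findall, so it will raise an IndexError if the model ever replies without the <Response> tags. A minimal ad-hoc usage sketch, not part of this commit, assuming GEN_AI_API_KEY is set and the script is run from the scripts/send_slack_report directory:

import os

from initial_query_classification import label_question
from openai import OpenAI

client = OpenAI(api_key=os.environ["GEN_AI_API_KEY"])
# Each call returns one of: PROJECTS, POLICIES, RESOURCES, TEAMS, MISCELLANEOUS.
for question in [
    "Where do I submit my expenses",
    "What was the most recent MLE project IDinsight completed?",
]:
    print(question, "->", label_question(question, client))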
10 changes: 10 additions & 0 deletions backend/scripts/send_slack_report/queries.py
@@ -22,3 +22,13 @@
INTERVAL '7 days')
AND user_id IS NOT NULL
"""
INITIAL_MESSAGES_QUERY = """
SELECT message as initial_query FROM (
    SELECT *,
        ROW_NUMBER() OVER (PARTITION BY chat_session_id ORDER BY time_sent ASC) as rn
    FROM chat_message
    WHERE (time_sent >= (NOW() AT TIME ZONE 'UTC') - INTERVAL '7 days')
    AND (message_type = 'USER')
) sub
WHERE sub.rn = 1
ORDER BY sub.time_sent ASC;"""
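The window function keeps only the first USER message per chat_session_id within the last 7 days. A small sketch for running the query ad hoc, not part of this commit; the connection defaults below are illustrative and assume the same POSTGRES_* variables used by send_slack_report.py, with the script run from the same directory:

import os

import pandas as pd
from queries import INITIAL_MESSAGES_QUERY
from sqlalchemy import create_engine

user = os.environ.get("POSTGRES_USER") or "postgres"
password = os.environ.get("POSTGRES_PASSWORD") or "password"  # illustrative default
host = os.environ.get("POSTGRES_HOST") or "localhost"
port = os.environ.get("POSTGRES_PORT") or "5432"
db = os.environ.get("POSTGRES_DB") or "postgres"

engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{db}")
with engine.connect() as connection:
    df = pd.read_sql_query(INITIAL_MESSAGES_QUERY, connection)
print(f"{len(df)} initial queries in the last 7 days")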
66 changes: 52 additions & 14 deletions backend/scripts/send_slack_report/send_slack_report.py
@@ -1,13 +1,21 @@
import json
import os

import yaml
import pandas as pd
from initial_query_classification import label_question
from openai import OpenAI
from queries import INITIAL_MESSAGES_QUERY
from queries import SLACK_MESSAGES_QUERY
from queries import WEB_MESSAGES_QUERY
from queries import WEB_USERS_QUERY
from slack_sdk import WebClient
from sqlalchemy import create_engine
from sqlalchemy import text

from danswer.utils.logger import setup_logger

logger = setup_logger()


def get_engine():
    POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres"
@@ -24,6 +32,7 @@ def get_engine():


def get_counts():
    logger.info("Connecting to SQL database")
    engine = get_engine()

    with engine.connect() as connection:
@@ -37,45 +46,74 @@ def get_counts():
    with engine.connect() as connection:
        unique_users = connection.execute(text(WEB_USERS_QUERY))
        web_users = unique_users.fetchone()[0]

    logger.info("Counts retrieved")
    return slack_messages, web_messages, web_users


def create_message(slack_messages, web_messages, web_users):
def classify_initial_queries():
    engine = get_engine()
    with engine.connect() as connection:
        df = pd.read_sql_query(INITIAL_MESSAGES_QUERY, connection)
    logger.info("Initial queries received")
    client = OpenAI(api_key=os.environ["GEN_AI_API_KEY"])
    label_series = df["initial_query"].map(lambda x: label_question(x, client))
    logger.info("Labelling complete")
    tally_json = json.loads(label_series.value_counts().to_json())
    classifications = ""
    total_initial_queries = sum(tally_json.values())
    for k, v in tally_json.items():
        classifications += f"""There were {v} queries (representing {v/total_initial_queries * 100}% \
of all initial queries) about {k} \n"""
    return classifications


def create_message(slack_messages, web_messages, web_users, classifications):
    message = (
        f"Hello Users!\n\n"
        f"Here are some updates from HubGPT regarding the last 7 days:\n"
        f"- {slack_messages}: Slack messages in the last 7 days.\n"
        f"- {web_messages}: Web App messages in the last 7 days.\n"
        f"- {web_users}: Unique users on the Web App."
        f"- {web_users}: Unique users on the Web App.\n"
        "Usage breakdown:\n"
        f"{classifications}"
    )
    return message


def send_message(user_id, message):
    # Get Slack token from yaml
    with open("secrets.yaml", "r") as file:
        secrets = yaml.safe_load(file)
    SLACK_TOKEN = os.environ.get("SLACK_BOT_TOKEN")
    if not SLACK_TOKEN:
        logger.debug(
            "Slack OAuth token not provided. Check env prod template for guidance"
        )
        return None
    logger.info("Initializing Slack client")

    SLACK_TOKEN = secrets["SLACK_BOT_TOKEN"]
    slack_client = WebClient(token=SLACK_TOKEN)

    print("Sending message")
    logger.info("Sending Slack message")
    # Send a message to the user
    slack_client.chat_postMessage(channel=user_id, text=message)
    print("Message sent")
    logger.info("Message sent")
    return None


def send_usage_report_to_slack(user_id):
    slack, web, web_users = get_counts()
    message = create_message(slack, web, web_users)
    classifications = classify_initial_queries()
    message = create_message(slack, web, web_users, classifications)
    send_message(user_id, message)

    return None


if __name__ == "__main__":
    USER_ID = "C05K8F6RXU3"
    print("Starting...")
    send_usage_report_to_slack(USER_ID)
    USER_ID = os.environ.get("METRICS_CHANNEL_ID")
    if not USER_ID:
        logger.debug(
            "Slack metrics channel ID not provided. \
Check env prod template for guidance"
        )
    else:
        logger.info("Starting Slack usage report")
        send_usage_report_to_slack(USER_ID)
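A dry-run sketch that builds the report text without posting to Slack can help when testing changes; it is not part of this commit and assumes it is run from /app/scripts/send_slack_report inside the background container, where the POSTGRES_* and GEN_AI_API_KEY variables are set:

from send_slack_report import classify_initial_queries, create_message, get_counts

slack_messages, web_messages, web_users = get_counts()
classifications = classify_initial_queries()
# Print the message that would otherwise be posted via chat_postMessage.
print(create_message(slack_messages, web_messages, web_users, classifications))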
5 changes: 5 additions & 0 deletions deployment/docker_compose/env.prod.template
@@ -18,6 +18,11 @@ GEN_AI_MODEL_VERSION=gpt-4
#GEN_AI_API_VERSION=


# Metrics Slack bot OAuth token (from the Slack API's OAuth & Permissions page, with the chat:write scope)
SLACK_BOT_TOKEN=
# Slack channel to post metrics into
METRICS_CHANNEL_ID=

# If you want to setup a slack bot to answer questions automatically in Slack
# channels it is added to, you must specify the two below.
# More information in the guide here: https://docs.danswer.dev/slack_bot_setup
