Staging #21

Merged · 20 commits · May 31, 2024

1 change: 1 addition & 0 deletions backend/scripts/hubgpt_eval_automation.py
@@ -84,6 +84,7 @@ def upload_to_slack(filename, channel_id):
for num, query in enumerate(queries_list):
    print(f"Query {num+1}/{len(queries_list)}: {query}")
    response = process_question(
+       # Change to staging for staging testing
        danswer_url="https://hubgpt.idinsight.io", question=query, api_key=None
    )
    responses.append(response)
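The staging/production switch above is a hand-edited comment on a hardcoded URL. A minimal sketch of reading the target from an environment variable instead, so staging runs need no code edit (`HUBGPT_URL` is an assumed variable name, not part of this PR):

```python
import os

# Hypothetical: HUBGPT_URL is not defined in this repo; default to production.
danswer_url = os.environ.get("HUBGPT_URL", "https://hubgpt.idinsight.io")
response = process_question(danswer_url=danswer_url, question=query, api_key=None)
```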
23 changes: 14 additions & 9 deletions backend/scripts/send_slack_report/queries.py
@@ -73,12 +73,17 @@
    },
}
INITIAL_MESSAGES_QUERY = """
-SELECT message as initial_query FROM (
-    SELECT *,
-    ROW_NUMBER() OVER (PARTITION BY chat_session_id ORDER BY time_sent ASC) as rn
-    FROM chat_message
-    WHERE (time_sent >= (NOW() AT TIME ZONE 'UTC') - INTERVAL '7 days')
-    AND (message_type = 'USER')
-) sub
-WHERE sub.rn = 1
-ORDER BY sub.time_sent ASC;"""
+WITH subquery AS (
+    SELECT cm.time_sent, cs.user_id, cm.message, cm.id, cf.is_positive, cf.feedback_text,
+        ROW_NUMBER() OVER (PARTITION BY cm.chat_session_id ORDER BY cm.time_sent ASC) AS rn
+    FROM chat_message cm
+    LEFT JOIN chat_session cs ON cs.id = cm.chat_session_id
+    LEFT JOIN chat_feedback cf ON cf.chat_message_id = cm.id
+    WHERE cm.time_sent >= (NOW() AT TIME ZONE 'UTC') - INTERVAL '7 days'
+        AND cm.message_type = 'USER'
+)
+SELECT time_sent, user_id, message, id, is_positive, feedback_text
+FROM subquery
+WHERE rn = 1
+ORDER BY time_sent ASC;
+"""
203 changes: 127 additions & 76 deletions backend/scripts/send_slack_report/send_slack_report.py
@@ -1,5 +1,6 @@
import json
import os
+import re

import pandas as pd
import plotly.express as px
@@ -15,147 +16,197 @@

from danswer.utils.logger import setup_logger


+# Global Variables and Paths
+CSV_PATH = "/app/scripts/send_slack_report/all_data.csv"
+POSTGRES_USER = os.environ.get("POSTGRES_USER", "postgres")
+POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "password")
+POSTGRES_HOST = os.environ.get("POSTGRES_HOST", "localhost")
+POSTGRES_PORT = os.environ.get("POSTGRES_PORT", "5432")
+POSTGRES_DB = os.environ.get("POSTGRES_DB", "postgres")
+SLACK_BOT_TOKEN = os.environ.get("SLACK_BOT_TOKEN")
+GEN_AI_API_KEY = os.environ.get("GEN_AI_API_KEY")
+METRICS_CHANNEL_ID = os.environ.get("METRICS_CHANNEL_ID")

+# Setup Logger
logger = setup_logger()


def get_engine():
-    POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres"
-    POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD") or "password"
-    POSTGRES_HOST = os.environ.get("POSTGRES_HOST") or "localhost"
-    POSTGRES_PORT = os.environ.get("POSTGRES_PORT") or "5432"
-    POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres"

+    """Create and return a SQLAlchemy engine."""
    engine = create_engine(
        f"postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
    )

    return engine


+def execute_numerical_query(engine, query):
+    """Execute a SQL query and return the resulting number."""
+    with engine.connect() as connection:
+        result = connection.execute(text(query.replace("\n", "")))
+        return result.scalar()

def get_counts():
-    """Fetches counts based on the specified period from the global queries dictionary."""
+    """Fetch usage counts based on the specified period from the global queries dictionary."""
    results = {"medium": [], "time_period": [], "count": []}

    engine = get_engine()

-    with engine.connect() as connection:
-        for period in USAGE_QUERIES.keys():
-            for key, query in USAGE_QUERIES[period].items():
-                result = connection.execute(text(query.replace("\n", "")))
-                results["count"].append(result.scalar())
-                results["medium"].append(key)
-                results["time_period"].append(period)
+    for period in USAGE_QUERIES.keys():
+        for key, query in USAGE_QUERIES[period].items():
+            count = execute_numerical_query(engine, query)
+            results["count"].append(count)
+            results["medium"].append(key)
+            results["time_period"].append(period)

    return pd.DataFrame(results)


def get_last_week_counts(df):
-    """Take a DataFrame and returns a dictionary of counts ofr users
-    from the last 7 days across Slack, Web and unique users"""
+    """Return counts for the last 7 days across different mediums."""
    last_week_count = {}
    for medium in MEDIUMS:
-        count = df.query(f"time_period =='last_7_days' and medium == '{medium}'")[
+        count = df.query(f"time_period == 'last_7_days' and medium == '{medium}'")[
            "count"
        ].iloc[0]
        last_week_count[medium] = count
    return last_week_count


def save_bar_plot(df, filename):
+    """Save a bar plot of the data and return the filename."""
    fig = px.bar(df, x="medium", y="count", color="time_period", barmode="group")
    fig.write_image(file=filename, format="jpg")
    return filename
-def upload_to_slack_and_delete(filename, channel_id):
-    slack_client = WebClient(token=os.environ.get("SLACK_BOT_TOKEN"))
+def upload_file_to_slack(filename, channel_id, title, delete_after_upload=False):
+    """Upload a file to Slack and optionally delete it locally."""
+    slack_client = WebClient(token=SLACK_BOT_TOKEN)
    size = os.stat(filename).st_size
    response = slack_client.files_getUploadURLExternal(filename=filename, length=size)
-    upload_url = response.data["upload_url"]
-    file_id = response.data["file_id"]
-    post_response = requests.post(url=upload_url, data=open(filename, "rb"))
+    upload_url = response["upload_url"]
+    file_id = response["file_id"]

+    with open(filename, "rb") as file:
+        post_response = requests.post(url=upload_url, data=file)

    if post_response.status_code == 200:
-        upload_response = slack_client.files_completeUploadExternal(
-            files=[{"id": file_id, "title": "Metrics graph"}], channel_id=channel_id
+        slack_client.files_completeUploadExternal(
+            files=[{"id": file_id, "title": title}], channel_id=channel_id
        )
-        # Clean up
-        os.remove(filename)
-        return upload_response.status_code
+        if delete_after_upload:
+            os.remove(filename)
+        return 200
    else:
        logger.error(f"Failed to upload {filename} to Slack.")
        return post_response.status_code


+def categorize(text):
+    """Categorize the given text based on predefined categories."""
+    categories = ["PROJECTS", "POLICIES", "RESOURCES", "TEAMS", "MISCELLANEOUS"]
+    regex_pattern = r"\b(" + "|".join(categories) + r")\b"
+    match = re.search(regex_pattern, text, re.IGNORECASE)
+    return match.group(1).upper() if match else "MISCELLANEOUS"


-def classify_initial_queries():
+def gather_and_combine_data():
"""Gather past week's data, concatenate with existing data, and dispatch as a CSV."""
engine = get_engine()
with engine.connect() as connection:
df = pd.read_sql_query(INITIAL_MESSAGES_QUERY, connection)
logger.info("Initial queries recieved")
client = OpenAI(api_key=os.environ.get("GEN_AI_API_KEY"))
label_series = df["initial_query"].map(lambda x: label_question(x, client))
logger.info("Labelling complete")
tally_json = json.loads(label_series.value_counts().to_json())
classifications = ""
total_initial_queries = sum(tally_json.values())
for k, v in tally_json.items():
percentage = v / total_initial_queries * 100
classifications += f"{k}: {v} queries ({percentage:.1f}%)\n"
return classifications
logger.info("Initial queries received")

# Fill missing user IDs with 'SLACK'
df["user_id"] = df["user_id"].fillna("SLACK")
clean_weekly = df.drop_duplicates(subset="id").copy()
clean_weekly["time_sent"] = clean_weekly["time_sent"].dt.date

# Combine with historic data
overlap_ids = clean_weekly["id"]
full_df = pd.read_csv(CSV_PATH)
clean_all_time_df = full_df[~full_df["id"].isin(overlap_ids)]
combined_df = (
pd.concat([clean_all_time_df, clean_weekly])
.sort_values(by="time_sent")
.reset_index(drop=True)
)
combined_df.to_csv(CSV_PATH, index=False)
logger.info("Combined with historic data and saved to CSV")

return clean_weekly


def classify_initial_queries(clean_weekly):
"""Classify the initial queries and prepare a summary."""
# Label data using OpenAI
client = OpenAI(api_key=GEN_AI_API_KEY)
clean_weekly["labels"] = clean_weekly["message"].apply(
lambda x: label_question(x, client)
)
clean_weekly["labels"] = clean_weekly["labels"].apply(categorize)
logger.info("Labelling complete")

# Prepare classification summary
tally_json = json.loads(clean_weekly["labels"].value_counts().to_json())
total_initial_queries = sum(tally_json.values())
classifications = "\n".join(
f"{k}: {v} queries ({v / total_initial_queries * 100:.1f}%)"
for k, v in tally_json.items()
)
return classifications


def create_message(last_week_count, classifications):
-    message = (
+    """Create a summary message to send to Slack."""
+    return (
        f"Hello Users!\n\n"
        f"Here are some updates from HubGPT regarding the last 7 days:\n"
-        f"- {last_week_count['slack_messages']}: Slack messages in the last 7 days.\n"
-        f"- {last_week_count['web_messages']}: Web App messages in the last 7 days.\n"
-        f"- {last_week_count['distinct_web_users']}: Unique users on the Web App.\n"
+        f"- {last_week_count.get('slack_messages', 0)} Slack messages in the last 7 days.\n"
+        f"- {last_week_count.get('web_messages', 0)} Web App messages in the last 7 days.\n"
+        f"- {last_week_count.get('distinct_web_users', 0)} Unique users on the Web App.\n"
        "Usage breakdown:\n"
        f"{classifications}"
    )
-    return message


-def send_message(user_id, message):
-    SLACK_BOT_TOKEN = os.environ.get("SLACK_BOT_TOKEN")
-    if not SLACK_BOT_TOKEN:
-        logger.debug(
-            "Slack OAuth token not provided. Check env prod template for guidance"
-        )
-        return None
-    logger.info("Initializing Slack client")
-
-    slack_client = WebClient(token=SLACK_BOT_TOKEN)
-
-    logger.info("Sending Slack message")
-    # Send a message to the user
-    slack_client.chat_postMessage(channel=user_id, text=message)
-    logger.info("Message sent")
-    return None
+def send_message(channel_id, message):
+    """Send a message to the specified Slack channel."""
+    try:
+        slack_client = WebClient(token=SLACK_BOT_TOKEN)
+        slack_client.chat_postMessage(channel=channel_id, text=message)
+        logger.info("Message sent to Slack channel")
+    except Exception as e:
+        logger.error(f"Failed to send message to Slack channel {channel_id}: {e}")

def send_usage_report_to_slack(channel_id):
+    """Generate and send the usage report to Slack."""
    counts_df = get_counts()
-    classifications = classify_initial_queries()
-
+    clean_weekly = gather_and_combine_data()
+    classifications = classify_initial_queries(clean_weekly)
    last_week_counts = get_last_week_counts(counts_df)

-    file = save_bar_plot(counts_df, "metrics.jpg")
-
+    plot_filename = save_bar_plot(counts_df, "metrics.jpg")
    message = create_message(last_week_counts, classifications)

    send_message(channel_id, message)
-    upload_status = upload_to_slack_and_delete(file, channel_id)
-
-    return upload_status
+    upload_file_to_slack(
+        plot_filename, channel_id, "Metrics graph", delete_after_upload=True
+    )
+    upload_file_to_slack(CSV_PATH, channel_id, "Historic data")


if __name__ == "__main__":
    try:
-        CHANNEL_ID = os.environ.get("METRICS_CHANNEL_ID")
-        if CHANNEL_ID:
+        if METRICS_CHANNEL_ID:
            logger.info("Starting Slack usage report")
-            send_usage_report_to_slack(CHANNEL_ID)
+            send_usage_report_to_slack(METRICS_CHANNEL_ID)
        else:
-            logger.warning("Slack Metrics Channel ID token not provided.")
-            logger.warning("Check env prod template for guidance.")
+            logger.warning(
+                "Slack Metrics Channel ID token not provided. Check env prod template for guidance."
+            )
    except Exception as e:
-        logger.exception("An error occurred while sending usage report to Slack: %s", e)
+        logger.exception(
+            "An error occurred while sending usage report to Slack", exc_info=e
+        )
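
The script reads `SLACK_BOT_TOKEN`, `GEN_AI_API_KEY`, and `METRICS_CHANNEL_ID` at module import time, so configuration has to be in place before the import. A minimal sketch of driving the report from another script (placeholder values; assumes the module is importable as `send_slack_report`):

```python
import os

# Placeholders: set real values before importing, since the module reads
# these environment variables at import time.
os.environ.setdefault("SLACK_BOT_TOKEN", "xoxb-...")
os.environ.setdefault("GEN_AI_API_KEY", "sk-...")
os.environ.setdefault("METRICS_CHANNEL_ID", "C0123456789")

from send_slack_report import send_usage_report_to_slack

send_usage_report_to_slack(os.environ["METRICS_CHANNEL_ID"])
```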