diff --git a/backend/scripts/hubgpt_eval_automation.py b/backend/scripts/hubgpt_eval_automation.py
index 446be261445..3b32a247fcc 100644
--- a/backend/scripts/hubgpt_eval_automation.py
+++ b/backend/scripts/hubgpt_eval_automation.py
@@ -84,6 +84,7 @@ def upload_to_slack(filename, channel_id):
     for num, query in enumerate(queries_list):
         print(f"Query {num+1}/{len(queries_list)}: {query}")
         response = process_question(
+            # Change to the staging URL when testing against staging
             danswer_url="https://hubgpt.idinsight.io", question=query, api_key=None
         )
         responses.append(response)
diff --git a/backend/scripts/send_slack_report/queries.py b/backend/scripts/send_slack_report/queries.py
index f25b7b15e3f..4a761ac85be 100644
--- a/backend/scripts/send_slack_report/queries.py
+++ b/backend/scripts/send_slack_report/queries.py
@@ -73,12 +73,17 @@
     },
 }
 INITIAL_MESSAGES_QUERY = """
-    SELECT message as initial_query FROM (
-        SELECT *,
-        ROW_NUMBER() OVER (PARTITION BY chat_session_id ORDER BY time_sent ASC) as rn
-        FROM chat_message
-        WHERE (time_sent >= (NOW() AT TIME ZONE 'UTC') - INTERVAL '7 days')
-        AND (message_type = 'USER')
-    ) sub
-    WHERE sub.rn = 1
-    ORDER BY sub.time_sent ASC;"""
+WITH subquery AS (
+    SELECT cm.time_sent, cs.user_id, cm.message, cm.id, cf.is_positive, cf.feedback_text,
+           ROW_NUMBER() OVER (PARTITION BY cm.chat_session_id ORDER BY cm.time_sent ASC) AS rn
+    FROM chat_message cm
+    LEFT JOIN chat_session cs ON cs.id = cm.chat_session_id
+    LEFT JOIN chat_feedback cf ON cf.chat_message_id = cm.id
+    WHERE cm.time_sent >= (NOW() AT TIME ZONE 'UTC') - INTERVAL '7 days'
+      AND cm.message_type = 'USER'
+)
+SELECT time_sent, user_id, message, id, is_positive, feedback_text
+FROM subquery
+WHERE rn = 1
+ORDER BY time_sent ASC;
+"""
diff --git a/backend/scripts/send_slack_report/send_slack_report.py b/backend/scripts/send_slack_report/send_slack_report.py
index f08be3c1959..a663484abf5 100755
--- a/backend/scripts/send_slack_report/send_slack_report.py
+++ b/backend/scripts/send_slack_report/send_slack_report.py
@@ -1,5 +1,6 @@
 import json
 import os
+import re
 
 import pandas as pd
 import plotly.express as px
@@ -15,47 +16,56 @@
 from danswer.utils.logger import setup_logger
 
-
+# Global Variables and Paths
+CSV_PATH = "/app/scripts/send_slack_report/all_data.csv"
+POSTGRES_USER = os.environ.get("POSTGRES_USER", "postgres")
+POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD", "password")
+POSTGRES_HOST = os.environ.get("POSTGRES_HOST", "localhost")
+POSTGRES_PORT = os.environ.get("POSTGRES_PORT", "5432")
+POSTGRES_DB = os.environ.get("POSTGRES_DB", "postgres")
+SLACK_BOT_TOKEN = os.environ.get("SLACK_BOT_TOKEN")
+GEN_AI_API_KEY = os.environ.get("GEN_AI_API_KEY")
+METRICS_CHANNEL_ID = os.environ.get("METRICS_CHANNEL_ID")
+
+# Setup Logger
 logger = setup_logger()
 
 
 def get_engine():
-    POSTGRES_USER = os.environ.get("POSTGRES_USER") or "postgres"
-    POSTGRES_PASSWORD = os.environ.get("POSTGRES_PASSWORD") or "password"
-    POSTGRES_HOST = os.environ.get("POSTGRES_HOST") or "localhost"
-    POSTGRES_PORT = os.environ.get("POSTGRES_PORT") or "5432"
-    POSTGRES_DB = os.environ.get("POSTGRES_DB") or "postgres"
-
+    """Create and return a SQLAlchemy engine."""
     engine = create_engine(
         f"postgresql://{POSTGRES_USER}:{POSTGRES_PASSWORD}@{POSTGRES_HOST}:{POSTGRES_PORT}/{POSTGRES_DB}"
     )
-
     return engine
 
 
+def execute_numerical_query(engine, query):
+    """Execute a SQL query and return the resulting number."""
+    with engine.connect() as connection:
+        result = connection.execute(text(query.replace("\n", " ")))  # space, not "", so tokens at line ends don't fuse
+        return result.scalar()
+
+
 def get_counts():
-    """Fetches counts based on the specified period from the global queries dictionary."""
+    """Fetch usage counts based on the specified period from the global queries dictionary."""
     results = {"medium": [], "time_period": [], "count": []}
-
     engine = get_engine()
-    with engine.connect() as connection:
-        for period in USAGE_QUERIES.keys():
-            for key, query in USAGE_QUERIES[period].items():
-                result = connection.execute(text(query.replace("\n", "")))
-                results["count"].append(result.scalar())
-                results["medium"].append(key)
-                results["time_period"].append(period)
+    for period in USAGE_QUERIES.keys():
+        for key, query in USAGE_QUERIES[period].items():
+            count = execute_numerical_query(engine, query)
+            results["count"].append(count)
+            results["medium"].append(key)
+            results["time_period"].append(period)
     return pd.DataFrame(results)
 
 
 def get_last_week_counts(df):
-    """Take a DataFrame and returns a dictionary of counts ofr users
-    from the last 7 days across Slack, Web and unique users"""
+    """Return counts for the last 7 days across different mediums."""
     last_week_count = {}
     for medium in MEDIUMS:
-        count = df.query(f"time_period =='last_7_days' and medium == '{medium}'")[
+        count = df.query(f"time_period == 'last_7_days' and medium == '{medium}'")[
             "count"
         ].iloc[0]
         last_week_count[medium] = count
@@ -63,99 +73,140 @@ def get_last_week_counts(df):
 
 
 def save_bar_plot(df, filename):
+    """Save a bar plot of the data and return the filename."""
     fig = px.bar(df, x="medium", y="count", color="time_period", barmode="group")
     fig.write_image(file=filename, format="jpg")
     return filename
 
 
-def upload_to_slack_and_delete(filename, channel_id):
-    slack_client = WebClient(token=os.environ.get("SLACK_BOT_TOKEN"))
+def upload_file_to_slack(filename, channel_id, title, delete_after_upload=False):
+    """Upload a file to Slack and optionally delete it locally."""
+    slack_client = WebClient(token=SLACK_BOT_TOKEN)
     size = os.stat(filename).st_size
     response = slack_client.files_getUploadURLExternal(filename=filename, length=size)
-    upload_url = response.data["upload_url"]
-    file_id = response.data["file_id"]
-    post_response = requests.post(url=upload_url, data=open(filename, "rb"))
+    upload_url = response["upload_url"]
+    file_id = response["file_id"]
+
+    with open(filename, "rb") as file:
+        post_response = requests.post(url=upload_url, data=file)
+
     if post_response.status_code == 200:
-        upload_response = slack_client.files_completeUploadExternal(
-            files=[{"id": file_id, "title": "Metrics graph"}], channel_id=channel_id
+        slack_client.files_completeUploadExternal(
+            files=[{"id": file_id, "title": title}], channel_id=channel_id
         )
-    # Clean up
-    os.remove(filename)
-    return upload_response.status_code
+        if delete_after_upload:
+            os.remove(filename)
+        return 200
+    else:
+        logger.error(f"Failed to upload {filename} to Slack.")
+        return post_response.status_code
+
+
+def categorize(text):
+    """Categorize the given text based on predefined categories."""
+    categories = ["PROJECTS", "POLICIES", "RESOURCES", "TEAMS", "MISCELLANEOUS"]
+    regex_pattern = r"\b(" + "|".join(categories) + r")\b"
+    match = re.search(regex_pattern, text, re.IGNORECASE)
+    return match.group(1).upper() if match else "MISCELLANEOUS"
 
-def classify_initial_queries():
+
+def gather_and_combine_data():
+    """Gather the past week's data, merge it with the historic data, and save the result as a CSV."""
     engine = get_engine()
     with engine.connect() as connection:
         df = pd.read_sql_query(INITIAL_MESSAGES_QUERY, connection)
-    logger.info("Initial queries recieved")
recieved") - client = OpenAI(api_key=os.environ.get("GEN_AI_API_KEY")) - label_series = df["initial_query"].map(lambda x: label_question(x, client)) - logger.info("Labelling complete") - tally_json = json.loads(label_series.value_counts().to_json()) - classifications = "" - total_initial_queries = sum(tally_json.values()) - for k, v in tally_json.items(): - percentage = v / total_initial_queries * 100 - classifications += f"{k}: {v} queries ({percentage:.1f}%)\n" - return classifications + logger.info("Initial queries received") + + # Fill missing user IDs with 'SLACK' + df["user_id"] = df["user_id"].fillna("SLACK") + clean_weekly = df.drop_duplicates(subset="id").copy() + clean_weekly["time_sent"] = clean_weekly["time_sent"].dt.date + + # Combine with historic data + overlap_ids = clean_weekly["id"] + full_df = pd.read_csv(CSV_PATH) + clean_all_time_df = full_df[~full_df["id"].isin(overlap_ids)] + combined_df = ( + pd.concat([clean_all_time_df, clean_weekly]) + .sort_values(by="time_sent") + .reset_index(drop=True) + ) + combined_df.to_csv(CSV_PATH, index=False) + logger.info("Combined with historic data and saved to CSV") + + return clean_weekly + + +def classify_initial_queries(clean_weekly): + """Classify the initial queries and prepare a summary.""" + # Label data using OpenAI + client = OpenAI(api_key=GEN_AI_API_KEY) + clean_weekly["labels"] = clean_weekly["message"].apply( + lambda x: label_question(x, client) + ) + clean_weekly["labels"] = clean_weekly["labels"].apply(categorize) + logger.info("Labelling complete") + + # Prepare classification summary + tally_json = json.loads(clean_weekly["labels"].value_counts().to_json()) + total_initial_queries = sum(tally_json.values()) + classifications = "\n".join( + f"{k}: {v} queries ({v / total_initial_queries * 100:.1f}%)" + for k, v in tally_json.items() + ) + return classifications def create_message(last_week_count, classifications): - message = ( + """Create a summary message to send to Slack.""" + return ( f"Hello Users!\n\n" f"Here are some updates from HubGPT regarding the last 7 days:\n" - f"- {last_week_count['slack_messages']}: Slack messages in the last 7 days.\n" - f"- {last_week_count['web_messages']}: Web App messages in the last 7 days.\n" - f"- {last_week_count['distinct_web_users']}: Unique users on the Web App.\n" + f"- {last_week_count.get('slack_messages', 0)} Slack messages in the last 7 days.\n" + f"- {last_week_count.get('web_messages', 0)} Web App messages in the last 7 days.\n" + f"- {last_week_count.get('distinct_web_users', 0)} Unique users on the Web App.\n" "Usage breakdown:\n" f"{classifications}" ) - return message -def send_message(user_id, message): - SLACK_BOT_TOKEN = os.environ.get("SLACK_BOT_TOKEN") - if not SLACK_BOT_TOKEN: - logger.debug( - "Slack OAuth token not provided. 
-        )
-        return None
-    logger.info("Initializing Slack client")
-
-    slack_client = WebClient(token=SLACK_BOT_TOKEN)
-
-    logger.info("Sending Slack message")
-    # Send a message to the user
-    slack_client.chat_postMessage(channel=user_id, text=message)
-    logger.info("Message sent")
-    return None
+def send_message(channel_id, message):
+    """Send a message to the specified Slack channel."""
+    try:
+        slack_client = WebClient(token=SLACK_BOT_TOKEN)
+        slack_client.chat_postMessage(channel=channel_id, text=message)
+        logger.info("Message sent to Slack channel")
+    except Exception as e:
+        logger.error(f"Failed to send message to Slack channel {channel_id}: {e}")
 
 
 def send_usage_report_to_slack(channel_id):
+    """Generate and send the usage report to Slack."""
     counts_df = get_counts()
-    classifications = classify_initial_queries()
-
+    clean_weekly = gather_and_combine_data()
+    classifications = classify_initial_queries(clean_weekly)
    last_week_counts = get_last_week_counts(counts_df)
-
-    file = save_bar_plot(counts_df, "metrics.jpg")
-
+    plot_filename = save_bar_plot(counts_df, "metrics.jpg")
     message = create_message(last_week_counts, classifications)
     send_message(channel_id, message)
-    upload_status = upload_to_slack_and_delete(file, channel_id)
-    return upload_status
+    upload_file_to_slack(
+        plot_filename, channel_id, "Metrics graph", delete_after_upload=True
+    )
+    upload_file_to_slack(CSV_PATH, channel_id, "Historic data")
 
 
 if __name__ == "__main__":
     try:
-        CHANNEL_ID = os.environ.get("METRICS_CHANNEL_ID")
-        if CHANNEL_ID:
+        if METRICS_CHANNEL_ID:
             logger.info("Starting Slack usage report")
-            send_usage_report_to_slack(CHANNEL_ID)
+            send_usage_report_to_slack(METRICS_CHANNEL_ID)
         else:
-            logger.warning("Slack Metrics Channel ID token not provided.")
-            logger.warning("Check env prod template for guidance.")
+            logger.warning(
+                "Slack Metrics Channel ID token not provided. Check env prod template for guidance."
+            )
     except Exception as e:
-        logger.exception("An error occurred while sending usage report to Slack: %s", e)
+        logger.exception(
+            "An error occurred while sending usage report to Slack", exc_info=e
+        )
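
Review note on the rewritten INITIAL_MESSAGES_QUERY (the sketches below are reviewer aids, not part of the patch): ROW_NUMBER() restarts at 1 for every chat_session_id, so the outer `WHERE rn = 1` keeps exactly the first USER message of each session from the last 7 days, and the LEFT JOINs preserve messages whose session or feedback rows are missing. A minimal standalone check, assuming the same POSTGRES_* defaults as the script; the import path is hypothetical:

```python
# Hypothetical standalone check for INITIAL_MESSAGES_QUERY.
import os

import pandas as pd
from sqlalchemy import create_engine

from queries import INITIAL_MESSAGES_QUERY  # assumed import path

user = os.environ.get("POSTGRES_USER", "postgres")
password = os.environ.get("POSTGRES_PASSWORD", "password")
host = os.environ.get("POSTGRES_HOST", "localhost")
port = os.environ.get("POSTGRES_PORT", "5432")
db = os.environ.get("POSTGRES_DB", "postgres")

engine = create_engine(f"postgresql://{user}:{password}@{host}:{port}/{db}")
with engine.connect() as connection:
    df = pd.read_sql_query(INITIAL_MESSAGES_QUERY, connection)

# One row per chat session, each holding that session's first USER message.
assert df["id"].is_unique
print(df.head())
```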
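
Review note on `categorize`: since `label_question` returns free-form model text, the regex pins the output to one of the five fixed labels and falls back to MISCELLANEOUS. A quick spot-check (self-contained copy of the function as it appears in the diff):

```python
import re


def categorize(text):
    """Map free-form label text onto one of the fixed categories."""
    categories = ["PROJECTS", "POLICIES", "RESOURCES", "TEAMS", "MISCELLANEOUS"]
    regex_pattern = r"\b(" + "|".join(categories) + r")\b"
    match = re.search(regex_pattern, text, re.IGNORECASE)
    return match.group(1).upper() if match else "MISCELLANEOUS"


assert categorize("Label: policies") == "POLICIES"             # case-insensitive
assert categorize("TEAMS and PROJECTS") == "TEAMS"             # leftmost match wins
assert categorize("no recognizable label") == "MISCELLANEOUS"  # fallback
```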
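
Review note on the summary string built in `classify_initial_queries`: it is a percentage breakdown of the label tallies. With hypothetical counts, the arithmetic works out as follows:

```python
import json

import pandas as pd

# Hypothetical labels, standing in for clean_weekly["labels"].
labels = pd.Series(["PROJECTS", "PROJECTS", "POLICIES", "MISCELLANEOUS"])

tally_json = json.loads(labels.value_counts().to_json())  # {"PROJECTS": 2, ...}
total_initial_queries = sum(tally_json.values())          # 4
classifications = "\n".join(
    f"{k}: {v} queries ({v / total_initial_queries * 100:.1f}%)"
    for k, v in tally_json.items()
)
print(classifications)
# PROJECTS: 2 queries (50.0%)
# POLICIES: 1 queries (25.0%)
# MISCELLANEOUS: 1 queries (25.0%)
```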
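
One behavioural consequence of hoisting the `os.environ.get` calls to module level is that configuration is now read at import time, so a test harness must set the variables before importing the module. A hypothetical local smoke test (all values are placeholders, and the module name is assumed):

```python
# Hypothetical smoke test: set env vars BEFORE importing the module,
# because SLACK_BOT_TOKEN etc. are read at import time.
import os

os.environ["SLACK_BOT_TOKEN"] = "xoxb-placeholder"  # needs chat:write, files:write
os.environ["METRICS_CHANNEL_ID"] = "C0123456789"    # a test channel, not the real one
os.environ["GEN_AI_API_KEY"] = "sk-placeholder"

from send_slack_report import send_usage_report_to_slack  # assumed module name

send_usage_report_to_slack(os.environ["METRICS_CHANNEL_ID"])
```

Reading configuration once at import keeps the function signatures clean, at the cost of making the module import-order sensitive in tests; that trade-off seems worth flagging in review.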