Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add /verifyrun cog #36

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/discord-cluster-manager/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from cogs.modal_cog import ModalCog
from cogs.github_cog import GitHubCog
from cogs.verify_run_cog import VerifyRunCog

logger = setup_logging()

Expand All @@ -38,6 +39,7 @@ async def setup_hook(self):
await self.add_cog(ModalCog(self))
await self.add_cog(GitHubCog(self))
await self.add_cog(BotManagerCog(self))
await self.add_cog(VerifyRunCog(self))

guild_id = (
DISCORD_CLUSTER_STAGING_ID
Expand Down
103 changes: 103 additions & 0 deletions src/discord-cluster-manager/cogs/verify_run_cog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import discord
from discord import app_commands
from discord.ext import commands
import re
from utils import setup_logging

# Logger obtained from the project's setup_logging helper (imported from utils).
logger = setup_logging()

class VerifyRunCog(commands.Cog):
"""
A Discord cog for verifying the success of training runs.

This cog provides functionality to verify that either a GitHub Actions or
Modal run completed successfully by checking for specific message patterns
in a Discord thread. It supports verification of two types of runs:
1. GitHub Actions runs - Identified by "GitHub Action triggered!" message
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this seems to test a trigger, but we've had issues where this wouldn't have helped — for example, when we had timeout issues with both the NVIDIA and AMD runners.

2. Modal runs - Identified by "Running on Modal..." message

Commands:
/verifyrun: Verifies the success of a run in the current thread. Can
only be used in a thread. Automatically detects the run type and
applies appropriate verification.
"""

def __init__(self, bot):
    """Keep a reference to the bot instance passed in by the caller."""
    self.bot = bot

async def verify_github_run(self, interaction: discord.Interaction, message_contents: list[str]):
    """Verify that a GitHub Actions run completed successfully.

    Every required pattern must match at least one entry of
    ``message_contents``. Matching uses ``re.match`` with ``re.DOTALL``, so a
    pattern is anchored at the start of a message and ``.`` also matches
    newlines. The outcome is sent as the response to ``interaction``: a
    success message when nothing is missing, otherwise a failure message
    listing each pattern that no message matched.

    Args:
        interaction: The interaction whose response carries the result.
        message_contents: Text contents of the messages to inspect.
    """
    required_patterns = [
        "Processing `.*` with",
        "GitHub Action triggered! Run ID:",
        "Training completed with status: success",
        ".*```\nLogs.*:",
        "View the full run at:",
    ]

    # Scan the messages once; the same list drives both the verdict and the
    # failure report, instead of re-running every regex a second time.
    missing_patterns = [
        pattern
        for pattern in required_patterns
        if not any(
            re.match(pattern, content, re.DOTALL) is not None
            for content in message_contents
        )
    ]

    if not missing_patterns:
        await interaction.response.send_message(
            "✅ All expected messages found - run completed successfully!")
        return

    await interaction.response.send_message(
        "❌ Run verification failed. Missing expected messages:\n" +
        "\n".join(f"- {pattern}" for pattern in missing_patterns)
    )

async def verify_modal_run(self, interaction: discord.Interaction, message_contents: list[str]):
    """Verify that a Modal run completed successfully.

    Every required pattern must match at least one entry of
    ``message_contents``. Matching uses ``re.match`` with ``re.DOTALL``, so a
    pattern is anchored at the start of a message and ``.`` also matches
    newlines. The outcome is sent as the response to ``interaction``: a
    success message when nothing is missing, otherwise a failure message
    listing each pattern that no message matched.

    Args:
        interaction: The interaction whose response carries the result.
        message_contents: Text contents of the messages to inspect.
    """
    # NOTE: these are regular expressions, so the dots in
    # "Running on Modal..." match any character, not only a literal ".".
    required_patterns = [
        "Processing `.*` with",
        "Running on Modal...",
        ".*```\nModal execution result:",
    ]

    # Scan the messages once; the same list drives both the verdict and the
    # failure report, instead of re-running every regex a second time.
    missing_patterns = [
        pattern
        for pattern in required_patterns
        if not any(
            re.match(pattern, content, re.DOTALL) is not None
            for content in message_contents
        )
    ]

    if not missing_patterns:
        await interaction.response.send_message("✅ All expected messages found - Modal run completed successfully!")
        return

    await interaction.response.send_message(
        "❌ Modal run verification failed. Missing expected messages:\n" +
        "\n".join(f"- {pattern}" for pattern in missing_patterns)
    )

@app_commands.command(name='verifyrun')
async def verify_run(self, interaction: discord.Interaction):
    """Verify that a run in the current thread completed successfully"""
    # The docstring above is left untouched on purpose: discord.py may use it
    # as the slash-command description, so it is effectively runtime text.

    # Guard: the command only makes sense inside a thread.
    thread = interaction.channel
    if not isinstance(thread, discord.Thread):
        await interaction.response.send_message("This command can only be used in a thread!")
        return

    # Collect the text of every message in the thread (no limit).
    # NOTE(review): on long threads this fetch may take a while before the
    # first response is sent — confirm it stays within Discord's interaction
    # response deadline, or consider deferring the response.
    thread_texts = []
    async for message in thread.history(limit=None):
        thread_texts.append(message.content)

    # A GitHub Actions marker takes precedence over a Modal marker.
    if any("GitHub Action triggered!" in text for text in thread_texts):
        await self.verify_github_run(interaction, thread_texts)
        return

    if any("Running on Modal..." in text for text in thread_texts):
        await self.verify_modal_run(interaction, thread_texts)
        return

    # Neither marker was present in any message.
    await interaction.response.send_message("❌ Could not determine run type!")
Loading