Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add /verifyrun cog #36

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/discord-cluster-manager/bot.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
)
from cogs.modal_cog import ModalCog
from cogs.github_cog import GitHubCog
from cogs.verify_run_cog import VerifyRunCog

logger = setup_logging()

Expand All @@ -38,6 +39,7 @@ async def setup_hook(self):
await self.add_cog(ModalCog(self))
await self.add_cog(GitHubCog(self))
await self.add_cog(BotManagerCog(self))
await self.add_cog(VerifyRunCog(self))

guild_id = (
DISCORD_CLUSTER_STAGING_ID
Expand Down
12 changes: 9 additions & 3 deletions src/discord-cluster-manager/cogs/github_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ async def run_github(
interaction: discord.Interaction,
script: discord.Attachment,
gpu_type: app_commands.Choice[str],
use_followup: bool = False
):
if not script.filename.endswith(".py") and not script.filename.endswith(".cu"):
await interaction.response.send_message(
Expand All @@ -43,10 +44,15 @@ async def run_github(
return

thread = await self.bot.create_thread(interaction, gpu_type.name, "GitHub Job")
message = f"Created thread {thread.mention} for your GitHub job"

if use_followup:
if not interaction.response.is_done():
await interaction.response.defer()
await interaction.followup.send(message)
else:
await interaction.response.send_message(message)

await interaction.response.send_message(
f"Created thread {thread.mention} for your GitHub job"
)
await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...")

try:
Expand Down
12 changes: 9 additions & 3 deletions src/discord-cluster-manager/cogs/modal_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ async def run_modal(
interaction: discord.Interaction,
script: discord.Attachment,
gpu_type: app_commands.Choice[str],
use_followup: bool = False
):
if not script.filename.endswith(".py") and not script.filename.endswith(".cu"):
await interaction.response.send_message(
Expand All @@ -37,10 +38,15 @@ async def run_modal(
return

thread = await self.bot.create_thread(interaction, gpu_type.name, "Modal Job")
message = f"Created thread {thread.mention} for your Modal job"

if use_followup:
if not interaction.response.is_done():
await interaction.response.defer()
await interaction.followup.send(message)
else:
await interaction.response.send_message(message)

await interaction.response.send_message(
f"Created thread {thread.mention} for your Modal job"
)
await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...")

try:
Expand Down
154 changes: 154 additions & 0 deletions src/discord-cluster-manager/cogs/verify_run_cog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import discord
from discord import app_commands
from discord.ext import commands
import re
from utils import setup_logging
from unittest.mock import AsyncMock

logger = setup_logging()

def create_mock_attachment(filename: str, content: str):
    """Build an ``AsyncMock`` that stands in for a ``discord.Attachment``.

    Args:
        filename: Value exposed as the mock's ``filename`` attribute.
        content: Text that the mock's awaitable ``read()`` returns, encoded
            as UTF-8 bytes.

    Returns:
        An ``AsyncMock`` spec'd against ``discord.Attachment`` with
        ``filename``, ``content_type`` (``text/plain``) and ``read`` set.
    """
    payload = content.encode('utf-8')
    fake_attachment = AsyncMock(spec=discord.Attachment)
    fake_attachment.configure_mock(
        filename=filename,
        content_type='text/plain',
        # Awaiting read() yields the encoded script body.
        read=AsyncMock(return_value=payload),
    )
    return fake_attachment

class VerifyRunCog(commands.Cog):
"""
A Discord cog for verifying the success of training runs.

This cog provides functionality to verify that either a GitHub Actions or
Modal run completed successfully by checking for specific message patterns
in a Discord thread. It supports verification of two types of runs:
1. GitHub Actions runs - Identified by "GitHub Action triggered!" message
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So this seems to test the trigger, but we've had issues where this wouldn't have helped — for example, when we had timeout issues with both the NVIDIA and AMD runners.

2. Modal runs - Identified by "Running on Modal..." message

Commands:
/verifyrun: Verifies the success of a run in the current thread. Can
only be used in a thread. Automatically detects the run type and
applies appropriate verification.
"""

def __init__(self, bot):
    """Keep a reference to the bot this cog is attached to.

    Args:
        bot: The bot instance; stored as ``self.bot`` and used later to
            look up other cogs.
    """
    self.bot = bot

async def verify_github_run(self, interaction: discord.Interaction, message_contents: list[str]):
    """Verify that a GitHub Actions run completed successfully.

    Every required pattern must match at least one message. Patterns are
    applied with ``re.match`` and ``re.DOTALL``, so each one is anchored at
    the start of a message and ``.`` also matches newlines.

    Args:
        interaction: Interaction to answer; exactly one message is sent via
            ``interaction.response.send_message``.
        message_contents: Text content of the messages to inspect.
    """
    required_patterns = [
        "Processing `.*` with",
        "GitHub Action triggered! Run ID:",
        "Training completed with status: success",
        ".*```\nLogs.*:",
        "View the full run at:",
    ]

    # Scan once: the missing list drives both the verdict and the report,
    # so the messages are not re-scanned for the failure case.
    missing_patterns = [
        pattern
        for pattern in required_patterns
        if not any(
            re.match(pattern, content, re.DOTALL) is not None
            for content in message_contents
        )
    ]

    if not missing_patterns:
        await interaction.response.send_message(
            "✅ All expected messages found - run completed successfully!")
        return

    await interaction.response.send_message(
        "❌ Run verification failed. Missing expected messages:\n" +
        "\n".join(f"- {pattern}" for pattern in missing_patterns)
    )

async def verify_modal_run(self, interaction: discord.Interaction, message_contents: list[str]):
    """Verify that a Modal run completed successfully.

    Every required pattern must match at least one message. Patterns are
    applied with ``re.match`` and ``re.DOTALL``, so each one is anchored at
    the start of a message and ``.`` also matches newlines.

    Args:
        interaction: Interaction to answer; exactly one message is sent via
            ``interaction.response.send_message``.
        message_contents: Text content of the messages to inspect.
    """
    required_patterns = [
        "Processing `.*` with",
        "Running on Modal...",
        ".*```\nModal execution result:",
    ]

    # Scan once: the missing list drives both the verdict and the report,
    # so the messages are not re-scanned for the failure case.
    missing_patterns = [
        pattern
        for pattern in required_patterns
        if not any(
            re.match(pattern, content, re.DOTALL) is not None
            for content in message_contents
        )
    ]

    if not missing_patterns:
        await interaction.response.send_message("✅ All expected messages found - Modal run completed successfully!")
        return

    await interaction.response.send_message(
        "❌ Modal run verification failed. Missing expected messages:\n" +
        "\n".join(f"- {pattern}" for pattern in missing_patterns)
    )

@app_commands.command(name='verifyrun')
async def verify_run(self, interaction: discord.Interaction):
    """Verify that a run in the current thread completed successfully"""
    # NOTE: discord.py uses the docstring above as the slash-command
    # description by default, so it is kept as a single short line.

    thread = interaction.channel
    if not isinstance(thread, discord.Thread):
        await interaction.response.send_message("This command can only be used in a thread!")
        return

    # Collect the text of every message in the thread (no history limit).
    message_contents = []
    async for msg in thread.history(limit=None):
        message_contents.append(msg.content)

    def seen(marker):
        """Return True if any collected message contains ``marker``."""
        return any(marker in text for text in message_contents)

    # The GitHub marker is checked first; the Modal marker only if it is absent.
    if seen("GitHub Action triggered!"):
        await self.verify_github_run(interaction, message_contents)
        return

    if seen("Running on Modal..."):
        await self.verify_modal_run(interaction, message_contents)
        return

    await interaction.response.send_message("❌ Could not determine run type!")

@app_commands.command(name='verifyrun2')
async def verify_run2(self, interaction: discord.Interaction):
    """Verify runs on Modal, GitHub Nvidia, and GitHub AMD."""
    # NOTE: discord.py uses the docstring above as the slash-command
    # description by default, so it is kept as a single short line.

    # Acknowledge the interaction up front. Every reply below goes through
    # interaction.followup, which needs the interaction to have been
    # responded to or deferred first. That includes the "cogs not found"
    # and error paths, which previously ran before any response existed.
    if not interaction.response.is_done():
        await interaction.response.defer()

    try:
        # Get instances of the other cogs
        modal_cog = self.bot.get_cog('ModalCog')
        github_cog = self.bot.get_cog('GitHubCog')

        if modal_cog is None or github_cog is None:
            await interaction.followup.send("❌ Required cogs not found!")
            return

        # A tiny in-memory script stands in for a user-uploaded attachment.
        script_content = "print('Hello, world!')"
        script_file = create_mock_attachment("test_script.py", script_content)

        t4 = app_commands.Choice(name="NVIDIA T4", value="t4")
        nvidia = app_commands.Choice(name="NVIDIA", value="nvidia")
        amd = app_commands.Choice(name="AMD", value="amd")

        # Invoke the other commands' raw callbacks directly.
        # use_followup=True makes them reply via interaction.followup,
        # since this interaction has already been acknowledged.
        modal_command = modal_cog.run_modal
        await modal_command.callback(modal_cog, interaction, script_file, t4, use_followup=True)

        github_command = github_cog.run_github
        await github_command.callback(github_cog, interaction, script_file, nvidia, use_followup=True)
        await github_command.callback(github_cog, interaction, script_file, amd, use_followup=True)

        await interaction.followup.send(
            "✅ Started all verification runs:\n"
            "- Modal run\n"
            "- GitHub Nvidia run\n"
            "- GitHub AMD run"
        )

    except Exception as e:
        # Top-level boundary for the command: log with traceback and tell
        # the user, rather than letting the interaction hang silently.
        logger.error(f"Error starting verification runs: {e}", exc_info=True)
        await interaction.followup.send(
            f"❌ Error starting verification runs: {str(e)}"
        )
Loading