From 96627de043b79d591208c6bb50493db97d19087e Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 4 Jun 2025 14:50:22 -0400 Subject: [PATCH 1/7] create baselines --- examples/matmul_py/pytorch_ref.py | 8 ++ examples/matmul_py/task.yml | 4 + src/discord-cluster-manager/api/utils.py | 2 +- .../cogs/leaderboard_cog.py | 118 +++++++++++++----- src/discord-cluster-manager/consts.py | 1 + src/discord-cluster-manager/run_eval.py | 4 +- src/discord-cluster-manager/submission.py | 17 +-- src/discord-cluster-manager/task.py | 1 + src/discord-cluster-manager/utils.py | 1 + 9 files changed, 115 insertions(+), 41 deletions(-) create mode 100644 examples/matmul_py/pytorch_ref.py diff --git a/examples/matmul_py/pytorch_ref.py b/examples/matmul_py/pytorch_ref.py new file mode 100644 index 00000000..3d92d2a8 --- /dev/null +++ b/examples/matmul_py/pytorch_ref.py @@ -0,0 +1,8 @@ +#!POPCORN leaderboard matmul_py + +from task import input_t, output_t + + +def custom_kernel(data: input_t) -> output_t: + a, b = data + return a @ b diff --git a/examples/matmul_py/task.yml b/examples/matmul_py/task.yml index 6bb74d69..752391ee 100644 --- a/examples/matmul_py/task.yml +++ b/examples/matmul_py/task.yml @@ -6,6 +6,10 @@ files: - {"name": "utils.py", "source": "../utils.py"} - {"name": "reference.py", "source": "reference.py"} - {"name": "eval.py", "source": "../eval.py"} + - {"name": "pytorch_ref.py", "source": "pytorch_ref.py"} + +milestones: + - {"milestone_name": "pytorch reference", "filename": "pytorch_ref.py"} lang: "py" diff --git a/src/discord-cluster-manager/api/utils.py b/src/discord-cluster-manager/api/utils.py index 131c9e69..e0219257 100644 --- a/src/discord-cluster-manager/api/utils.py +++ b/src/discord-cluster-manager/api/utils.py @@ -154,7 +154,7 @@ async def _run_submission( submission: SubmissionRequest, user_info: dict, mode: SubmissionMode, bot ): try: - req = prepare_submission(submission, bot.leaderboard_db) + req = prepare_submission(submission, bot.leaderboard_db, mode) except Exception as e: raise HTTPException(status_code=400, detail=str(e)) from e diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index fadc9c41..024720a5 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -37,7 +37,12 @@ class LeaderboardSubmitCog(app_commands.Group): def __init__(self, bot: "ClusterBot"): super().__init__(name="submit", description="Submit to leaderboard") self.bot = bot - + + async def _admin_check(self, interaction: discord.Interaction) -> bool: + if not interaction.user.get_role(self.bot.leaderboard_admin_role_id): + return False + return True + async def select_gpu_view( self, interaction: discord.Interaction, @@ -63,32 +68,35 @@ async def on_submit_hook( # noqa: C901 self, interaction: discord.Interaction, leaderboard_name: Optional[str], - script: discord.Attachment, + script: Optional[discord.Attachment], mode: SubmissionMode, cmd_gpus: Optional[List[str]], ) -> int: """ Called as the main body of a submission to route to the correct runner. """ - # Read the template file - submission_content = await script.read() - - try: - submission_content = submission_content.decode() - except UnicodeError: - await send_discord_message( - interaction, "Could not decode your file. Is it UTF-8?", ephemeral=True - ) - return -1 + # Read the template file + submission_content = "" + if mode != SubmissionMode.MILESTONE: + # for milestones we don't have a submission file and instead use the ones in the task + submission_content = await script.read() + try: + submission_content = submission_content.decode() + except UnicodeError: + await send_discord_message( + interaction, "Could not decode your file. Is it UTF-8?", ephemeral=True + ) + return -1 + filename = script.filename if not mode == SubmissionMode.MILESTONE else "performance milestone" req = SubmissionRequest( code=submission_content, - file_name=script.filename, + file_name=filename, user_id=interaction.user.id, gpus=cmd_gpus, leaderboard=leaderboard_name, ) - req = prepare_submission(req, self.bot.leaderboard_db) + req = prepare_submission(req, self.bot.leaderboard_db, mode) # if there is more than one candidate GPU, display UI to let user select, # otherwise just run on that GPU @@ -106,33 +114,53 @@ async def on_submit_hook( # noqa: C901 command = self.bot.get_cog("SubmitCog").submit_leaderboard user_name = interaction.user.global_name or interaction.user.name + # Create a submission entry in the database with self.bot.leaderboard_db as db: sub_id = db.create_submission( leaderboard=req.leaderboard, - file_name=script.filename, + file_name=filename, code=submission_content, user_id=interaction.user.id, time=datetime.now(), user_name=user_name, ) - run_msg = f"Submission **{sub_id}**: `{script.filename}` for `{req.leaderboard}`" + run_msg = f"Submission **{sub_id}**: `{filename}` for `{req.leaderboard}`" reporter = MultiProgressReporter(interaction, run_msg) try: - tasks = [ - command( - sub_id, - submission_content, - script.filename, - gpu, - reporter.add_run(f"{gpu.name} on {gpu.runner}"), - req.task, - mode, - None, - ) - for gpu in selected_gpus - ] + + if mode == SubmissionMode.MILESTONE: + milestones = req.task.milestones + files = req.task.files + tasks = [ + command( + sub_id, + milestone["filename"], + files[milestone["filename"]], + gpu, + reporter.add_run(f"{gpu.name} on {gpu.runner} for milestone {milestone['milestone_name']}",), + req.task, + mode, + None, + ) + for milestone in milestones + for gpu in selected_gpus + ] + else: + tasks = [ + command( + sub_id, + submission_content, + script.filename, + gpu, + reporter.add_run(f"{gpu.name} on {gpu.runner}"), + req.task, + mode, + None, + ) + for gpu in selected_gpus + ] # also schedule secret run if mode == SubmissionMode.LEADERBOARD: @@ -224,10 +252,18 @@ async def submit( self, interaction: discord.Interaction, leaderboard_name: Optional[str], - script: discord.Attachment, + script: Optional[discord.Attachment], mode: SubmissionMode, gpu: Optional[str], ): + + if mode != SubmissionMode.MILESTONE and script is None: + await interaction.response.send_message( + "Script is required for non-milestone submissions.", + ephemeral=True, + ) + return + if not self.bot.accepts_jobs: await send_discord_message( interaction, @@ -279,6 +315,28 @@ async def submit_bench( interaction, leaderboard_name, script, mode=SubmissionMode.BENCHMARK, gpu=gpu ) + @app_commands.command(name="milestone", description="Start a milestone run") + @app_commands.describe( + leaderboard_name="Name of the competition / kernel to optimize", + gpu="Select GPU. Leave empty for interactive or automatic selection.", + ) + @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) + @with_error_handling + async def submit_milestone( + self, + interaction: discord.Interaction, + leaderboard_name: Optional[str], + gpu: Optional[str], + ): + if not await self._admin_check(interaction): + await interaction.response.send_message( + "You do not have permission to submit milestones.", ephemeral=True + ) + return + return await self.submit( + interaction, leaderboard_name, None, mode=SubmissionMode.MILESTONE, gpu=gpu + ) + @app_commands.command(name="profile", description="Start a profiling run") @app_commands.describe( leaderboard_name="Name of the competition / kernel to optimize", diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index 928f59d4..ffb8f3c5 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -97,6 +97,7 @@ class SubmissionMode(Enum): LEADERBOARD = "leaderboard" PRIVATE = "private" SCRIPT = "script" + MILESTONE = "milestone" class Language(Enum): diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 5e7ab046..61be3304 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -511,7 +511,7 @@ def run_evaluation( require multiple runner calls. """ results: dict[str, EvalResult] = {} - if mode in ["test", "benchmark", "profile", "script"]: + if mode in ["test", "benchmark", "profile", "script", "milestone"]: results[mode] = call(mode=mode) elif mode in ["private", "leaderboard"]: # first, run the tests @@ -528,7 +528,7 @@ def run_evaluation( # if they pass, run the leaderboard validation results["leaderboard"] = call(mode="leaderboard") else: - raise AssertionError("Invalid mode") + raise AssertionError(f"Invalid mode: {mode}") return results diff --git a/src/discord-cluster-manager/submission.py b/src/discord-cluster-manager/submission.py index 2777b15f..1af5f56a 100644 --- a/src/discord-cluster-manager/submission.py +++ b/src/discord-cluster-manager/submission.py @@ -6,7 +6,7 @@ from better_profanity import profanity from leaderboard_db import LeaderboardDB from task import LeaderboardTask -from utils import KernelBotError, LeaderboardItem +from utils import KernelBotError, LeaderboardItem, SubmissionMode @dataclasses.dataclass @@ -26,15 +26,16 @@ class ProcessedSubmissionRequest(SubmissionRequest): task_gpus: list -def prepare_submission(req: SubmissionRequest, lb_db: LeaderboardDB) -> ProcessedSubmissionRequest: +def prepare_submission(req: SubmissionRequest, lb_db: LeaderboardDB, mode: SubmissionMode) -> ProcessedSubmissionRequest: if profanity.contains_profanity(req.file_name): raise KernelBotError("Please provide a non rude filename") - - # check file extension - if not req.file_name.endswith((".py", ".cu", ".cuh", ".cpp")): - raise KernelBotError( - "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file", - ) + if mode != SubmissionMode.MILESTONE: + # for milestones we don't have a submission file + # check file extension + if not req.file_name.endswith((".py", ".cu", ".cuh", ".cpp")): + raise KernelBotError( + "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file", + ) # process file directives req = handle_popcorn_directives(req) diff --git a/src/discord-cluster-manager/task.py b/src/discord-cluster-manager/task.py index 3a14bc51..d95f86d7 100644 --- a/src/discord-cluster-manager/task.py +++ b/src/discord-cluster-manager/task.py @@ -64,6 +64,7 @@ class LeaderboardTask: ranking_by: RankCriterion = RankCriterion.LAST templates: dict[str, str] = dataclasses.field(default_factory=dict) seed: Optional[int] = None + milestones: list[dict[str, str]] = dataclasses.field(default_factory=list) @staticmethod def from_dict(data: dict): diff --git a/src/discord-cluster-manager/utils.py b/src/discord-cluster-manager/utils.py index c39192f7..6af160c0 100644 --- a/src/discord-cluster-manager/utils.py +++ b/src/discord-cluster-manager/utils.py @@ -271,6 +271,7 @@ def build_task_config( "ranked_timeout": task.ranked_timeout, "ranking_by": task.ranking_by.value, "seed": task.seed, + "milestones": task.milestones, } if task.lang == Language.Python: From e8e035139937f6341ef7fce57d3e0e8e9d361b20 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 4 Jun 2025 14:54:18 -0400 Subject: [PATCH 2/7] create baselines --- .github/workflows/nvidia_workflow.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml index 7852377d..ee529e9e 100644 --- a/.github/workflows/nvidia_workflow.yml +++ b/.github/workflows/nvidia_workflow.yml @@ -28,6 +28,10 @@ jobs: - name: Create input files shell: bash run: | + + # install jq + apt update && apt install -y jq + # Extract the payload content without printing it PAYLOAD=$(jq -r '.inputs.payload' $GITHUB_EVENT_PATH) From 9ffc5b6565b698ffbbe15749b2c38b04244e14b3 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 4 Jun 2025 15:16:00 -0400 Subject: [PATCH 3/7] push to test workflow --- examples/eval.py | 2 +- src/discord-cluster-manager/run_eval.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/eval.py b/examples/eval.py index e414a580..7bec7f87 100644 --- a/examples/eval.py +++ b/examples/eval.py @@ -345,7 +345,7 @@ def main(): if mode == "benchmark": return run_benchmarking(logger, pool, tests) - if mode == "leaderboard": + if mode == "leaderboard" or mode == "milestone": # warmup run_single_benchmark(pool, tests[0], False, 100, 1e7) logger.log("benchmark-count", len(tests)) diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py index 61be3304..4a5d8d41 100644 --- a/src/discord-cluster-manager/run_eval.py +++ b/src/discord-cluster-manager/run_eval.py @@ -296,7 +296,7 @@ def run_single_evaluation( tests_file.write(tests) tests_file.flush() return run_program(call + [mode, tests_file.name], seed=seed, timeout=test_timeout) - elif mode in ["benchmark", "profile", "leaderboard"]: + elif mode in ["benchmark", "profile", "leaderboard", "milestone"]: timeout = ranked_timeout if mode == "leaderboard" else benchmark_timeout with tempfile.NamedTemporaryFile("w") as bench_file: if ranking_by == "last": From a3fe89114cc790c51bd410b688dc1eb9e75356f1 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Thu, 5 Jun 2025 17:04:59 -0400 Subject: [PATCH 4/7] save work --- examples/matmul_py/reference.py | 4 +- examples/matmul_py/task.yml | 12 +- examples/matmul_py/triton_ref.py | 126 ++++++ src/discord-cluster-manager/api/main.py | 2 +- src/discord-cluster-manager/cogs/admin_cog.py | 386 +++++++++++++++++- .../cogs/leaderboard_cog.py | 248 +++++++---- .../cogs/submit_cog.py | 41 +- src/discord-cluster-manager/consts.py | 5 + src/discord-cluster-manager/leaderboard_db.py | 159 ++++++++ .../20250605_01_hwite-add-milestone-table.py | 33 ++ src/discord-cluster-manager/report.py | 12 + src/discord-cluster-manager/submission.py | 2 +- 12 files changed, 927 insertions(+), 103 deletions(-) create mode 100644 examples/matmul_py/triton_ref.py create mode 100644 src/discord-cluster-manager/migrations/20250605_01_hwite-add-milestone-table.py diff --git a/examples/matmul_py/reference.py b/examples/matmul_py/reference.py index 76da5c6a..569ac27e 100644 --- a/examples/matmul_py/reference.py +++ b/examples/matmul_py/reference.py @@ -20,7 +20,7 @@ def check_implementation(data: input_t, output: output_t) -> str: reasons = verbose_allclose(output, expected) if len(reasons) > 0: # TODO better processing of reasons - return "mismatch found! custom implementation doesn't match reference.: " + reasons[0] + return False, "mismatch found! custom implementation doesn't match reference.: " + reasons[0] - return '' + return True, '' diff --git a/examples/matmul_py/task.yml b/examples/matmul_py/task.yml index 752391ee..f8d2a8a7 100644 --- a/examples/matmul_py/task.yml +++ b/examples/matmul_py/task.yml @@ -7,9 +7,19 @@ files: - {"name": "reference.py", "source": "reference.py"} - {"name": "eval.py", "source": "../eval.py"} - {"name": "pytorch_ref.py", "source": "pytorch_ref.py"} + - {"name": "triton_ref.py", "source": "triton_ref.py"} milestones: - - {"milestone_name": "pytorch reference", "filename": "pytorch_ref.py"} + - { + "milestone_name": "pytorch reference", + "filename": "pytorch_ref.py", + "description": "PyTorch reference implementation as a performance baseline for matmul" + } + - { + "milestone_name": "triton reference", + "filename": "triton_ref.py", + "description": "Triton reference implementation as a performance baseline for matmul" + } lang: "py" diff --git a/examples/matmul_py/triton_ref.py b/examples/matmul_py/triton_ref.py new file mode 100644 index 00000000..1d6280f4 --- /dev/null +++ b/examples/matmul_py/triton_ref.py @@ -0,0 +1,126 @@ +#!POPCORN leaderboard matmul_py +import triton +import triton.language as tl +import torch +from task import input_t, output_t + +@triton.jit +def matmul_kernel( + # Pointers to matrices + a_ptr, b_ptr, c_ptr, + # Matrix dimensions + M, N, K, + # The stride variables represent how much to increase the ptr by when moving by 1 + # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` + # by to get the element one row down (A has M rows). + stride_am, stride_ak, + stride_bk, stride_bn, + stride_cm, stride_cn, + # Meta-parameters + BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, + GROUP_SIZE_M: tl.constexpr, +): + """Kernel for computing the matmul C = A x B. + A has shape (M, K), B has shape (K, N) and C has shape (M, N) + """ + # ----------------------------------------------------------- + # Map program ids `pid` to the block of C it should compute. + # This is done in a grouped ordering to promote L2 cache hit rates. + # See above `L2 Cache Optimizations` section for details. + pid = tl.program_id(axis=0) + num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) + num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) + num_pid_in_group = GROUP_SIZE_M * num_pid_n + group_id = pid // num_pid_in_group + first_pid_m = group_id * GROUP_SIZE_M + group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + (pid % group_size_m) + pid_n = (pid % num_pid_in_group) // group_size_m + + # ---------------------------------------------------------- + # Create pointers for the first blocks of A and B. + # We will advance this pointer as we move in the K direction + # and accumulate + # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers + # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers + # See above `Pointer Arithmetic` section for details + offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M + offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N + offs_k = tl.arange(0, BLOCK_SIZE_K) + a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) + b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) + + # ----------------------------------------------------------- + # Iterate to compute a block of the C matrix. + # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block + # of fp32 values for higher precision. + # `accumulator` will be converted back to fp16 after the loop. + accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) + for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): + # Load the next block of A and B, generate a mask by checking the K dimension. + # If it is out of bounds, set it to 0. + a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) + b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) + # We accumulate along the K dimension. + accumulator += tl.dot(a, b) + # Advance the ptrs to the next K block. + a_ptrs += BLOCK_SIZE_K * stride_ak + b_ptrs += BLOCK_SIZE_K * stride_bk + # You can fuse arbitrary activation functions here + # while the accumulator is still in FP32! + c = accumulator.to(tl.float16) + + # ----------------------------------------------------------- + # Write back the block of the output matrix C with masks. + offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) + offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) + c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] + c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) + tl.store(c_ptrs, c, mask=c_mask) + + +def triton_matmul(a, b): + # Check constraints. + assert a.shape[1] == b.shape[0], "Incompatible dimensions" + assert a.is_contiguous(), "Matrix A must be contiguous" + assert b.is_contiguous(), "Matrix B must be contiguous" + M, K = a.shape + K, N = b.shape + # Allocate output. + c = torch.empty((M, N), device=a.device, dtype=a.dtype) + # 1D launch kernel where each block gets its own program. + grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), ) + matmul_kernel[grid]( + a, b, c, + M, N, K, + a.stride(0), a.stride(1), + b.stride(0), b.stride(1), + c.stride(0), c.stride(1), + BLOCK_SIZE_M=128, BLOCK_SIZE_N=128, BLOCK_SIZE_K=32, + GROUP_SIZE_M=8, + ) + return c + + +def custom_kernel(data: input_t) -> output_t: + a, b = data + # Convert to torch tensors if they aren't already + if not isinstance(a, torch.Tensor): + a = torch.tensor(a, dtype=torch.float16).cuda() + if not isinstance(b, torch.Tensor): + b = torch.tensor(b, dtype=torch.float16).cuda() + + # Ensure tensors are on GPU and contiguous + if not a.is_cuda: + a = a.cuda() + if not b.is_cuda: + b = b.cuda() + + a = a.contiguous() + b = b.contiguous() + + # Use our custom Triton matmul + result = triton_matmul(a, b) + + # Convert back to the expected output format + return result \ No newline at end of file diff --git a/src/discord-cluster-manager/api/main.py b/src/discord-cluster-manager/api/main.py index dca8cf11..1c09bda4 100644 --- a/src/discord-cluster-manager/api/main.py +++ b/src/discord-cluster-manager/api/main.py @@ -352,7 +352,7 @@ async def run_submission( # noqa: C901 all_leaderboards = [lb["name"] for lb in db.get_leaderboards()] if leaderboard_name not in all_leaderboards: raise HTTPException( - status_code=404, detail=f"Leaderboard '{leaderboard_name}' not found." + status_code=404, detail=f"Leaderboard '{leaderboard_name}' not found when trying to run submission." ) else: raise HTTPException( diff --git a/src/discord-cluster-manager/cogs/admin_cog.py b/src/discord-cluster-manager/cogs/admin_cog.py index 8e39ee2f..738b49dc 100644 --- a/src/discord-cluster-manager/cogs/admin_cog.py +++ b/src/discord-cluster-manager/cogs/admin_cog.py @@ -1,3 +1,4 @@ +import asyncio import json import subprocess import tempfile @@ -10,7 +11,7 @@ import discord import env import yaml -from consts import GitHubGPU, ModalGPU +from consts import GitHubGPU, ModalGPU, SubmissionMode from discord import app_commands from discord.ext import commands, tasks from leaderboard_db import leaderboard_name_autocomplete @@ -20,10 +21,12 @@ KernelBotError, LeaderboardItem, SubmissionItem, + format_time, send_discord_message, setup_logging, with_error_handling, ) +from submission import lookup_leaderboard if TYPE_CHECKING: from ..bot import ClusterBot @@ -120,6 +123,22 @@ def __init__(self, bot: "ClusterBot"): name="set-forum-ids", description="Sets forum IDs" )(self.set_forum_ids) + self.submit_milestones = bot.admin_group.command( + name="submit-milestones", description="Start a milestone run to get milestone results" + )(self.submit_milestones) + + self.list_milestones = bot.admin_group.command( + name="list-milestones", description="List all milestones for a leaderboard" + )(self.list_milestones) + + self.milestone_results = bot.admin_group.command( + name="milestone-results", description="Show results for a milestone" + )(self.milestone_results) + + self.delete_milestone = bot.admin_group.command( + name="delete-milestone", description="Delete a milestone and all its runs" + )(self.delete_milestone) + self._scheduled_cleanup_temp_users.start() # -------------------------------------------------------------------------- @@ -349,7 +368,7 @@ async def create_leaderboard_in_db( with self.bot.leaderboard_db as db: try: - db.create_leaderboard( + leaderboard_id = db.create_leaderboard( { "name": leaderboard_name, "deadline": date_value, @@ -366,7 +385,134 @@ async def create_leaderboard_in_db( ephemeral=True, ) return False - return True + + # Check if the task has milestones and automatically submit them + if hasattr(task, 'milestones') and task.milestones: + try: + await send_discord_message( + interaction, + f"šŸš€ Leaderboard `{leaderboard_name}` created successfully! Auto-submitting {len(task.milestones)} milestone(s)...", + ephemeral=True, + ) + + # Call the underlying milestone submission logic directly + await self._submit_milestones_directly(leaderboard_name, task, selected_gpus) + + await send_discord_message( + interaction, + f"āœ… Milestone submissions completed for `{leaderboard_name}`!", + ephemeral=True, + ) + except Exception as e: + logger.exception("Error auto-submitting milestones for new leaderboard", exc_info=e) + await send_discord_message( + interaction, + f"āš ļø Leaderboard `{leaderboard_name}` created but milestone auto-submission failed: {str(e)}", + ephemeral=True, + ) + + return True + + async def _submit_milestones_directly(self, leaderboard_name: str, task: LeaderboardTask, selected_gpus: list[str]): + """Directly submit milestones without going through Discord command layer""" + from consts import SYSTEM_USER_ID, SYSTEM_USER_NAME, SubmissionMode, get_gpu_by_name + from submission import SubmissionRequest, prepare_submission + from report import RunProgressReporterAPI + + # Ensure system user exists in database + with self.bot.leaderboard_db as db: + db.cursor.execute( + "SELECT 1 FROM leaderboard.user_info WHERE id = %s", + (str(SYSTEM_USER_ID),), + ) + if not db.cursor.fetchone(): + db.cursor.execute( + "INSERT INTO leaderboard.user_info (id, user_name) VALUES (%s, %s)", + (str(SYSTEM_USER_ID), SYSTEM_USER_NAME), + ) + db.connection.commit() + + # Prepare submission request for milestones + req = SubmissionRequest( + code="", # Not used for milestones + file_name="performance milestone", + user_id=SYSTEM_USER_ID, + gpus=selected_gpus, + leaderboard=leaderboard_name, + ) + + # Prepare the submission (validates leaderboard, deadline, etc.) + processed_req = prepare_submission(req, self.bot.leaderboard_db, SubmissionMode.MILESTONE) + + # Convert GPU strings to GPU objects + gpu_objects = [get_gpu_by_name(gpu) for gpu in selected_gpus] + + # Sync milestones to database + leaderboard_item = lookup_leaderboard(leaderboard_name, self.bot.leaderboard_db) + with self.bot.leaderboard_db as db: + existing_milestones = db.get_leaderboard_milestones(leaderboard_item["id"]) + existing_names = {m["milestone_name"] for m in existing_milestones} + + # Create any new milestones in the database + for milestone in task.milestones: + if milestone["milestone_name"] not in existing_names: + db.create_milestone( + leaderboard_item["id"], + milestone["milestone_name"], + milestone["filename"], + description=milestone.get("description", f"Milestone for {milestone['filename']}") + ) + + # Get submit cog for the submission runner + submit_cog = self.bot.get_cog("SubmitCog") + if not submit_cog: + raise Exception("SubmitCog not available") + + # Create separate submission for each milestone + submission_ids = [] + tasks = [] + + for milestone in task.milestones: + milestone_filename = milestone["filename"] + milestone_code = task.files[milestone_filename] + + # Create separate submission entry for each milestone + with self.bot.leaderboard_db as db: + sub_id = db.create_submission( + leaderboard=leaderboard_name, + file_name=milestone_filename, + code=milestone_code, + user_id=SYSTEM_USER_ID, + time=datetime.now(), + user_name=SYSTEM_USER_NAME, + ) + submission_ids.append(sub_id) + + # Create tasks for this milestone on all selected GPUs + for gpu in gpu_objects: + # Create a background reporter for this submission + reporter = RunProgressReporterAPI(f"Milestone {milestone['milestone_name']} on {gpu.name}") + + tasks.append( + submit_cog.submit_leaderboard( + sub_id, + milestone_code, + milestone_filename, + gpu, + reporter, + processed_req.task, + SubmissionMode.MILESTONE, + None, + ) + ) + + # Execute all milestone submissions + await asyncio.gather(*tasks) + + # Mark all submissions as done + with self.bot.leaderboard_db as db: + for sub_id in submission_ids: + db.mark_submission_done(sub_id) @discord.app_commands.describe(leaderboard_name="Name of the leaderboard") @discord.app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) @@ -1025,3 +1171,237 @@ async def set_forum_ids(self, interaction: discord.Interaction): error_message = f"Error updating forum ids: {str(e)}" logger.error(error_message, exc_info=True) await send_discord_message(interaction, error_message, ephemeral=True) + + @app_commands.describe( + leaderboard_name="Name of Leaderboard", + gpu="Select GPU. Leave empty for interactive or automatic selection.", + ) + @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) + @with_error_handling + async def submit_milestones( + self, + interaction: discord.Interaction, + leaderboard_name: Optional[str], + gpu: Optional[str], + ): + if not await self.admin_check(interaction): + await send_discord_message( + interaction, + "You do not have permission to submit milestones.", + ephemeral=True + ) + return + + # Get the submit cog to access the submission logic + submit_cog = self.bot.get_cog("SubmitCog") + if not submit_cog: + await send_discord_message( + interaction, + "Submission system is not available.", + ephemeral=True + ) + return + + # Get the submit group from the leaderboard cog + submit_group = None + for command in self.bot.leaderboard_group.commands: + if hasattr(command, 'name') and command.name == "submit": + submit_group = command + break + + if not submit_group: + await send_discord_message( + interaction, + "Submission system is not available.", + ephemeral=True + ) + return + + return await submit_group.submit( + interaction, leaderboard_name, None, mode=SubmissionMode.MILESTONE, gpu=gpu + ) + + @app_commands.describe(leaderboard_name="Name of the leaderboard") + @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) + @with_error_handling + async def list_milestones( + self, + interaction: discord.Interaction, + leaderboard_name: str, + ): + if not await self.admin_check(interaction): + await send_discord_message( + interaction, + "You need to have Admin permissions to run this command", + ephemeral=True, + ) + return + + leaderboard = lookup_leaderboard(leaderboard_name, self.bot.leaderboard_db) + with self.bot.leaderboard_db as db: + milestones = db.get_leaderboard_milestones(leaderboard["id"]) + + if not milestones: + await interaction.response.send_message(f"No milestones found for {leaderboard_name}") + return + + message = f"**Milestones for {leaderboard_name}:**\n" + for milestone in milestones: + message += f"• {milestone['milestone_name']} ({milestone['filename']}) - {milestone['description']}\n" + + await interaction.response.send_message(message) + + @app_commands.describe(leaderboard_name="Name of the leaderboard") + @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) + @with_error_handling + async def milestone_results( + self, + interaction: discord.Interaction, + leaderboard_name: str, + ): + if not await self.admin_check(interaction): + await send_discord_message( + interaction, + "You need to have Admin permissions to run this command", + ephemeral=True, + ) + return + + leaderboard = lookup_leaderboard(leaderboard_name, self.bot.leaderboard_db) + with self.bot.leaderboard_db as db: + milestones = db.get_leaderboard_milestones(leaderboard["id"]) + + if not milestones: + await interaction.response.send_message(f"No milestones found for {leaderboard_name}") + return + + message = f"**All Milestone Results for {leaderboard_name}:**\n\n" + + for milestone in milestones: + with self.bot.leaderboard_db as db: + runs = db.get_milestone_runs(milestone["id"]) + + message += f"šŸ“ **{milestone['milestone_name']}** ({milestone['filename']})\n" + + if not runs: + message += " _No runs found_\n\n" + continue + + # Show top 5 runs for each milestone + for i, run in enumerate(runs[:5], 1): + score = format_time(float(run['score']) * 1e9) if run['score'] else "N/A" + status = 'āœ…' if run['passed'] else 'āŒ' + message += f" {i}. {run['user_name']} - {score} {status} (#{run['submission_id']})\n" + + if len(runs) > 5: + message += f" _... and {len(runs) - 5} more runs_\n" + + message += "\n" + + # Split message if it's too long for Discord + if len(message) > 2000: + messages = [] + current_message = f"**All Milestone Results for {leaderboard_name}:**\n\n" + + for milestone in milestones: + with self.bot.leaderboard_db as db: + runs = db.get_milestone_runs(milestone["id"]) + # sort runs by submission time + runs.sort(key=lambda x: x['submission_time'], reverse=True) + + milestone_section = f"šŸ“ **{milestone['milestone_name']}** ({milestone['filename']}) | {milestone['description']}\n" + + if not runs: + milestone_section += " _No runs found_\n\n" + else: + for i, run in enumerate(runs[:5], 1): + score = format_time(float(run['score']) * 1e9) if run['score'] else "N/A" + status = 'āœ…' if run['passed'] else 'āŒ' + milestone_section += f"{i}. {run['user_name']} - {score} {status} (#{run['submission_id']})\n" + + if len(runs) > 5: + milestone_section += f"_... and {len(runs) - 5} more runs_\n" + + milestone_section += "\n" + + # Check if adding this milestone would exceed Discord's limit + if len(current_message) + len(milestone_section) > 1900: + messages.append(current_message) + current_message = milestone_section + else: + current_message += milestone_section + + # Add the last message + if current_message.strip(): + messages.append(current_message) + + # Send all messages + await interaction.response.send_message(messages[0]) + for msg in messages[1:]: + await interaction.followup.send(msg) + else: + await interaction.response.send_message(message) + + @app_commands.describe( + leaderboard_name="Name of the leaderboard", + milestone_name="Name of the milestone to delete" + ) + @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) + @with_error_handling + async def delete_milestone( + self, + interaction: discord.Interaction, + leaderboard_name: str, + milestone_name: str, + ): + if not await self.admin_check(interaction): + await send_discord_message( + interaction, + "You need to have Admin permissions to run this command", + ephemeral=True, + ) + return + + leaderboard = lookup_leaderboard(leaderboard_name, self.bot.leaderboard_db) + with self.bot.leaderboard_db as db: + milestones = db.get_leaderboard_milestones(leaderboard["id"]) + milestone = next((m for m in milestones if m["milestone_name"] == milestone_name), None) + + if not milestone: + await interaction.response.send_message(f"Milestone '{milestone_name}' not found") + return + + # Create confirmation dialog + async def do_delete(): + with self.bot.leaderboard_db as db: + db.delete_milestone(milestone["id"]) + await send_discord_message( + interaction, + f"šŸ’„ Milestone `{milestone_name}` from leaderboard `{leaderboard_name}` has been **deleted**.", + ephemeral=True, + ) + + async def no_delete(): + await send_discord_message( + interaction, + f"šŸ’¾ Milestone `{milestone_name}` has **not** been deleted.", + ephemeral=True, + ) + + confirm = ConfirmationView( + confirm_text="Delete", + confirm_callback=do_delete, + reject_text="Keep", + reject_callback=no_delete, + ) + + await interaction.response.send_message( + f"# Attention\nYou are about to **delete** milestone `{milestone_name}` from leaderboard `{leaderboard_name}`.\n" + f"This will also delete all associated runs. This action cannot be undone.\n\nšŸ’‚ Please confirm!" + ) + await send_discord_message( + interaction, + "", + view=confirm, + ephemeral=True, + ) \ No newline at end of file diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index 024720a5..0c28f029 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -5,6 +5,8 @@ import discord from consts import ( + SYSTEM_USER_ID, + SYSTEM_USER_NAME, SubmissionMode, get_gpu_by_name, ) @@ -12,7 +14,7 @@ from discord.ext import commands from leaderboard_db import leaderboard_name_autocomplete from report import MultiProgressReporter -from submission import SubmissionRequest, prepare_submission +from submission import SubmissionRequest, lookup_leaderboard, prepare_submission from ui.misc import GPUSelectionView from ui.table import create_table from utils import ( @@ -38,11 +40,7 @@ def __init__(self, bot: "ClusterBot"): super().__init__(name="submit", description="Submit to leaderboard") self.bot = bot - async def _admin_check(self, interaction: discord.Interaction) -> bool: - if not interaction.user.get_role(self.bot.leaderboard_admin_role_id): - return False - return True - + async def select_gpu_view( self, interaction: discord.Interaction, @@ -113,78 +111,176 @@ async def on_submit_hook( # noqa: C901 command = self.bot.get_cog("SubmitCog").submit_leaderboard - user_name = interaction.user.global_name or interaction.user.name - - # Create a submission entry in the database - with self.bot.leaderboard_db as db: - sub_id = db.create_submission( - leaderboard=req.leaderboard, - file_name=filename, - code=submission_content, - user_id=interaction.user.id, - time=datetime.now(), - user_name=user_name, - ) + # For milestone submissions, use consistent system user + if mode == SubmissionMode.MILESTONE: + user_id = SYSTEM_USER_ID + user_name = SYSTEM_USER_NAME + else: + user_id = interaction.user.id + user_name = interaction.user.global_name or interaction.user.name - run_msg = f"Submission **{sub_id}**: `{filename}` for `{req.leaderboard}`" + run_msg = f"Milestone submissions for `{req.leaderboard}`" if mode == SubmissionMode.MILESTONE else f"Submission: `{filename}` for `{req.leaderboard}`" reporter = MultiProgressReporter(interaction, run_msg) + try: - if mode == SubmissionMode.MILESTONE: - milestones = req.task.milestones - files = req.task.files - tasks = [ - command( - sub_id, + submission_ids = await self._handle_milestone_submissions( + req, user_id, user_name, selected_gpus, reporter, command + ) + return submission_ids + else: + sub_id = await self._handle_regular_submission( + req, submission_content, filename, user_id, user_name, + selected_gpus, reporter, command, mode + ) + + if mode == SubmissionMode.LEADERBOARD: + await self.post_submit_hook(interaction, sub_id) + return [sub_id] + finally: + # Mark all submissions as done + if mode == SubmissionMode.MILESTONE: + # submission_ids is a list for milestones + if 'submission_ids' in locals(): + with self.bot.leaderboard_db as db: + for sub_id in submission_ids: + db.mark_submission_done(sub_id) + else: + # sub_id is a single ID for regular submissions + if 'sub_id' in locals(): + with self.bot.leaderboard_db as db: + db.mark_submission_done(sub_id) + + async def _handle_milestone_submissions( + self, req, user_id, user_name, selected_gpus, reporter, command + ): + """Handle milestone submissions with separate submission IDs for each milestone""" + milestones = req.task.milestones + files = req.task.files + + # Ensure system user exists in database for milestone submissions + with self.bot.leaderboard_db as db: + # Check if system user exists + db.cursor.execute( + """ + SELECT 1 FROM leaderboard.user_info WHERE id = %s + """, + (str(SYSTEM_USER_ID),), + ) + if not db.cursor.fetchone(): + # Create system user + db.cursor.execute( + """ + INSERT INTO leaderboard.user_info (id, user_name) + VALUES (%s, %s) + """, + (str(SYSTEM_USER_ID), SYSTEM_USER_NAME), + ) + db.connection.commit() + + # Sync milestones to database + leaderboard_item = lookup_leaderboard(req.leaderboard, self.bot.leaderboard_db) + with self.bot.leaderboard_db as db: + existing_milestones = db.get_leaderboard_milestones(leaderboard_item["id"]) + existing_names = {m["milestone_name"] for m in existing_milestones} + + # Create any new milestones in the database + for milestone in milestones: + if milestone["milestone_name"] not in existing_names: + db.create_milestone( + leaderboard_item["id"], + milestone["milestone_name"], milestone["filename"], - files[milestone["filename"]], - gpu, - reporter.add_run(f"{gpu.name} on {gpu.runner} for milestone {milestone['milestone_name']}",), - req.task, - mode, - None, + description=milestone.get("description", f"Milestone for {milestone['filename']}") ) - for milestone in milestones - for gpu in selected_gpus - ] - else: - tasks = [ + + # Create separate submission for each milestone + submission_ids = [] + tasks = [] + + for milestone in milestones: + milestone_filename = milestone["filename"] + milestone_code = files[milestone_filename] + + # Create separate submission entry for each milestone + with self.bot.leaderboard_db as db: + sub_id = db.create_submission( + leaderboard=req.leaderboard, + file_name=milestone_filename, + code=milestone_code, + user_id=user_id, + time=datetime.now(), + user_name=user_name, + ) + submission_ids.append(sub_id) + + # Create tasks for this milestone on all selected GPUs + for gpu in selected_gpus: + tasks.append( command( sub_id, - submission_content, - script.filename, + milestone_code, + milestone_filename, gpu, - reporter.add_run(f"{gpu.name} on {gpu.runner}"), + reporter.add_run(f"{gpu.name} on {gpu.runner} for milestone {milestone['milestone_name']} (#{sub_id})"), req.task, - mode, + SubmissionMode.MILESTONE, None, ) - for gpu in selected_gpus - ] + ) + + await reporter.show() + await asyncio.gather(*tasks) + return submission_ids - # also schedule secret run - if mode == SubmissionMode.LEADERBOARD: - tasks += [ - command( - sub_id, - submission_content, - script.filename, - gpu, - reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"), - req.task, - SubmissionMode.PRIVATE, - req.secret_seed, - ) - for gpu in selected_gpus - ] - await reporter.show() - await asyncio.gather(*tasks) - finally: - with self.bot.leaderboard_db as db: - db.mark_submission_done(sub_id) + async def _handle_regular_submission( + self, req, submission_content, filename, user_id, user_name, + selected_gpus, reporter, command, mode + ): + """Handle regular submissions with a single submission ID""" + # Create a submission entry in the database + with self.bot.leaderboard_db as db: + sub_id = db.create_submission( + leaderboard=req.leaderboard, + file_name=filename, + code=submission_content, + user_id=user_id, + time=datetime.now(), + user_name=user_name, + ) + tasks = [ + command( + sub_id, + submission_content, + filename, + gpu, + reporter.add_run(f"{gpu.name} on {gpu.runner}"), + req.task, + mode, + None, + ) + for gpu in selected_gpus + ] + + # Add secret run for leaderboard submissions if mode == SubmissionMode.LEADERBOARD: - await self.post_submit_hook(interaction, sub_id) + tasks += [ + command( + sub_id, + submission_content, + filename, + gpu, + reporter.add_run(f"{gpu.name} on {gpu.runner} (secret)"), + req.task, + SubmissionMode.PRIVATE, + req.secret_seed, + ) + for gpu in selected_gpus + ] + + await reporter.show() + await asyncio.gather(*tasks) return sub_id def generate_run_verdict(self, run: RunItem, sub_data: SubmissionItem): @@ -272,8 +368,10 @@ async def submit( ) return - if gpu is not None: + if gpu is not None and gpu.strip(): gpu = [gpu.strip() for gpu in gpu.split(",")] + else: + gpu = None return await self.on_submit_hook(interaction, leaderboard_name, script, mode, gpu) @@ -315,27 +413,7 @@ async def submit_bench( interaction, leaderboard_name, script, mode=SubmissionMode.BENCHMARK, gpu=gpu ) - @app_commands.command(name="milestone", description="Start a milestone run") - @app_commands.describe( - leaderboard_name="Name of the competition / kernel to optimize", - gpu="Select GPU. Leave empty for interactive or automatic selection.", - ) - @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) - @with_error_handling - async def submit_milestone( - self, - interaction: discord.Interaction, - leaderboard_name: Optional[str], - gpu: Optional[str], - ): - if not await self._admin_check(interaction): - await interaction.response.send_message( - "You do not have permission to submit milestones.", ephemeral=True - ) - return - return await self.submit( - interaction, leaderboard_name, None, mode=SubmissionMode.MILESTONE, gpu=gpu - ) + @app_commands.command(name="profile", description="Start a profiling run") @app_commands.describe( diff --git a/src/discord-cluster-manager/cogs/submit_cog.py b/src/discord-cluster-manager/cogs/submit_cog.py index 0657641f..4102694a 100644 --- a/src/discord-cluster-manager/cogs/submit_cog.py +++ b/src/discord-cluster-manager/cogs/submit_cog.py @@ -103,13 +103,16 @@ async def submit_leaderboard( # noqa: C901 if result.success: score = None - if ( - "leaderboard" in result.runs - and result.runs["leaderboard"].run.success - and result.runs["leaderboard"].run.passed - ): + # Calculate score for both leaderboard and milestone runs + score_run_key = None + if "leaderboard" in result.runs and result.runs["leaderboard"].run.success and result.runs["leaderboard"].run.passed: + score_run_key = "leaderboard" + elif "milestone" in result.runs and result.runs["milestone"].run.success and result.runs["milestone"].run.passed: + score_run_key = "milestone" + + if score_run_key: score = 0.0 - num_benchmarks = int(result.runs["leaderboard"].run.result["benchmark-count"]) + num_benchmarks = int(result.runs[score_run_key].run.result["benchmark-count"]) if task.ranking_by == RankCriterion.LAST: if num_benchmarks != 1: logger.error( @@ -122,12 +125,12 @@ async def submit_leaderboard( # noqa: C901 f"Expected submission to have exactly one benchmark," f"got {num_benchmarks}." ) - score = float(result.runs["leaderboard"].run.result["benchmark.0.mean"]) / 1e9 + score = float(result.runs[score_run_key].run.result["benchmark.0.mean"]) / 1e9 else: scores = [] for i in range(num_benchmarks): scores.append( - float(result.runs["leaderboard"].run.result[f"benchmark.{i}.mean"]) + float(result.runs[score_run_key].run.result[f"benchmark.{i}.mean"]) / 1e9 ) if task.ranking_by == RankCriterion.MEAN: @@ -139,18 +142,36 @@ async def submit_leaderboard( # noqa: C901 if submission_id != -1: with self.bot.leaderboard_db as db: for key, value in result.runs.items(): - db.create_submission_run( + # Assign score for leaderboard and milestone runs + run_score = None + if key == "leaderboard" or (key == "milestone" and mode == SubmissionMode.MILESTONE): + run_score = score + + run_id = db.create_submission_run( submission_id, value.start, value.end, mode=key, runner=gpu_type.name, - score=None if key != "leaderboard" else score, + score=run_score, secret=mode == SubmissionMode.PRIVATE, compilation=value.compilation, result=value.run, system=result.system, ) + + # If this is a milestone submission, record the milestone run + if mode == SubmissionMode.MILESTONE and run_id: + # Get submission data to find the leaderboard + submission_data = db.get_submission_by_id(submission_id) + if submission_data: + leaderboard = db.get_leaderboard(submission_data["leaderboard_name"]) + if leaderboard: + # Find the milestone ID based on the filename + milestones = db.get_leaderboard_milestones(leaderboard["id"]) + milestone = next((m for m in milestones if m["filename"] == name), None) + if milestone: + db.record_milestone_run(milestone["id"], submission_id, run_id) return result diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index ffb8f3c5..948625d3 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -158,3 +158,8 @@ class RankCriterion(Enum): --index-url https://download.pytorch.org/whl/rocm6.2.4 torch """ + + +# System user constants for milestone submissions +SYSTEM_USER_ID = -123 +SYSTEM_USER_NAME = "KernelBot" diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index d48e8404..d4925b00 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -166,6 +166,35 @@ def update_leaderboard(self, name, deadline, task): def delete_leaderboard(self, leaderboard_name: str, force: bool = False): try: + # Get leaderboard ID first + self.cursor.execute( + "SELECT id FROM leaderboard.leaderboard WHERE name = %s", + (leaderboard_name,), + ) + result = self.cursor.fetchone() + if not result: + # if there is no leaderboard, there is nothing to do + return + leaderboard_id = result[0] + + # Delete milestone runs first (they reference milestones) + self.cursor.execute( + """ + DELETE FROM leaderboard.milestone_runs + WHERE milestone_id IN ( + SELECT id FROM leaderboard.milestones + WHERE leaderboard_id = %s + ) + """, + (leaderboard_id,), + ) + + # Delete milestones (they reference the leaderboard) + self.cursor.execute( + "DELETE FROM leaderboard.milestones WHERE leaderboard_id = %s", + (leaderboard_id,), + ) + if force: self.cursor.execute( """ @@ -204,6 +233,133 @@ def delete_leaderboard(self, leaderboard_name: str, force: bool = False): logger.exception("Could not delete leaderboard %s.", leaderboard_name, exc_info=e) raise KernelBotError(f"Could not delete leaderboard {leaderboard_name}.") from e + + + def create_milestone( + self, + leaderboard_id: int, + milestone_name: str, + filename: str, + description: str = None, + ) -> int: + """Create a new milestone for a leaderboard""" + try: + self.cursor.execute( + """ + INSERT INTO leaderboard.milestones (leaderboard_id, milestone_name, filename, description) + VALUES (%s, %s, %s, %s) + RETURNING id + """, + (leaderboard_id, milestone_name, filename, description), + ) + milestone_id = self.cursor.fetchone()[0] + self.connection.commit() + return milestone_id + except psycopg2.Error as e: + self.connection.rollback() + logger.exception("Error creating milestone", exc_info=e) + raise KernelBotError("Error creating milestone") from e + + def get_leaderboard_milestones(self, leaderboard_id: int) -> list[dict]: + """Get all milestones for a leaderboard""" + self.cursor.execute( + """ + SELECT id, milestone_name, filename, description, created_at + FROM leaderboard.milestones + WHERE leaderboard_id = %s + ORDER BY created_at + """, + (leaderboard_id,), + ) + return [ + { + "id": row[0], + "milestone_name": row[1], + "filename": row[2], + "description": row[3], + "created_at": row[4], + } + for row in self.cursor.fetchall() + ] + + def record_milestone_run( + self, + milestone_id: int, + submission_id: int, + run_id: int, + ) -> None: + """Record that a milestone was run as part of a submission""" + try: + self.cursor.execute( + """ + INSERT INTO leaderboard.milestone_runs (milestone_id, submission_id, run_id) + VALUES (%s, %s, %s) + """, + (milestone_id, submission_id, run_id), + ) + self.connection.commit() + except psycopg2.Error as e: + self.connection.rollback() + logger.exception("Error recording milestone run", exc_info=e) + raise KernelBotError("Error recording milestone run") from e + + def get_milestone_runs(self, milestone_id: int) -> list[dict]: + """Get all runs for a specific milestone""" + self.cursor.execute( + """ + SELECT + mr.id, + mr.submission_id, + mr.run_id, + s.user_id, + s.submission_time, + r.score, + r.passed, + r.runner, + ui.user_name + FROM leaderboard.milestone_runs mr + JOIN leaderboard.submission s ON mr.submission_id = s.id + JOIN leaderboard.runs r ON mr.run_id = r.id + JOIN leaderboard.user_info ui ON s.user_id = ui.id + WHERE mr.milestone_id = %s + ORDER BY r.score ASC NULLS LAST, s.submission_time DESC + """, + (milestone_id,), + ) + return [ + { + "id": row[0], + "submission_id": row[1], + "run_id": row[2], + "user_id": row[3], + "user_name": row[8], + "submission_time": row[4], + "score": row[5], + "passed": row[6], + "runner": row[7], + } + for row in self.cursor.fetchall() + ] + + def delete_milestone(self, milestone_id: int) -> None: + """Delete a milestone and all associated runs""" + try: + # Delete milestone runs first (foreign key constraint) + self.cursor.execute( + "DELETE FROM leaderboard.milestone_runs WHERE milestone_id = %s", + (milestone_id,), + ) + # Delete the milestone + self.cursor.execute( + "DELETE FROM leaderboard.milestones WHERE id = %s", + (milestone_id,), + ) + self.connection.commit() + except psycopg2.Error as e: + self.connection.rollback() + logger.exception("Error deleting milestone", exc_info=e) + raise KernelBotError("Error deleting milestone") from e + def create_submission( self, leaderboard: str, @@ -333,6 +489,7 @@ def create_submission_run( secret, runner, score, passed, compilation, meta, result, system_info ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s) + RETURNING id """, ( submission, @@ -349,7 +506,9 @@ def create_submission_run( json.dumps(dataclasses.asdict(system)), ), ) + run_id = self.cursor.fetchone()[0] self.connection.commit() + return run_id except psycopg2.Error as e: logger.exception( "Error during adding %s run on %s for submission '%s'", diff --git a/src/discord-cluster-manager/migrations/20250605_01_hwite-add-milestone-table.py b/src/discord-cluster-manager/migrations/20250605_01_hwite-add-milestone-table.py new file mode 100644 index 00000000..18aa44ef --- /dev/null +++ b/src/discord-cluster-manager/migrations/20250605_01_hwite-add-milestone-table.py @@ -0,0 +1,33 @@ +""" +Add milestone table for better milestone tracking +""" + +from yoyo import step + +__depends__ = {"20250506_01_38PkG-add-index-on-runs-runner-score"} # Update with the latest migration + +steps = [ + step(""" + CREATE TABLE IF NOT EXISTS leaderboard.milestones ( + id SERIAL PRIMARY KEY, + leaderboard_id INTEGER NOT NULL REFERENCES leaderboard.leaderboard(id), + milestone_name TEXT NOT NULL, + filename TEXT NOT NULL, + description TEXT, + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW(), + UNIQUE(leaderboard_id, milestone_name) + ) + """), + step("CREATE INDEX ON leaderboard.milestones (leaderboard_id)"), + step(""" + CREATE TABLE IF NOT EXISTS leaderboard.milestone_runs ( + id SERIAL PRIMARY KEY, + milestone_id INTEGER NOT NULL REFERENCES leaderboard.milestones(id), + submission_id INTEGER NOT NULL REFERENCES leaderboard.submission(id), + run_id INTEGER NOT NULL REFERENCES leaderboard.runs(id), + created_at TIMESTAMP WITH TIME ZONE NOT NULL DEFAULT NOW() + ) + """), + step("CREATE INDEX ON leaderboard.milestone_runs (milestone_id)"), + step("CREATE INDEX ON leaderboard.milestone_runs (submission_id)"), +] diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py index 4e09c2c5..4d63d4ce 100644 --- a/src/discord-cluster-manager/report.py +++ b/src/discord-cluster-manager/report.py @@ -218,6 +218,18 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n result.append("āœ… Leaderboard run successful") elif full: result.append("āŒ Leaderboard missing") + + if "milestone" in runs: + ms_run = runs["milestone"].run + if not ms_run.success: + result.append("āŒ Running milestone failed" + _short_fail_reason(ms_run)) + elif not ms_run.passed: + result.append("āŒ Milestone run failed") + else: + result.append("āœ… Milestone run successful") + elif full: + result.append("āŒ Milestone missing") + return result diff --git a/src/discord-cluster-manager/submission.py b/src/discord-cluster-manager/submission.py index 1af5f56a..4082ff8d 100644 --- a/src/discord-cluster-manager/submission.py +++ b/src/discord-cluster-manager/submission.py @@ -70,7 +70,7 @@ def lookup_leaderboard(leaderboard: str, lb_db: LeaderboardDB) -> LeaderboardIte with lb_db as db: leaderboard_item = db.get_leaderboard(leaderboard) if not leaderboard_item: - raise KernelBotError(f"Leaderboard {leaderboard} not found.") + raise KernelBotError(f"Tried to lookup leaderboard {leaderboard} but it was not found.") return leaderboard_item From 6cde161e1dac6dfe0d77ca105a83a9d8b4da1f93 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 13 Jun 2025 13:40:15 -0400 Subject: [PATCH 5/7] cleanup --- examples/matmul_py/task.yml | 8 +- examples/matmul_py/torch_mm_ref.py | 9 ++ examples/matmul_py/triton_ref.py | 126 ------------------ src/discord-cluster-manager/cogs/admin_cog.py | 13 +- .../cogs/leaderboard_cog.py | 16 ++- src/discord-cluster-manager/consts.py | 15 ++- src/discord-cluster-manager/report.py | 2 - 7 files changed, 46 insertions(+), 143 deletions(-) create mode 100644 examples/matmul_py/torch_mm_ref.py delete mode 100644 examples/matmul_py/triton_ref.py diff --git a/examples/matmul_py/task.yml b/examples/matmul_py/task.yml index f8d2a8a7..46a65f2c 100644 --- a/examples/matmul_py/task.yml +++ b/examples/matmul_py/task.yml @@ -7,7 +7,7 @@ files: - {"name": "reference.py", "source": "reference.py"} - {"name": "eval.py", "source": "../eval.py"} - {"name": "pytorch_ref.py", "source": "pytorch_ref.py"} - - {"name": "triton_ref.py", "source": "triton_ref.py"} + - {"name": "torch_mm_ref.py", "source": "torch_mm_ref.py"} milestones: - { @@ -16,9 +16,9 @@ milestones: "description": "PyTorch reference implementation as a performance baseline for matmul" } - { - "milestone_name": "triton reference", - "filename": "triton_ref.py", - "description": "Triton reference implementation as a performance baseline for matmul" + "milestone_name": "torch.mm reference", + "filename": "torch_mm_ref.py", + "description": "torch.mm reference implementation as a performance baseline for matmul" } lang: "py" diff --git a/examples/matmul_py/torch_mm_ref.py b/examples/matmul_py/torch_mm_ref.py new file mode 100644 index 00000000..0761b4e4 --- /dev/null +++ b/examples/matmul_py/torch_mm_ref.py @@ -0,0 +1,9 @@ +#!POPCORN leaderboard matmul_py + +from task import input_t, output_t +import torch + + +def custom_kernel(data: input_t) -> output_t: + a, b = data + return torch.mm(a, b) diff --git a/examples/matmul_py/triton_ref.py b/examples/matmul_py/triton_ref.py deleted file mode 100644 index 1d6280f4..00000000 --- a/examples/matmul_py/triton_ref.py +++ /dev/null @@ -1,126 +0,0 @@ -#!POPCORN leaderboard matmul_py -import triton -import triton.language as tl -import torch -from task import input_t, output_t - -@triton.jit -def matmul_kernel( - # Pointers to matrices - a_ptr, b_ptr, c_ptr, - # Matrix dimensions - M, N, K, - # The stride variables represent how much to increase the ptr by when moving by 1 - # element in a particular dimension. E.g. `stride_am` is how much to increase `a_ptr` - # by to get the element one row down (A has M rows). - stride_am, stride_ak, - stride_bk, stride_bn, - stride_cm, stride_cn, - # Meta-parameters - BLOCK_SIZE_M: tl.constexpr, BLOCK_SIZE_N: tl.constexpr, BLOCK_SIZE_K: tl.constexpr, - GROUP_SIZE_M: tl.constexpr, -): - """Kernel for computing the matmul C = A x B. - A has shape (M, K), B has shape (K, N) and C has shape (M, N) - """ - # ----------------------------------------------------------- - # Map program ids `pid` to the block of C it should compute. - # This is done in a grouped ordering to promote L2 cache hit rates. - # See above `L2 Cache Optimizations` section for details. - pid = tl.program_id(axis=0) - num_pid_m = tl.cdiv(M, BLOCK_SIZE_M) - num_pid_n = tl.cdiv(N, BLOCK_SIZE_N) - num_pid_in_group = GROUP_SIZE_M * num_pid_n - group_id = pid // num_pid_in_group - first_pid_m = group_id * GROUP_SIZE_M - group_size_m = min(num_pid_m - first_pid_m, GROUP_SIZE_M) - pid_m = first_pid_m + (pid % group_size_m) - pid_n = (pid % num_pid_in_group) // group_size_m - - # ---------------------------------------------------------- - # Create pointers for the first blocks of A and B. - # We will advance this pointer as we move in the K direction - # and accumulate - # `a_ptrs` is a block of [BLOCK_SIZE_M, BLOCK_SIZE_K] pointers - # `b_ptrs` is a block of [BLOCK_SIZE_K, BLOCK_SIZE_N] pointers - # See above `Pointer Arithmetic` section for details - offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M - offs_bn = (pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N)) % N - offs_k = tl.arange(0, BLOCK_SIZE_K) - a_ptrs = a_ptr + (offs_am[:, None] * stride_am + offs_k[None, :] * stride_ak) - b_ptrs = b_ptr + (offs_k[:, None] * stride_bk + offs_bn[None, :] * stride_bn) - - # ----------------------------------------------------------- - # Iterate to compute a block of the C matrix. - # We accumulate into a `[BLOCK_SIZE_M, BLOCK_SIZE_N]` block - # of fp32 values for higher precision. - # `accumulator` will be converted back to fp16 after the loop. - accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - for k in range(0, tl.cdiv(K, BLOCK_SIZE_K)): - # Load the next block of A and B, generate a mask by checking the K dimension. - # If it is out of bounds, set it to 0. - a = tl.load(a_ptrs, mask=offs_k[None, :] < K - k * BLOCK_SIZE_K, other=0.0) - b = tl.load(b_ptrs, mask=offs_k[:, None] < K - k * BLOCK_SIZE_K, other=0.0) - # We accumulate along the K dimension. - accumulator += tl.dot(a, b) - # Advance the ptrs to the next K block. - a_ptrs += BLOCK_SIZE_K * stride_ak - b_ptrs += BLOCK_SIZE_K * stride_bk - # You can fuse arbitrary activation functions here - # while the accumulator is still in FP32! - c = accumulator.to(tl.float16) - - # ----------------------------------------------------------- - # Write back the block of the output matrix C with masks. - offs_cm = pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M) - offs_cn = pid_n * BLOCK_SIZE_N + tl.arange(0, BLOCK_SIZE_N) - c_ptrs = c_ptr + stride_cm * offs_cm[:, None] + stride_cn * offs_cn[None, :] - c_mask = (offs_cm[:, None] < M) & (offs_cn[None, :] < N) - tl.store(c_ptrs, c, mask=c_mask) - - -def triton_matmul(a, b): - # Check constraints. - assert a.shape[1] == b.shape[0], "Incompatible dimensions" - assert a.is_contiguous(), "Matrix A must be contiguous" - assert b.is_contiguous(), "Matrix B must be contiguous" - M, K = a.shape - K, N = b.shape - # Allocate output. - c = torch.empty((M, N), device=a.device, dtype=a.dtype) - # 1D launch kernel where each block gets its own program. - grid = lambda META: (triton.cdiv(M, META['BLOCK_SIZE_M']) * triton.cdiv(N, META['BLOCK_SIZE_N']), ) - matmul_kernel[grid]( - a, b, c, - M, N, K, - a.stride(0), a.stride(1), - b.stride(0), b.stride(1), - c.stride(0), c.stride(1), - BLOCK_SIZE_M=128, BLOCK_SIZE_N=128, BLOCK_SIZE_K=32, - GROUP_SIZE_M=8, - ) - return c - - -def custom_kernel(data: input_t) -> output_t: - a, b = data - # Convert to torch tensors if they aren't already - if not isinstance(a, torch.Tensor): - a = torch.tensor(a, dtype=torch.float16).cuda() - if not isinstance(b, torch.Tensor): - b = torch.tensor(b, dtype=torch.float16).cuda() - - # Ensure tensors are on GPU and contiguous - if not a.is_cuda: - a = a.cuda() - if not b.is_cuda: - b = b.cuda() - - a = a.contiguous() - b = b.contiguous() - - # Use our custom Triton matmul - result = triton_matmul(a, b) - - # Convert back to the expected output format - return result \ No newline at end of file diff --git a/src/discord-cluster-manager/cogs/admin_cog.py b/src/discord-cluster-manager/cogs/admin_cog.py index 738b49dc..2620cc40 100644 --- a/src/discord-cluster-manager/cogs/admin_cog.py +++ b/src/discord-cluster-manager/cogs/admin_cog.py @@ -11,7 +11,7 @@ import discord import env import yaml -from consts import GitHubGPU, ModalGPU, SubmissionMode +from consts import GitHubGPU, ModalGPU, SubmissionMode, get_system_user_name from discord import app_commands from discord.ext import commands, tasks from leaderboard_db import leaderboard_name_autocomplete @@ -415,7 +415,7 @@ async def create_leaderboard_in_db( async def _submit_milestones_directly(self, leaderboard_name: str, task: LeaderboardTask, selected_gpus: list[str]): """Directly submit milestones without going through Discord command layer""" - from consts import SYSTEM_USER_ID, SYSTEM_USER_NAME, SubmissionMode, get_gpu_by_name + from consts import SYSTEM_USER_ID, get_system_user_name, SubmissionMode, get_gpu_by_name from submission import SubmissionRequest, prepare_submission from report import RunProgressReporterAPI @@ -426,9 +426,10 @@ async def _submit_milestones_directly(self, leaderboard_name: str, task: Leaderb (str(SYSTEM_USER_ID),), ) if not db.cursor.fetchone(): + user_name, user_id = get_system_user_name() db.cursor.execute( "INSERT INTO leaderboard.user_info (id, user_name) VALUES (%s, %s)", - (str(SYSTEM_USER_ID), SYSTEM_USER_NAME), + (str(user_id), user_name), ) db.connection.commit() @@ -475,16 +476,18 @@ async def _submit_milestones_directly(self, leaderboard_name: str, task: Leaderb for milestone in task.milestones: milestone_filename = milestone["filename"] milestone_code = task.files[milestone_filename] + milestone_name = milestone["milestone_name"] # Create separate submission entry for each milestone with self.bot.leaderboard_db as db: + user_name, user_id = get_system_user_name(milestone_name) sub_id = db.create_submission( leaderboard=leaderboard_name, file_name=milestone_filename, code=milestone_code, - user_id=SYSTEM_USER_ID, + user_id=user_id, time=datetime.now(), - user_name=SYSTEM_USER_NAME, + user_name=user_name, ) submission_ids.append(sub_id) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index cc28c623..b274d287 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -6,7 +6,7 @@ import discord from consts import ( SYSTEM_USER_ID, - SYSTEM_USER_NAME, + get_system_user_name, SubmissionMode, get_gpu_by_name, ) @@ -113,8 +113,12 @@ async def on_submit_hook( # noqa: C901 # For milestone submissions, use consistent system user if mode == SubmissionMode.MILESTONE: - user_id = SYSTEM_USER_ID - user_name = SYSTEM_USER_NAME + # Get the milestone name from the task + leaderboard = lookup_leaderboard(leaderboard_name, self.bot.leaderboard_db) + milestone_name = None + if leaderboard["task"].milestones: + milestone_name = leaderboard["task"].milestones[0]["milestone_name"] + user_name, user_id = get_system_user_name(milestone_name) else: user_id = interaction.user.id user_name = interaction.user.global_name or interaction.user.name @@ -174,7 +178,7 @@ async def _handle_milestone_submissions( INSERT INTO leaderboard.user_info (id, user_name) VALUES (%s, %s) """, - (str(SYSTEM_USER_ID), SYSTEM_USER_NAME), + (str(SYSTEM_USER_ID), get_system_user_name(None)), ) db.connection.commit() @@ -201,9 +205,11 @@ async def _handle_milestone_submissions( for milestone in milestones: milestone_filename = milestone["filename"] milestone_code = files[milestone_filename] + milestone_name = milestone["milestone_name"] # Create separate submission entry for each milestone with self.bot.leaderboard_db as db: + user_name, user_id = get_system_user_name(milestone_name) sub_id = db.create_submission( leaderboard=req.leaderboard, file_name=milestone_filename, @@ -222,7 +228,7 @@ async def _handle_milestone_submissions( milestone_code, milestone_filename, gpu, - reporter.add_run(f"{gpu.name} on {gpu.runner} for milestone {milestone['milestone_name']} (#{sub_id})"), + reporter.add_run(f"{gpu.name} on {gpu.runner} for milestone {milestone_name} (#{sub_id})"), req.task, SubmissionMode.MILESTONE, None, diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index 05d3e66c..ee38dac0 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -1,6 +1,7 @@ import dataclasses from enum import Enum, IntEnum from typing import Type +import hashlib class Timeout(IntEnum): @@ -154,5 +155,17 @@ class RankCriterion(Enum): torch """ SYSTEM_USER_ID = -123 -SYSTEM_USER_NAME = "KernelBot" + +def get_milestone_user_id(milestone_name: str | None = None) -> int: + if not milestone_name: + return SYSTEM_USER_ID + # Generate a consistent negative ID between -1000 and -100 based on the milestone name + hash_value = int(hashlib.md5(milestone_name.encode()).hexdigest(), 16) + return -100 - (hash_value % 900) # This ensures ID is between -1000 and -100 + +def get_system_user_name(milestone_name: str | None = None) -> tuple[str, int]: + if milestone_name: + return f"KernelBot - {milestone_name}", get_milestone_user_id(milestone_name) + return "KernelBot", SYSTEM_USER_ID + TIMEOUT_BUFFER_MINUTES = 2 diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py index 4d63d4ce..d057ab3b 100644 --- a/src/discord-cluster-manager/report.py +++ b/src/discord-cluster-manager/report.py @@ -227,8 +227,6 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n result.append("āŒ Milestone run failed") else: result.append("āœ… Milestone run successful") - elif full: - result.append("āŒ Milestone missing") return result From 341cbc99698e40c6025d70d63b9314cff51f510a Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 13 Jun 2025 13:46:03 -0400 Subject: [PATCH 6/7] remove milestone deletion --- src/discord-cluster-manager/cogs/admin_cog.py | 66 +------------------ src/discord-cluster-manager/leaderboard_db.py | 19 ------ 2 files changed, 1 insertion(+), 84 deletions(-) diff --git a/src/discord-cluster-manager/cogs/admin_cog.py b/src/discord-cluster-manager/cogs/admin_cog.py index 2620cc40..767f9f61 100644 --- a/src/discord-cluster-manager/cogs/admin_cog.py +++ b/src/discord-cluster-manager/cogs/admin_cog.py @@ -1343,68 +1343,4 @@ async def milestone_results( for msg in messages[1:]: await interaction.followup.send(msg) else: - await interaction.response.send_message(message) - - @app_commands.describe( - leaderboard_name="Name of the leaderboard", - milestone_name="Name of the milestone to delete" - ) - @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) - @with_error_handling - async def delete_milestone( - self, - interaction: discord.Interaction, - leaderboard_name: str, - milestone_name: str, - ): - if not await self.admin_check(interaction): - await send_discord_message( - interaction, - "You need to have Admin permissions to run this command", - ephemeral=True, - ) - return - - leaderboard = lookup_leaderboard(leaderboard_name, self.bot.leaderboard_db) - with self.bot.leaderboard_db as db: - milestones = db.get_leaderboard_milestones(leaderboard["id"]) - milestone = next((m for m in milestones if m["milestone_name"] == milestone_name), None) - - if not milestone: - await interaction.response.send_message(f"Milestone '{milestone_name}' not found") - return - - # Create confirmation dialog - async def do_delete(): - with self.bot.leaderboard_db as db: - db.delete_milestone(milestone["id"]) - await send_discord_message( - interaction, - f"šŸ’„ Milestone `{milestone_name}` from leaderboard `{leaderboard_name}` has been **deleted**.", - ephemeral=True, - ) - - async def no_delete(): - await send_discord_message( - interaction, - f"šŸ’¾ Milestone `{milestone_name}` has **not** been deleted.", - ephemeral=True, - ) - - confirm = ConfirmationView( - confirm_text="Delete", - confirm_callback=do_delete, - reject_text="Keep", - reject_callback=no_delete, - ) - - await interaction.response.send_message( - f"# Attention\nYou are about to **delete** milestone `{milestone_name}` from leaderboard `{leaderboard_name}`.\n" - f"This will also delete all associated runs. This action cannot be undone.\n\nšŸ’‚ Please confirm!" - ) - await send_discord_message( - interaction, - "", - view=confirm, - ephemeral=True, - ) \ No newline at end of file + await interaction.response.send_message(message) \ No newline at end of file diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index 5ef8834c..fd308197 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -348,25 +348,6 @@ def get_milestone_runs(self, milestone_id: int) -> list[dict]: for row in self.cursor.fetchall() ] - def delete_milestone(self, milestone_id: int) -> None: - """Delete a milestone and all associated runs""" - try: - # Delete milestone runs first (foreign key constraint) - self.cursor.execute( - "DELETE FROM leaderboard.milestone_runs WHERE milestone_id = %s", - (milestone_id,), - ) - # Delete the milestone - self.cursor.execute( - "DELETE FROM leaderboard.milestones WHERE id = %s", - (milestone_id,), - ) - self.connection.commit() - except psycopg2.Error as e: - self.connection.rollback() - logger.exception("Error deleting milestone", exc_info=e) - raise KernelBotError("Error deleting milestone") from e - def create_submission( self, leaderboard: str, From 33c9f826822e82a39ff2884c1391c8f538c02c59 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Fri, 13 Jun 2025 13:58:15 -0400 Subject: [PATCH 7/7] lint --- src/discord-cluster-manager/api/main.py | 6 +- src/discord-cluster-manager/cogs/admin_cog.py | 232 ++++++++++-------- .../cogs/leaderboard_cog.py | 50 ++-- .../cogs/submit_cog.py | 38 ++- src/discord-cluster-manager/consts.py | 2 +- src/discord-cluster-manager/leaderboard_db.py | 12 +- .../20250605_01_hwite-add-milestone-table.py | 2 +- src/discord-cluster-manager/report.py | 2 +- src/discord-cluster-manager/submission.py | 6 +- 9 files changed, 207 insertions(+), 143 deletions(-) diff --git a/src/discord-cluster-manager/api/main.py b/src/discord-cluster-manager/api/main.py index b3771fbf..9315279f 100644 --- a/src/discord-cluster-manager/api/main.py +++ b/src/discord-cluster-manager/api/main.py @@ -356,7 +356,11 @@ async def run_submission( # noqa: C901 all_leaderboards = [lb["name"] for lb in db.get_leaderboards()] if leaderboard_name not in all_leaderboards: raise HTTPException( - status_code=404, detail=f"Leaderboard '{leaderboard_name}' not found when trying to run submission." + status_code=404, + detail=( + f"Leaderboard '{leaderboard_name}' not found " + "when trying to run submission." + ) ) else: raise HTTPException( diff --git a/src/discord-cluster-manager/cogs/admin_cog.py b/src/discord-cluster-manager/cogs/admin_cog.py index 767f9f61..753d7d54 100644 --- a/src/discord-cluster-manager/cogs/admin_cog.py +++ b/src/discord-cluster-manager/cogs/admin_cog.py @@ -15,6 +15,7 @@ from discord import app_commands from discord.ext import commands, tasks from leaderboard_db import leaderboard_name_autocomplete +from submission import lookup_leaderboard from task import LeaderboardTask, make_task from ui.misc import ConfirmationView, DeleteConfirmationModal, GPUSelectionView from utils import ( @@ -26,7 +27,6 @@ setup_logging, with_error_handling, ) -from submission import lookup_leaderboard if TYPE_CHECKING: from ..bot import ClusterBot @@ -135,10 +135,6 @@ def __init__(self, bot: "ClusterBot"): name="milestone-results", description="Show results for a milestone" )(self.milestone_results) - self.delete_milestone = bot.admin_group.command( - name="delete-milestone", description="Delete a milestone and all its runs" - )(self.delete_milestone) - self._scheduled_cleanup_temp_users.start() # -------------------------------------------------------------------------- @@ -368,7 +364,7 @@ async def create_leaderboard_in_db( with self.bot.leaderboard_db as db: try: - leaderboard_id = db.create_leaderboard( + db.create_leaderboard( { "name": leaderboard_name, "deadline": date_value, @@ -391,13 +387,14 @@ async def create_leaderboard_in_db( try: await send_discord_message( interaction, - f"šŸš€ Leaderboard `{leaderboard_name}` created successfully! Auto-submitting {len(task.milestones)} milestone(s)...", + f"šŸš€ Leaderboard `{leaderboard_name}` created successfully! " + f"Auto-submitting {len(task.milestones)} milestone(s)...", ephemeral=True, ) - + # Call the underlying milestone submission logic directly await self._submit_milestones_directly(leaderboard_name, task, selected_gpus) - + await send_discord_message( interaction, f"āœ… Milestone submissions completed for `{leaderboard_name}`!", @@ -407,18 +404,24 @@ async def create_leaderboard_in_db( logger.exception("Error auto-submitting milestones for new leaderboard", exc_info=e) await send_discord_message( interaction, - f"āš ļø Leaderboard `{leaderboard_name}` created but milestone auto-submission failed: {str(e)}", + f"āš ļø Leaderboard `{leaderboard_name}` created but milestone " + f"auto-submission failed: {str(e)}", ephemeral=True, ) return True - async def _submit_milestones_directly(self, leaderboard_name: str, task: LeaderboardTask, selected_gpus: list[str]): + async def _submit_milestones_directly( + self, + leaderboard_name: str, + task: LeaderboardTask, + selected_gpus: list[str] + ): """Directly submit milestones without going through Discord command layer""" - from consts import SYSTEM_USER_ID, get_system_user_name, SubmissionMode, get_gpu_by_name - from submission import SubmissionRequest, prepare_submission + from consts import SYSTEM_USER_ID, SubmissionMode, get_gpu_by_name from report import RunProgressReporterAPI - + from submission import SubmissionRequest, prepare_submission + # Ensure system user exists in database with self.bot.leaderboard_db as db: db.cursor.execute( @@ -432,7 +435,7 @@ async def _submit_milestones_directly(self, leaderboard_name: str, task: Leaderb (str(user_id), user_name), ) db.connection.commit() - + # Prepare submission request for milestones req = SubmissionRequest( code="", # Not used for milestones @@ -441,19 +444,19 @@ async def _submit_milestones_directly(self, leaderboard_name: str, task: Leaderb gpus=selected_gpus, leaderboard=leaderboard_name, ) - + # Prepare the submission (validates leaderboard, deadline, etc.) processed_req = prepare_submission(req, self.bot.leaderboard_db, SubmissionMode.MILESTONE) - + # Convert GPU strings to GPU objects gpu_objects = [get_gpu_by_name(gpu) for gpu in selected_gpus] - + # Sync milestones to database leaderboard_item = lookup_leaderboard(leaderboard_name, self.bot.leaderboard_db) with self.bot.leaderboard_db as db: existing_milestones = db.get_leaderboard_milestones(leaderboard_item["id"]) existing_names = {m["milestone_name"] for m in existing_milestones} - + # Create any new milestones in the database for milestone in task.milestones: if milestone["milestone_name"] not in existing_names: @@ -461,23 +464,26 @@ async def _submit_milestones_directly(self, leaderboard_name: str, task: Leaderb leaderboard_item["id"], milestone["milestone_name"], milestone["filename"], - description=milestone.get("description", f"Milestone for {milestone['filename']}") + description=milestone.get( + "description", + f"Milestone for {milestone['filename']}" + ) ) - + # Get submit cog for the submission runner submit_cog = self.bot.get_cog("SubmitCog") if not submit_cog: raise Exception("SubmitCog not available") - + # Create separate submission for each milestone submission_ids = [] tasks = [] - + for milestone in task.milestones: milestone_filename = milestone["filename"] milestone_code = task.files[milestone_filename] milestone_name = milestone["milestone_name"] - + # Create separate submission entry for each milestone with self.bot.leaderboard_db as db: user_name, user_id = get_system_user_name(milestone_name) @@ -490,12 +496,14 @@ async def _submit_milestones_directly(self, leaderboard_name: str, task: Leaderb user_name=user_name, ) submission_ids.append(sub_id) - + # Create tasks for this milestone on all selected GPUs for gpu in gpu_objects: # Create a background reporter for this submission - reporter = RunProgressReporterAPI(f"Milestone {milestone['milestone_name']} on {gpu.name}") - + reporter = RunProgressReporterAPI( + f"Milestone {milestone['milestone_name']} on {gpu.name}" + ) + tasks.append( submit_cog.submit_leaderboard( sub_id, @@ -508,10 +516,10 @@ async def _submit_milestones_directly(self, leaderboard_name: str, task: Leaderb None, ) ) - + # Execute all milestone submissions await asyncio.gather(*tasks) - + # Mark all submissions as done with self.bot.leaderboard_db as db: for sub_id in submission_ids: @@ -1190,11 +1198,11 @@ async def submit_milestones( if not await self.admin_check(interaction): await send_discord_message( interaction, - "You do not have permission to submit milestones.", + "You do not have permission to submit milestones.", ephemeral=True ) return - + # Get the submit cog to access the submission logic submit_cog = self.bot.get_cog("SubmitCog") if not submit_cog: @@ -1204,14 +1212,14 @@ async def submit_milestones( ephemeral=True ) return - + # Get the submit group from the leaderboard cog submit_group = None for command in self.bot.leaderboard_group.commands: if hasattr(command, 'name') and command.name == "submit": submit_group = command break - + if not submit_group: await send_discord_message( interaction, @@ -1219,7 +1227,7 @@ async def submit_milestones( ephemeral=True ) return - + return await submit_group.submit( interaction, leaderboard_name, None, mode=SubmissionMode.MILESTONE, gpu=gpu ) @@ -1239,21 +1247,83 @@ async def list_milestones( ephemeral=True, ) return - + leaderboard = lookup_leaderboard(leaderboard_name, self.bot.leaderboard_db) with self.bot.leaderboard_db as db: milestones = db.get_leaderboard_milestones(leaderboard["id"]) - + if not milestones: await interaction.response.send_message(f"No milestones found for {leaderboard_name}") return - + message = f"**Milestones for {leaderboard_name}:**\n" for milestone in milestones: - message += f"• {milestone['milestone_name']} ({milestone['filename']}) - {milestone['description']}\n" - + message += ( + f"• {milestone['milestone_name']} " + f"({milestone['filename']}) - " + f"{milestone['description']}\n" + ) + await interaction.response.send_message(message) + async def _format_milestone_runs(self, runs: list[dict], max_runs: int = 5) -> str: + """Format milestone runs into a string message.""" + message = "" + for i, run in enumerate(runs[:max_runs], 1): + score = format_time(float(run['score']) * 1e9) if run['score'] else "N/A" + status = 'āœ…' if run['passed'] else 'āŒ' + message += ( + f" {i}. {run['user_name']} - {score} {status} " + f"(#{run['submission_id']})\n" + ) + + if len(runs) > max_runs: + message += f" _... and {len(runs) - max_runs} more runs_\n" + + return message + + def _format_milestone_section( + self, milestone: dict, runs: list[dict] + ) -> str: + """Format a single milestone section with its runs.""" + section = ( + f"šŸ“ **{milestone['milestone_name']}** " + f"({milestone['filename']}) | {milestone['description']}\n" + ) + + if not runs: + section += " _No runs found_\n\n" + return section + + section += self._format_milestone_runs(runs) + section += "\n" + return section + + def _create_milestone_messages( + self, leaderboard_name: str, milestones: list[dict] + ) -> list[str]: + """Create a list of messages for all milestones, splitting if needed.""" + messages = [] + current_message = f"**All Milestone Results for {leaderboard_name}:**\n\n" + + for milestone in milestones: + with self.bot.leaderboard_db as db: + runs = db.get_milestone_runs(milestone["id"]) + runs.sort(key=lambda x: x['submission_time'], reverse=True) + + milestone_section = self._format_milestone_section(milestone, runs) + + if len(current_message) + len(milestone_section) > 1900: + messages.append(current_message) + current_message = milestone_section + else: + current_message += milestone_section + + if current_message.strip(): + messages.append(current_message) + + return messages + @app_commands.describe(leaderboard_name="Name of the leaderboard") @app_commands.autocomplete(leaderboard_name=leaderboard_name_autocomplete) @with_error_handling @@ -1269,78 +1339,30 @@ async def milestone_results( ephemeral=True, ) return - + leaderboard = lookup_leaderboard(leaderboard_name, self.bot.leaderboard_db) with self.bot.leaderboard_db as db: milestones = db.get_leaderboard_milestones(leaderboard["id"]) - + if not milestones: - await interaction.response.send_message(f"No milestones found for {leaderboard_name}") + await interaction.response.send_message( + f"No milestones found for {leaderboard_name}" + ) return - + + # Create a single message if it fits within Discord's limit message = f"**All Milestone Results for {leaderboard_name}:**\n\n" - for milestone in milestones: with self.bot.leaderboard_db as db: runs = db.get_milestone_runs(milestone["id"]) - - message += f"šŸ“ **{milestone['milestone_name']}** ({milestone['filename']})\n" - - if not runs: - message += " _No runs found_\n\n" - continue - - # Show top 5 runs for each milestone - for i, run in enumerate(runs[:5], 1): - score = format_time(float(run['score']) * 1e9) if run['score'] else "N/A" - status = 'āœ…' if run['passed'] else 'āŒ' - message += f" {i}. {run['user_name']} - {score} {status} (#{run['submission_id']})\n" - - if len(runs) > 5: - message += f" _... and {len(runs) - 5} more runs_\n" - - message += "\n" - - # Split message if it's too long for Discord - if len(message) > 2000: - messages = [] - current_message = f"**All Milestone Results for {leaderboard_name}:**\n\n" - - for milestone in milestones: - with self.bot.leaderboard_db as db: - runs = db.get_milestone_runs(milestone["id"]) - # sort runs by submission time - runs.sort(key=lambda x: x['submission_time'], reverse=True) - - milestone_section = f"šŸ“ **{milestone['milestone_name']}** ({milestone['filename']}) | {milestone['description']}\n" - - if not runs: - milestone_section += " _No runs found_\n\n" - else: - for i, run in enumerate(runs[:5], 1): - score = format_time(float(run['score']) * 1e9) if run['score'] else "N/A" - status = 'āœ…' if run['passed'] else 'āŒ' - milestone_section += f"{i}. {run['user_name']} - {score} {status} (#{run['submission_id']})\n" - - if len(runs) > 5: - milestone_section += f"_... and {len(runs) - 5} more runs_\n" - - milestone_section += "\n" - - # Check if adding this milestone would exceed Discord's limit - if len(current_message) + len(milestone_section) > 1900: - messages.append(current_message) - current_message = milestone_section - else: - current_message += milestone_section - - # Add the last message - if current_message.strip(): - messages.append(current_message) - - # Send all messages - await interaction.response.send_message(messages[0]) - for msg in messages[1:]: - await interaction.followup.send(msg) - else: - await interaction.response.send_message(message) \ No newline at end of file + message += self._format_milestone_section(milestone, runs) + + if len(message) <= 2000: + await interaction.response.send_message(message) + return + + # If message is too long, split it into multiple messages + messages = self._create_milestone_messages(leaderboard_name, milestones) + await interaction.response.send_message(messages[0]) + for msg in messages[1:]: + await interaction.followup.send(msg) diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py index b274d287..22aab3c8 100644 --- a/src/discord-cluster-manager/cogs/leaderboard_cog.py +++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py @@ -6,9 +6,9 @@ import discord from consts import ( SYSTEM_USER_ID, - get_system_user_name, SubmissionMode, get_gpu_by_name, + get_system_user_name, ) from discord import app_commands from discord.ext import commands @@ -39,7 +39,7 @@ class LeaderboardSubmitCog(app_commands.Group): def __init__(self, bot: "ClusterBot"): super().__init__(name="submit", description="Submit to leaderboard") self.bot = bot - + async def select_gpu_view( self, @@ -86,7 +86,11 @@ async def on_submit_hook( # noqa: C901 interaction, "Could not decode your file. Is it UTF-8?", ephemeral=True ) return -1 - filename = script.filename if not mode == SubmissionMode.MILESTONE else "performance milestone" + filename = ( + script.filename + if not mode == SubmissionMode.MILESTONE + else "performance milestone" + ) req = SubmissionRequest( code=submission_content, file_name=filename, @@ -123,9 +127,13 @@ async def on_submit_hook( # noqa: C901 user_id = interaction.user.id user_name = interaction.user.global_name or interaction.user.name - run_msg = f"Milestone submissions for `{req.leaderboard}`" if mode == SubmissionMode.MILESTONE else f"Submission: `{filename}` for `{req.leaderboard}`" + run_msg = ( + f"Milestone submissions for `{req.leaderboard}`" + if mode == SubmissionMode.MILESTONE + else f"Submission: `{filename}` for `{req.leaderboard}`" + ) reporter = MultiProgressReporter(interaction, run_msg) - + try: if mode == SubmissionMode.MILESTONE: submission_ids = await self._handle_milestone_submissions( @@ -134,10 +142,10 @@ async def on_submit_hook( # noqa: C901 return submission_ids else: sub_id = await self._handle_regular_submission( - req, submission_content, filename, user_id, user_name, + req, submission_content, filename, user_id, user_name, selected_gpus, reporter, command, mode ) - + if mode == SubmissionMode.LEADERBOARD: await self.post_submit_hook(interaction, sub_id) return [sub_id] @@ -161,7 +169,7 @@ async def _handle_milestone_submissions( """Handle milestone submissions with separate submission IDs for each milestone""" milestones = req.task.milestones files = req.task.files - + # Ensure system user exists in database for milestone submissions with self.bot.leaderboard_db as db: # Check if system user exists @@ -181,13 +189,13 @@ async def _handle_milestone_submissions( (str(SYSTEM_USER_ID), get_system_user_name(None)), ) db.connection.commit() - + # Sync milestones to database leaderboard_item = lookup_leaderboard(req.leaderboard, self.bot.leaderboard_db) with self.bot.leaderboard_db as db: existing_milestones = db.get_leaderboard_milestones(leaderboard_item["id"]) existing_names = {m["milestone_name"] for m in existing_milestones} - + # Create any new milestones in the database for milestone in milestones: if milestone["milestone_name"] not in existing_names: @@ -195,18 +203,21 @@ async def _handle_milestone_submissions( leaderboard_item["id"], milestone["milestone_name"], milestone["filename"], - description=milestone.get("description", f"Milestone for {milestone['filename']}") + description=milestone.get( + "description", + f"Milestone for {milestone['filename']}" + ) ) # Create separate submission for each milestone submission_ids = [] tasks = [] - + for milestone in milestones: milestone_filename = milestone["filename"] milestone_code = files[milestone_filename] milestone_name = milestone["milestone_name"] - + # Create separate submission entry for each milestone with self.bot.leaderboard_db as db: user_name, user_id = get_system_user_name(milestone_name) @@ -219,7 +230,7 @@ async def _handle_milestone_submissions( user_name=user_name, ) submission_ids.append(sub_id) - + # Create tasks for this milestone on all selected GPUs for gpu in selected_gpus: tasks.append( @@ -228,19 +239,22 @@ async def _handle_milestone_submissions( milestone_code, milestone_filename, gpu, - reporter.add_run(f"{gpu.name} on {gpu.runner} for milestone {milestone_name} (#{sub_id})"), + reporter.add_run( + f"{gpu.name} on {gpu.runner} for milestone " + f"{milestone_name} (#{sub_id})" + ), req.task, SubmissionMode.MILESTONE, None, ) ) - + await reporter.show() await asyncio.gather(*tasks) return submission_ids async def _handle_regular_submission( - self, req, submission_content, filename, user_id, user_name, + self, req, submission_content, filename, user_id, user_name, selected_gpus, reporter, command, mode ): """Handle regular submissions with a single submission ID""" @@ -284,7 +298,7 @@ async def _handle_regular_submission( ) for gpu in selected_gpus ] - + await reporter.show() await asyncio.gather(*tasks) return sub_id diff --git a/src/discord-cluster-manager/cogs/submit_cog.py b/src/discord-cluster-manager/cogs/submit_cog.py index 4102694a..484c4e76 100644 --- a/src/discord-cluster-manager/cogs/submit_cog.py +++ b/src/discord-cluster-manager/cogs/submit_cog.py @@ -105,11 +105,19 @@ async def submit_leaderboard( # noqa: C901 score = None # Calculate score for both leaderboard and milestone runs score_run_key = None - if "leaderboard" in result.runs and result.runs["leaderboard"].run.success and result.runs["leaderboard"].run.passed: + if ( + "leaderboard" in result.runs + and result.runs["leaderboard"].run.success + and result.runs["leaderboard"].run.passed + ): score_run_key = "leaderboard" - elif "milestone" in result.runs and result.runs["milestone"].run.success and result.runs["milestone"].run.passed: + elif ( + "milestone" in result.runs + and result.runs["milestone"].run.success + and result.runs["milestone"].run.passed + ): score_run_key = "milestone" - + if score_run_key: score = 0.0 num_benchmarks = int(result.runs[score_run_key].run.result["benchmark-count"]) @@ -144,9 +152,12 @@ async def submit_leaderboard( # noqa: C901 for key, value in result.runs.items(): # Assign score for leaderboard and milestone runs run_score = None - if key == "leaderboard" or (key == "milestone" and mode == SubmissionMode.MILESTONE): + if ( + key == "leaderboard" + or (key == "milestone" and mode == SubmissionMode.MILESTONE) + ): run_score = score - + run_id = db.create_submission_run( submission_id, value.start, @@ -159,19 +170,28 @@ async def submit_leaderboard( # noqa: C901 result=value.run, system=result.system, ) - + # If this is a milestone submission, record the milestone run if mode == SubmissionMode.MILESTONE and run_id: # Get submission data to find the leaderboard submission_data = db.get_submission_by_id(submission_id) if submission_data: - leaderboard = db.get_leaderboard(submission_data["leaderboard_name"]) + leaderboard = db.get_leaderboard( + submission_data["leaderboard_name"] + ) if leaderboard: # Find the milestone ID based on the filename milestones = db.get_leaderboard_milestones(leaderboard["id"]) - milestone = next((m for m in milestones if m["filename"] == name), None) + milestone = next( + (m for m in milestones if m["filename"] == name), + None + ) if milestone: - db.record_milestone_run(milestone["id"], submission_id, run_id) + db.record_milestone_run( + milestone["id"], + submission_id, + run_id + ) return result diff --git a/src/discord-cluster-manager/consts.py b/src/discord-cluster-manager/consts.py index ee38dac0..ef8d33d4 100644 --- a/src/discord-cluster-manager/consts.py +++ b/src/discord-cluster-manager/consts.py @@ -1,7 +1,7 @@ import dataclasses +import hashlib from enum import Enum, IntEnum from typing import Type -import hashlib class Timeout(IntEnum): diff --git a/src/discord-cluster-manager/leaderboard_db.py b/src/discord-cluster-manager/leaderboard_db.py index fd308197..27b7609a 100644 --- a/src/discord-cluster-manager/leaderboard_db.py +++ b/src/discord-cluster-manager/leaderboard_db.py @@ -187,9 +187,9 @@ def delete_leaderboard(self, leaderboard_name: str, force: bool = False): # Delete milestone runs first (they reference milestones) self.cursor.execute( """ - DELETE FROM leaderboard.milestone_runs + DELETE FROM leaderboard.milestone_runs WHERE milestone_id IN ( - SELECT id FROM leaderboard.milestones + SELECT id FROM leaderboard.milestones WHERE leaderboard_id = %s ) """, @@ -240,8 +240,6 @@ def delete_leaderboard(self, leaderboard_name: str, force: bool = False): logger.exception("Could not delete leaderboard %s.", leaderboard_name, exc_info=e) raise KernelBotError(f"Could not delete leaderboard {leaderboard_name}.") from e - - def create_milestone( self, leaderboard_id: int, @@ -253,7 +251,9 @@ def create_milestone( try: self.cursor.execute( """ - INSERT INTO leaderboard.milestones (leaderboard_id, milestone_name, filename, description) + INSERT INTO leaderboard.milestones ( + leaderboard_id, milestone_name, filename, description + ) VALUES (%s, %s, %s, %s) RETURNING id """, @@ -314,7 +314,7 @@ def get_milestone_runs(self, milestone_id: int) -> list[dict]: """Get all runs for a specific milestone""" self.cursor.execute( """ - SELECT + SELECT mr.id, mr.submission_id, mr.run_id, diff --git a/src/discord-cluster-manager/migrations/20250605_01_hwite-add-milestone-table.py b/src/discord-cluster-manager/migrations/20250605_01_hwite-add-milestone-table.py index 18aa44ef..1368bd50 100644 --- a/src/discord-cluster-manager/migrations/20250605_01_hwite-add-milestone-table.py +++ b/src/discord-cluster-manager/migrations/20250605_01_hwite-add-milestone-table.py @@ -4,7 +4,7 @@ from yoyo import step -__depends__ = {"20250506_01_38PkG-add-index-on-runs-runner-score"} # Update with the latest migration +__depends__ = {"20250506_01_38PkG-add-index-on-runs-runner-score"} # Update to latest migration steps = [ step(""" diff --git a/src/discord-cluster-manager/report.py b/src/discord-cluster-manager/report.py index d057ab3b..f626aa08 100644 --- a/src/discord-cluster-manager/report.py +++ b/src/discord-cluster-manager/report.py @@ -218,7 +218,7 @@ def make_short_report(runs: dict[str, EvalResult], full=True) -> list[str]: # n result.append("āœ… Leaderboard run successful") elif full: result.append("āŒ Leaderboard missing") - + if "milestone" in runs: ms_run = runs["milestone"].run if not ms_run.success: diff --git a/src/discord-cluster-manager/submission.py b/src/discord-cluster-manager/submission.py index 4082ff8d..6845c6dc 100644 --- a/src/discord-cluster-manager/submission.py +++ b/src/discord-cluster-manager/submission.py @@ -26,7 +26,11 @@ class ProcessedSubmissionRequest(SubmissionRequest): task_gpus: list -def prepare_submission(req: SubmissionRequest, lb_db: LeaderboardDB, mode: SubmissionMode) -> ProcessedSubmissionRequest: +def prepare_submission( + req: SubmissionRequest, + lb_db: LeaderboardDB, + mode: SubmissionMode +) -> ProcessedSubmissionRequest: if profanity.contains_profanity(req.file_name): raise KernelBotError("Please provide a non rude filename") if mode != SubmissionMode.MILESTONE: