pytorch · rzabarazesh · Oct 14, 2025 · Oct 3, 2025 · Oct 6, 2025 · Oct 6, 2025
diff --git a/torchci/clickhouse_queries/vllm/ci_reliability/params.json b/torchci/clickhouse_queries/vllm/ci_reliability/params.json
@@ -0,0 +1,18 @@
+{
+  "params": {
+    "granularity": "String",
+    "repo": "String",
+    "pipelineName": "String",
+    "startTime": "DateTime64(3)",
+    "stopTime": "DateTime64(3)"
+  },
+  "tests": [
+    {
+      "granularity": "day",
+      "repo": "https://github.com/vllm-project/vllm.git",
+      "pipelineName": "CI",
+      "startTime": "2025-09-26T00:00:00.000",
+      "stopTime": "2025-10-03T00:00:00.000"
+    }
+  ]
+}
diff --git a/torchci/clickhouse_queries/vllm/ci_reliability/query.sql b/torchci/clickhouse_queries/vllm/ci_reliability/query.sql
@@ -0,0 +1,89 @@
+-- vLLM CI reliability metrics (main branch only)
+-- Computes CI success rate, failure rate over time for Buildkite builds
+-- Daily breakdown of build states (passed, failed, canceled)
+-- Accounts for soft failures: builds with only soft failures count as successful
+-- Only tracks main branch to exclude work-in-progress PR noise
+
+WITH build_jobs AS (
+    SELECT
+        tupleElement(pipeline, 'repository') AS repository,
+        tupleElement(pipeline, 'name') AS pipeline_name,
+        toUInt32(tupleElement(build, 'number')) AS build_number,
+        tupleElement(build, 'started_at') AS build_started_at,
+        tupleElement(build, 'finished_at') AS build_finished_at,
+        tupleElement(build, 'state') AS build_state,
+        tupleElement(job, 'state') AS job_state,
+        tupleElement(job, 'soft_failed') AS soft_failed,
+        formatDateTime(
+            DATE_TRUNC(
+                {granularity: String },
+                tupleElement(build, 'started_at')
+            ),
+            '%Y-%m-%d'
+        ) AS bucket
+    FROM vllm.vllm_buildkite_jobs
+    WHERE
+        tupleElement(pipeline, 'repository') = {repo: String }
+        AND tupleElement(pipeline, 'name') = {pipelineName: String }
+        AND tupleElement(build, 'branch') = 'main'
+        AND tupleElement(build, 'started_at') IS NOT NULL
+        AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
+        AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
+),
+
+builds AS (
+    SELECT
+        repository,
+        pipeline_name,
+        build_number,
+        any(build_started_at) AS build_started_at,
+        any(build_finished_at) AS build_finished_at,
+        any(build_state) AS build_state,
+        any(bucket) AS bucket,
+        -- Count hard failures: job.state='failed' AND soft_failed=false
+        countIf(lowerUTF8(job_state) = 'failed' AND soft_failed = FALSE)
+            AS hard_failures,
+        -- A build is successful if it has no hard failures
+        -- (even if it has soft failures)
+        if(
+            hard_failures = 0
+            AND lowerUTF8(build_state) NOT IN ('canceled', 'cancelled'),
+            1,
+            0
+        ) AS is_success,
+        if(lowerUTF8(build_state) IN ('canceled', 'cancelled'), 1, 0)
+            AS is_canceled
+    FROM build_jobs
+    GROUP BY
+        repository,
+        pipeline_name,
+        build_number
+),
+
+daily_stats AS (
+    SELECT
+        bucket,
+        sum(is_success) AS passed_count,
+        count() - sum(is_success) - sum(is_canceled) AS failed_count,
+        sum(is_canceled) AS canceled_count,
+        count() AS total_count,
+        count() - sum(is_canceled) AS non_canceled_count,
+        if(
+            non_canceled_count > 0,
+            round(passed_count / non_canceled_count, 4),
+            NULL
+        ) AS success_rate
+    FROM builds
+    GROUP BY bucket
+)
+
+SELECT
+    bucket AS granularity_bucket,
+    passed_count,
+    failed_count,
+    canceled_count,
+    total_count,
+    non_canceled_count,
+    success_rate
+FROM daily_stats
+ORDER BY granularity_bucket ASC
diff --git a/torchci/clickhouse_queries/vllm/ci_run_duration/params.json b/torchci/clickhouse_queries/vllm/ci_run_duration/params.json
@@ -0,0 +1,16 @@
+{
+  "params": {
+    "repo": "String",
+    "pipelineName": "String",
+    "startTime": "DateTime64(3)",
+    "stopTime": "DateTime64(3)"
+  },
+  "tests": [
+    {
+      "repo": "vllm-project/vllm",
+      "pipelineName": "CI",
+      "startTime": "2025-09-26T00:00:00.000",
+      "stopTime": "2025-10-03T00:00:00.000"
+    }
+  ]
+}
diff --git a/torchci/clickhouse_queries/vllm/ci_run_duration/query.sql b/torchci/clickhouse_queries/vllm/ci_run_duration/query.sql
@@ -0,0 +1,34 @@
+-- vLLM CI run durations (main branch only)
+-- Lists per-build durations based on build.started_at and build.finished_at
+-- Only tracks main branch to exclude work-in-progress PR noise
+
+WITH b AS (
+    SELECT
+        tupleElement(pipeline, 'repository') AS repository,
+        tupleElement(pipeline, 'name') AS pipeline_name,
+        toUInt32(tupleElement(build, 'number')) AS build_number,
+        tupleElement(build, 'started_at') AS build_started_at,
+        tupleElement(build, 'finished_at') AS build_finished_at,
+        tupleElement(build, 'state') AS build_state
+    FROM vllm.vllm_buildkite_jobs
+    WHERE
+        tupleElement(pipeline, 'repository') = {repo: String }
+        AND tupleElement(pipeline, 'name') = {pipelineName: String }
+        AND tupleElement(build, 'branch') = 'main'
+        AND tupleElement(build, 'started_at') IS NOT NULL
+        AND tupleElement(build, 'finished_at') IS NOT NULL
+        AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
+        AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
+)
+
+SELECT
+    pipeline_name,
+    build_number,
+    max(build_started_at) AS started_at,
+    max(build_finished_at) AS finished_at,
+    any(build_state) AS build_state,
+    dateDiff('second', started_at, finished_at) AS duration_seconds,
+    round(duration_seconds / 3600.0, 3) AS duration_hours
+FROM b
+GROUP BY pipeline_name, build_number
+ORDER BY started_at ASC
diff --git a/torchci/clickhouse_queries/vllm/job_first_failure/params.json b/torchci/clickhouse_queries/vllm/job_first_failure/params.json
@@ -0,0 +1,16 @@
+{
+  "params": {
+    "repo": "String",
+    "pipelineName": "String",
+    "jobNames": "Array(String)",
+    "lookbackDays": "UInt32"
+  },
+  "tests": [
+    {
+      "repo": "https://github.com/vllm-project/vllm.git",
+      "pipelineName": "CI",
+      "jobNames": ["Basic Correctness Test", "Neuron Test", "TPU V1 Test"],
+      "lookbackDays": 60
+    }
+  ]
+}
diff --git a/torchci/clickhouse_queries/vllm/job_first_failure/query.sql b/torchci/clickhouse_queries/vllm/job_first_failure/query.sql
@@ -0,0 +1,78 @@
+-- Find most recent failure and first break for each job
+
+WITH most_recent_any_failure AS (
+    SELECT
+        tupleElement(job, 'name') AS job_name,
+        argMax(toUInt32(tupleElement(build, 'number')), tupleElement(build, 'started_at')) AS recent_failed_build,
+        max(tupleElement(build, 'started_at')) AS recent_failed_at
+    FROM vllm.vllm_buildkite_jobs
+    WHERE
+        tupleElement(pipeline, 'repository') = {repo: String}
+        AND tupleElement(pipeline, 'name') = {pipelineName: String}
+        AND tupleElement(build, 'branch') = 'main'
+        AND tupleElement(job, 'name') IN {jobNames: Array(String)}
+        AND tupleElement(build, 'started_at') IS NOT NULL
+        AND tupleElement(build, 'started_at') >= now() - INTERVAL {lookbackDays: UInt32} DAY
+        AND tupleElement(build, 'started_at') < now()
+        AND lowerUTF8(tupleElement(job, 'state')) = 'failed'
+    GROUP BY job_name
+),
+
+-- Get all job runs with success/failure state
+all_job_runs AS (
+    SELECT
+        tupleElement(job, 'name') AS job_name,
+        toUInt32(tupleElement(build, 'number')) AS build_number,
+        tupleElement(build, 'started_at') AS build_started_at,
+        tupleElement(job, 'state') AS job_state,
+        tupleElement(job, 'soft_failed') AS soft_failed,
+        -- Success if passed OR soft failure
+        if(
+            lowerUTF8(job_state) IN ('passed', 'finished', 'success')
+            OR (lowerUTF8(job_state) = 'failed' AND soft_failed = true),
+            1,
+            if(lowerUTF8(job_state) = 'failed' AND soft_failed = false, 0, -1)
+        ) AS is_success
+    FROM vllm.vllm_buildkite_jobs
+    WHERE
+        tupleElement(pipeline, 'repository') = {repo: String}
+        AND tupleElement(pipeline, 'name') = {pipelineName: String}
+        AND tupleElement(build, 'branch') = 'main'
+        AND tupleElement(job, 'name') IN {jobNames: Array(String)}
+        AND tupleElement(build, 'started_at') IS NOT NULL
+        AND tupleElement(build, 'started_at') >= now() - INTERVAL {lookbackDays: UInt32} DAY
+        AND tupleElement(build, 'started_at') < now()
+        AND is_success IN (0, 1)
+),
+
+all_runs_with_prev AS (
+    SELECT
+        job_name,
+        build_number,
+        build_started_at,
+        is_success,
+        lagInFrame(is_success) OVER (PARTITION BY job_name ORDER BY build_started_at) AS prev_is_success
+    FROM all_job_runs
+),
+
+-- Find most recent success->failure transition for each job
+first_break_per_job AS (
+    SELECT
+        job_name,
+        build_number AS first_break_build,
+        build_started_at AS first_break_at,
+        ROW_NUMBER() OVER (PARTITION BY job_name ORDER BY build_started_at DESC) AS rn
+    FROM all_runs_with_prev
+    WHERE is_success = 0 AND prev_is_success = 1
+)
+
+-- Combine recent failure and first break info (URLs constructed client-side from build numbers)
+SELECT
+    a.job_name AS job_name,
+    a.recent_failed_build AS recent_failed_build,
+    a.recent_failed_at AS recent_failed_at,
+    b.first_break_build AS first_break_build,
+    b.first_break_at AS first_break_at
+FROM most_recent_any_failure a
+LEFT JOIN first_break_per_job b ON a.job_name = b.job_name AND b.rn = 1
+ORDER BY a.recent_failed_at DESC
diff --git a/torchci/clickhouse_queries/vllm/job_reliability/params.json b/torchci/clickhouse_queries/vllm/job_reliability/params.json
@@ -0,0 +1,18 @@
+{
+  "params": {
+    "repo": "String",
+    "pipelineName": "String",
+    "startTime": "DateTime64(3)",
+    "stopTime": "DateTime64(3)",
+    "minRuns": "UInt32"
+  },
+  "tests": [
+    {
+      "repo": "https://github.com/vllm-project/vllm.git",
+      "pipelineName": "CI",
+      "startTime": "2025-09-26T00:00:00.000",
+      "stopTime": "2025-10-03T00:00:00.000",
+      "minRuns": 3
+    }
+  ]
+}
diff --git a/torchci/clickhouse_queries/vllm/job_reliability/query.sql b/torchci/clickhouse_queries/vllm/job_reliability/query.sql
@@ -0,0 +1,71 @@
+-- vLLM per-job reliability metrics (main branch only)
+-- Computes success rate for each individual job in the CI pipeline
+-- Shows which jobs are most/least reliable
+-- Only tracks main branch to exclude work-in-progress PR noise
+
+WITH jobs AS (
+    SELECT
+        tupleElement(pipeline, 'repository') AS repository,
+        tupleElement(pipeline, 'name') AS pipeline_name,
+        toUInt32(tupleElement(build, 'number')) AS build_number,
+        tupleElement(job, 'name') AS job_name,
+        tupleElement(job, 'state') AS job_state,
+        tupleElement(job, 'soft_failed') AS soft_failed,
+        tupleElement(job, 'finished_at') AS job_finished_at
+    FROM vllm.vllm_buildkite_jobs
+    WHERE
+        tupleElement(pipeline, 'repository') = {repo: String }
+        AND tupleElement(pipeline, 'name') = {pipelineName: String }
+        AND tupleElement(build, 'branch') = 'main'
+        AND tupleElement(job, 'finished_at') IS NOT NULL
+        AND tupleElement(job, 'finished_at') >= {startTime: DateTime64(3) }
+        AND tupleElement(job, 'finished_at') < {stopTime: DateTime64(3) }
+),
+
+job_stats AS (
+    SELECT
+        job_name,
+        -- Count clean successes: passed jobs only
+        countIf(
+            lowerUTF8(job_state) IN ('passed', 'finished', 'success')
+        ) AS passed_count,
+        -- Count soft failures: failed but soft_failed=true (flaky tests)
+        countIf(
+            lowerUTF8(job_state) = 'failed' AND soft_failed = TRUE
+        ) AS soft_failed_count,
+        -- Count hard failures: failed jobs with soft_failed=false
+        countIf(
+            lowerUTF8(job_state) = 'failed' AND soft_failed = FALSE
+        ) AS failed_count,
+        countIf(lowerUTF8(job_state) IN ('canceled', 'cancelled'))
+            AS canceled_count,
+        passed_count
+        + soft_failed_count
+        + failed_count
+        + canceled_count AS total_count,
+        passed_count + soft_failed_count + failed_count AS non_canceled_count,
+        -- Success rate = ONLY clean passes / (all non-canceled)
+        -- This shows true reliability (soft failures don't count as success for job reliability)
+        if(
+            non_canceled_count > 0,
+            round(passed_count / non_canceled_count, 4),
+            NULL
+        ) AS success_rate
+    FROM jobs
+    GROUP BY job_name
+    HAVING non_canceled_count >= {minRuns: UInt32}
+)
+
+SELECT
+    job_name,
+    passed_count,
+    soft_failed_count,
+    failed_count,
+    canceled_count,
+    total_count,
+    non_canceled_count,
+    success_rate
+FROM job_stats
+ORDER BY
+    success_rate ASC,
+    non_canceled_count DESC
diff --git a/torchci/clickhouse_queries/vllm/job_retry_stats/params.json b/torchci/clickhouse_queries/vllm/job_retry_stats/params.json
@@ -0,0 +1,18 @@
+{
+  "params": {
+    "repo": "String",
+    "pipelineName": "String",
+    "startTime": "DateTime64(3)",
+    "stopTime": "DateTime64(3)",
+    "minRuns": "UInt32"
+  },
+  "tests": [
+    {
+      "repo": "https://github.com/vllm-project/vllm.git",
+      "pipelineName": "CI",
+      "startTime": "2025-09-22T00:00:00.000",
+      "stopTime": "2025-09-29T00:00:00.000",
+      "minRuns": 5
+    }
+  ]
+}
diff --git a/torchci/clickhouse_queries/vllm/job_retry_stats/query.sql b/torchci/clickhouse_queries/vllm/job_retry_stats/query.sql
@@ -0,0 +1,24 @@
+-- vLLM job retry statistics
+-- Shows which jobs are retried most often
+
+SELECT
+    tupleElement(job, 'name') AS job_name,
+    count(*) AS total_runs,
+    countIf(tupleElement(job, 'retried') = true) AS retried_count,
+    if(
+        total_runs > 0,
+        round(retried_count / total_runs, 4),
+        null
+    ) AS retry_rate
+FROM vllm.vllm_buildkite_jobs
+WHERE
+    tupleElement(pipeline, 'repository') = {repo: String}
+    AND tupleElement(pipeline, 'name') = {pipelineName: String}
+    AND tupleElement(build, 'branch') = 'main'
+    AND tupleElement(build, 'started_at') IS NOT null
+    AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3)}
+    AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3)}
+GROUP BY job_name
+HAVING total_runs >= {minRuns: UInt32}
+ORDER BY retry_rate DESC, retried_count DESC
+LIMIT 10