Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 18 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_reliability/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"params": {
"granularity": "String",
"repo": "String",
"pipelineName": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)"
},
"tests": [
{
"granularity": "day",
"repo": "https://github.com/vllm-project/vllm.git",
"pipelineName": "CI",
"startTime": "2025-09-26T00:00:00.000",
"stopTime": "2025-10-03T00:00:00.000"
}
]
}
89 changes: 89 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_reliability/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
-- vLLM CI reliability metrics (main branch only)
-- Computes CI success rate, failure rate over time for Buildkite builds
-- Daily breakdown of build states (passed, failed, canceled)
-- Accounts for soft failures: builds with only soft failures count as successful
-- Only tracks main branch to exclude work-in-progress PR noise

WITH build_jobs AS (
SELECT
tupleElement(pipeline, 'repository') AS repository,
tupleElement(pipeline, 'name') AS pipeline_name,
toUInt32(tupleElement(build, 'number')) AS build_number,
tupleElement(build, 'started_at') AS build_started_at,
tupleElement(build, 'finished_at') AS build_finished_at,
tupleElement(build, 'state') AS build_state,
tupleElement(job, 'state') AS job_state,
tupleElement(job, 'soft_failed') AS soft_failed,
formatDateTime(
DATE_TRUNC(
{granularity: String },
tupleElement(build, 'started_at')
),
'%Y-%m-%d'
) AS bucket
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String }
AND tupleElement(pipeline, 'name') = {pipelineName: String }
AND tupleElement(build, 'branch') = 'main'
AND tupleElement(build, 'started_at') IS NOT NULL
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
),

builds AS (
SELECT
repository,
pipeline_name,
build_number,
any(build_started_at) AS build_started_at,
any(build_finished_at) AS build_finished_at,
any(build_state) AS build_state,
any(bucket) AS bucket,
-- Count hard failures: job.state='failed' AND soft_failed=false
countIf(lowerUTF8(job_state) = 'failed' AND soft_failed = FALSE)
AS hard_failures,
-- A build is successful if it has no hard failures
-- (even if it has soft failures)
if(
hard_failures = 0
AND lowerUTF8(build_state) NOT IN ('canceled', 'cancelled'),
1,
0
) AS is_success,
if(lowerUTF8(build_state) IN ('canceled', 'cancelled'), 1, 0)
AS is_canceled
FROM build_jobs
GROUP BY
repository,
pipeline_name,
build_number
),

daily_stats AS (
SELECT
bucket,
sum(is_success) AS passed_count,
count() - sum(is_success) - sum(is_canceled) AS failed_count,
sum(is_canceled) AS canceled_count,
count() AS total_count,
count() - sum(is_canceled) AS non_canceled_count,
if(
non_canceled_count > 0,
round(passed_count / non_canceled_count, 4),
NULL
) AS success_rate
FROM builds
GROUP BY bucket
)

SELECT
bucket AS granularity_bucket,
passed_count,
failed_count,
canceled_count,
total_count,
non_canceled_count,
success_rate
FROM daily_stats
ORDER BY granularity_bucket ASC
16 changes: 16 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_run_duration/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"params": {
"repo": "String",
"pipelineName": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)"
},
"tests": [
{
"repo": "vllm-project/vllm",
"pipelineName": "CI",
"startTime": "2025-09-26T00:00:00.000",
"stopTime": "2025-10-03T00:00:00.000"
}
]
}
34 changes: 34 additions & 0 deletions torchci/clickhouse_queries/vllm/ci_run_duration/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
-- vLLM CI run durations (main branch only)
-- Lists per-build durations based on build.started_at and build.finished_at
-- Only tracks main branch to exclude work-in-progress PR noise

WITH b AS (
SELECT
tupleElement(pipeline, 'repository') AS repository,
tupleElement(pipeline, 'name') AS pipeline_name,
toUInt32(tupleElement(build, 'number')) AS build_number,
tupleElement(build, 'started_at') AS build_started_at,
tupleElement(build, 'finished_at') AS build_finished_at,
tupleElement(build, 'state') AS build_state
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String }
AND tupleElement(pipeline, 'name') = {pipelineName: String }
AND tupleElement(build, 'branch') = 'main'
AND tupleElement(build, 'started_at') IS NOT NULL
AND tupleElement(build, 'finished_at') IS NOT NULL
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3) }
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3) }
)

SELECT
pipeline_name,
build_number,
max(build_started_at) AS started_at,
max(build_finished_at) AS finished_at,
any(build_state) AS build_state,
dateDiff('second', started_at, finished_at) AS duration_seconds,
round(duration_seconds / 3600.0, 3) AS duration_hours
FROM b
GROUP BY pipeline_name, build_number
ORDER BY started_at ASC
16 changes: 16 additions & 0 deletions torchci/clickhouse_queries/vllm/job_first_failure/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"params": {
"repo": "String",
"pipelineName": "String",
"jobNames": "Array(String)",
"lookbackDays": "UInt32"
},
"tests": [
{
"repo": "https://github.com/vllm-project/vllm.git",
"pipelineName": "CI",
"jobNames": ["Basic Correctness Test", "Neuron Test", "TPU V1 Test"],
"lookbackDays": 60
}
]
}
78 changes: 78 additions & 0 deletions torchci/clickhouse_queries/vllm/job_first_failure/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
-- Find most recent failure and first break for each job

WITH most_recent_any_failure AS (
SELECT
tupleElement(job, 'name') AS job_name,
argMax(toUInt32(tupleElement(build, 'number')), tupleElement(build, 'started_at')) AS recent_failed_build,
max(tupleElement(build, 'started_at')) AS recent_failed_at
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String}
AND tupleElement(pipeline, 'name') = {pipelineName: String}
AND tupleElement(build, 'branch') = 'main'
AND tupleElement(job, 'name') IN {jobNames: Array(String)}
AND tupleElement(build, 'started_at') IS NOT NULL
AND tupleElement(build, 'started_at') >= now() - INTERVAL {lookbackDays: UInt32} DAY
AND tupleElement(build, 'started_at') < now()
AND lowerUTF8(tupleElement(job, 'state')) = 'failed'
GROUP BY job_name
),

-- Get all job runs with success/failure state
all_job_runs AS (
SELECT
tupleElement(job, 'name') AS job_name,
toUInt32(tupleElement(build, 'number')) AS build_number,
tupleElement(build, 'started_at') AS build_started_at,
tupleElement(job, 'state') AS job_state,
tupleElement(job, 'soft_failed') AS soft_failed,
-- Success if passed OR soft failure
if(
lowerUTF8(job_state) IN ('passed', 'finished', 'success')
OR (lowerUTF8(job_state) = 'failed' AND soft_failed = true),
1,
if(lowerUTF8(job_state) = 'failed' AND soft_failed = false, 0, -1)
) AS is_success
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String}
AND tupleElement(pipeline, 'name') = {pipelineName: String}
AND tupleElement(build, 'branch') = 'main'
AND tupleElement(job, 'name') IN {jobNames: Array(String)}
AND tupleElement(build, 'started_at') IS NOT NULL
AND tupleElement(build, 'started_at') >= now() - INTERVAL {lookbackDays: UInt32} DAY
AND tupleElement(build, 'started_at') < now()
AND is_success IN (0, 1)
),

all_runs_with_prev AS (
SELECT
job_name,
build_number,
build_started_at,
is_success,
lagInFrame(is_success) OVER (PARTITION BY job_name ORDER BY build_started_at) AS prev_is_success
FROM all_job_runs
),

-- Find most recent success->failure transition for each job
first_break_per_job AS (
SELECT
job_name,
build_number AS first_break_build,
build_started_at AS first_break_at,
ROW_NUMBER() OVER (PARTITION BY job_name ORDER BY build_started_at DESC) AS rn
FROM all_runs_with_prev
WHERE is_success = 0 AND prev_is_success = 1
)

-- Combine recent failure and first break info (URLs constructed client-side from build numbers)
SELECT
a.job_name AS job_name,
a.recent_failed_build AS recent_failed_build,
a.recent_failed_at AS recent_failed_at,
b.first_break_build AS first_break_build,
b.first_break_at AS first_break_at
FROM most_recent_any_failure a
LEFT JOIN first_break_per_job b ON a.job_name = b.job_name AND b.rn = 1
ORDER BY a.recent_failed_at DESC
18 changes: 18 additions & 0 deletions torchci/clickhouse_queries/vllm/job_reliability/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"params": {
"repo": "String",
"pipelineName": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)",
"minRuns": "UInt32"
},
"tests": [
{
"repo": "https://github.com/vllm-project/vllm.git",
"pipelineName": "CI",
"startTime": "2025-09-26T00:00:00.000",
"stopTime": "2025-10-03T00:00:00.000",
"minRuns": 3
}
]
}
71 changes: 71 additions & 0 deletions torchci/clickhouse_queries/vllm/job_reliability/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
-- vLLM per-job reliability metrics (main branch only)
-- Computes success rate for each individual job in the CI pipeline
-- Shows which jobs are most/least reliable
-- Only tracks main branch to exclude work-in-progress PR noise

WITH jobs AS (
SELECT
tupleElement(pipeline, 'repository') AS repository,
tupleElement(pipeline, 'name') AS pipeline_name,
toUInt32(tupleElement(build, 'number')) AS build_number,
tupleElement(job, 'name') AS job_name,
tupleElement(job, 'state') AS job_state,
tupleElement(job, 'soft_failed') AS soft_failed,
tupleElement(job, 'finished_at') AS job_finished_at
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String }
AND tupleElement(pipeline, 'name') = {pipelineName: String }
AND tupleElement(build, 'branch') = 'main'
AND tupleElement(job, 'finished_at') IS NOT NULL
AND tupleElement(job, 'finished_at') >= {startTime: DateTime64(3) }
AND tupleElement(job, 'finished_at') < {stopTime: DateTime64(3) }
),

job_stats AS (
SELECT
job_name,
-- Count clean successes: passed jobs only
countIf(
lowerUTF8(job_state) IN ('passed', 'finished', 'success')
) AS passed_count,
-- Count soft failures: failed but soft_failed=true (flaky tests)
countIf(
lowerUTF8(job_state) = 'failed' AND soft_failed = TRUE
) AS soft_failed_count,
-- Count hard failures: failed jobs with soft_failed=false
countIf(
lowerUTF8(job_state) = 'failed' AND soft_failed = FALSE
) AS failed_count,
countIf(lowerUTF8(job_state) IN ('canceled', 'cancelled'))
AS canceled_count,
passed_count
+ soft_failed_count
+ failed_count
+ canceled_count AS total_count,
passed_count + soft_failed_count + failed_count AS non_canceled_count,
-- Success rate = ONLY clean passes / (all non-canceled)
-- This shows true reliability (soft failures don't count as success for job reliability)
if(
non_canceled_count > 0,
round(passed_count / non_canceled_count, 4),
NULL
) AS success_rate
FROM jobs
GROUP BY job_name
HAVING non_canceled_count >= {minRuns: UInt32}
)

SELECT
job_name,
passed_count,
soft_failed_count,
failed_count,
canceled_count,
total_count,
non_canceled_count,
success_rate
FROM job_stats
ORDER BY
success_rate ASC,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A curious q: My understand is that this query would return the worst job first. Why is this on the preview all the jobs with 100% success rate are show first? I guess we want to focus on those that are not in a good state and we should show unreliable jobs first, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure. Changed it to show the worst jobs first.

non_canceled_count DESC
18 changes: 18 additions & 0 deletions torchci/clickhouse_queries/vllm/job_retry_stats/params.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"params": {
"repo": "String",
"pipelineName": "String",
"startTime": "DateTime64(3)",
"stopTime": "DateTime64(3)",
"minRuns": "UInt32"
},
"tests": [
{
"repo": "https://github.com/vllm-project/vllm.git",
"pipelineName": "CI",
"startTime": "2025-09-22T00:00:00.000",
"stopTime": "2025-09-29T00:00:00.000",
"minRuns": 5
}
]
}
24 changes: 24 additions & 0 deletions torchci/clickhouse_queries/vllm/job_retry_stats/query.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
-- vLLM job retry statistics
-- Shows which jobs are retried most often

SELECT
tupleElement(job, 'name') AS job_name,
count(*) AS total_runs,
countIf(tupleElement(job, 'retried') = true) AS retried_count,
if(
total_runs > 0,
round(retried_count / total_runs, 4),
null
) AS retry_rate
FROM vllm.vllm_buildkite_jobs
WHERE
tupleElement(pipeline, 'repository') = {repo: String}
AND tupleElement(pipeline, 'name') = {pipelineName: String}
AND tupleElement(build, 'branch') = 'main'
AND tupleElement(build, 'started_at') IS NOT null
AND tupleElement(build, 'started_at') >= {startTime: DateTime64(3)}
AND tupleElement(build, 'started_at') < {stopTime: DateTime64(3)}
GROUP BY job_name
HAVING total_runs >= {minRuns: UInt32}
ORDER BY retry_rate DESC, retried_count DESC
LIMIT 10
Loading