Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/_lambda-do-release-runners.yml
Original file line number Diff line number Diff line change
Expand Up @@ -85,12 +85,13 @@ jobs:
fail-fast: false
matrix:
include: [
{ dir-name: 'ci-queue-pct', zip-name: 'ci-queue-pct' },
{ dir-name: 'oss_ci_job_queue_time', zip-name: 'oss-ci-job-queue-time' },
{ dir-name: 'oss_ci_cur', zip-name: 'oss-ci-cur' },
{ dir-name: 'ci-queue-pct', zip-name: 'ci-queue-pct' },
{ dir-name: 'oss_ci_job_queue_time', zip-name: 'oss-ci-job-queue-time' },
{ dir-name: 'oss_ci_cur', zip-name: 'oss-ci-cur' },
{ dir-name: 'benchmark-results-uploader', zip-name: 'benchmark-results-uploader' },
{ dir-name: 'pytorch-auto-revert', zip-name: 'pytorch-auto-revert' },
{ dir-name: 'pytorch-auto-revert', zip-name: 'pytorch-auto-revert' },
{ dir-name: 'keep-going-call-log-classifier', zip-name: 'keep-going-call-log-classifier' },
{ dir-name: 'buildkite-webhook-handler', zip-name: 'buildkite-webhook-handler' },
]
name: Upload Release for ${{ matrix.dir-name }} lambda
runs-on: ubuntu-latest
Expand Down
19 changes: 19 additions & 0 deletions aws/lambda/buildkite-webhook-handler/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Build helpers for the buildkite-webhook-handler Lambda.
#
# `make` (or `make create-deployment-package`) produces deployment.zip, the
# artifact uploaded to AWS Lambda. `make clean` removes all build outputs.

# The default target must name a rule that exists in this file; it builds the
# deployment package.
all: create-deployment-package

# These targets do not produce files of the same name.
.PHONY: all clean create-deployment-package

clean:
	rm -rf deployment
	rm -rf venv
	rm -rf deployment.zip

# Local virtualenv with the runtime dependencies, for ad-hoc testing.
venv/bin/python:
	virtualenv venv
	venv/bin/pip install -r requirements.txt

# Rebuild the package whenever the handler or its pinned dependencies change.
# Dependencies are installed as manylinux wheels for CPython 3.10.
deployment.zip: lambda_function.py requirements.txt
	mkdir -p deployment
	cp lambda_function.py ./deployment/.
	pip3.10 install -r requirements.txt -t ./deployment/. --platform manylinux2014_x86_64 --only-binary=:all: --implementation cp --python-version 3.10 --upgrade
	cd ./deployment && zip -q -r ../deployment.zip .

create-deployment-package: deployment.zip
53 changes: 53 additions & 0 deletions aws/lambda/buildkite-webhook-handler/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Buildkite Webhook Handler Lambda

This Lambda function receives Buildkite webhook events (agent, build, and job
events) and saves them to DynamoDB tables.

* In the near-term, this allows vLLM maintainers to explore their CI data,
  such as time to signal or queueing time.
* In the longer-term, this will provide the foundation for future UX projects
  on vLLM, such as a vLLM HUD or CI failure notifications.

## Overview

The lambda handles three types of Buildkite webhook events:
- **Agent events** (`agent.*`) - Saved to `vllm-buildkite-agent-events` table
- **Build events** (`build.*`) - Saved to `vllm-buildkite-build-events` table
- **Job events** (`job.*`) - Saved to `vllm-buildkite-job-events` table

## DynamoDB Schema

### Agent Events Table: `vllm-buildkite-agent-events`
- **Partition Key**: `dynamoKey` (format: `AGENT_ID`)
- https://buildkite.com/docs/apis/webhooks/pipelines/agent-events

### Build Events Table: `vllm-buildkite-build-events`
- **Partition Key**: `dynamoKey` (format: `REPO_NAME/PIPELINE_NAME/BUILD_NUMBER`)
- https://buildkite.com/docs/apis/webhooks/pipelines/build-events

### Job Events Table: `vllm-buildkite-job-events`
- **Partition Key**: `dynamoKey` (format: `REPO_NAME/JOB_ID`)
- https://buildkite.com/docs/apis/webhooks/pipelines/job-events

## Deployment

```bash
make create-deployment-package
```

This creates a `deployment.zip` file ready for AWS Lambda deployment.

## Event Processing

The lambda automatically:
1. Identifies event type from webhook payload
2. Extracts repository name and relevant IDs
3. Saves to appropriate DynamoDB table with structured key
4. Returns success/error response

## Error Handling

- Invalid JSON payloads return 400 status
- Missing required fields return 400 status
- DynamoDB errors return 500 status
- Unsupported event types return 400 status
203 changes: 203 additions & 0 deletions aws/lambda/buildkite-webhook-handler/lambda_function.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import json
from decimal import Decimal
from typing import Any, Dict

import boto3
from botocore.exceptions import ClientError


# Module-level handles: built once when the module is imported rather than on
# every handler call.
dynamodb = boto3.resource("dynamodb")
# One table per Buildkite event family; each is written by the matching
# save_*_event function below.
agent_events_table = dynamodb.Table("vllm-buildkite-agent-events")
build_events_table = dynamodb.Table("vllm-buildkite-build-events")
job_events_table = dynamodb.Table("vllm-buildkite-job-events")


def save_agent_event(event_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Save an agent event to the agent events DynamoDB table.

    The item stores the webhook payload as-is plus a ``dynamoKey`` attribute
    set to the agent ID (``agent.id`` in the payload).

    Args:
        event_data: The agent event payload from Buildkite

    Returns:
        Dict[str, Any]: Response with ``statusCode`` 200 on success, 400 when
        the agent ID is missing (including when ``agent`` is not an object),
        and 500 when the write fails. ``body`` is a JSON string holding a
        ``message``.
    """
    try:
        # A payload such as {"agent": null} is a bad request; without this
        # guard it would crash on ``.get`` and be reported as a 500.
        agent = event_data.get("agent")
        agent_id = agent.get("id", "") if isinstance(agent, dict) else ""

        if not agent_id:
            return {
                "statusCode": 400,
                "body": json.dumps({"message": "Missing agent ID"}),
            }

        dynamo_key = agent_id
        # dynamoKey is set last so a payload field with the same name cannot
        # override the computed partition key.
        item = {**event_data, "dynamoKey": dynamo_key}

        agent_events_table.put_item(Item=item)

        return {
            "statusCode": 200,
            "body": json.dumps(
                {"message": f"Agent event saved successfully with key: {dynamo_key}"}
            ),
        }

    except ClientError as e:
        return {
            "statusCode": 500,
            "body": json.dumps({"message": f"DynamoDB error: {str(e)}"}),
        }
    except Exception as e:
        return {
            "statusCode": 500,
            "body": json.dumps({"message": f"Error saving agent event: {str(e)}"}),
        }


def save_build_event(event_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Save a build event to the build events DynamoDB table.

    The item stores the webhook payload as-is plus a ``dynamoKey`` attribute
    of the form ``REPO_NAME/PIPELINE_NAME/BUILD_NUMBER``, where REPO_NAME is
    the last ``/``-separated segment of ``pipeline.repository``.

    Args:
        event_data: The build event payload from Buildkite

    Returns:
        Dict[str, Any]: Response with ``statusCode`` 200 on success, 400 when
        the repository name or build number is missing (including when
        ``build`` or ``pipeline`` is not an object), and 500 when the write
        fails. ``body`` is a JSON string holding a ``message``.
    """
    try:
        # Treat null / non-object sections as empty so malformed payloads are
        # rejected with a 400 instead of crashing on ``.get`` and returning 500.
        build = event_data.get("build")
        if not isinstance(build, dict):
            build = {}
        pipeline = event_data.get("pipeline")
        if not isinstance(pipeline, dict):
            pipeline = {}

        repository = pipeline.get("repository", "")
        repo_name = repository.split("/")[-1] if isinstance(repository, str) else ""
        pipeline_name = pipeline.get("name", "")
        build_number = build.get("number", "")

        if not repo_name or not build_number:
            return {
                "statusCode": 400,
                "body": json.dumps(
                    {"message": "Missing repository name or build number"}
                ),
            }

        # Buildkite build_number is only unique in a pipeline
        dynamo_key = f"{repo_name}/{pipeline_name}/{build_number}"

        # dynamoKey is set last so a payload field with the same name cannot
        # override the computed partition key.
        item = {**event_data, "dynamoKey": dynamo_key}
        build_events_table.put_item(Item=item)

        return {
            "statusCode": 200,
            "body": json.dumps(
                {"message": f"Build event saved successfully with key: {dynamo_key}"}
            ),
        }

    except ClientError as e:
        return {
            "statusCode": 500,
            "body": json.dumps({"message": f"DynamoDB error: {str(e)}"}),
        }
    except Exception as e:
        return {
            "statusCode": 500,
            "body": json.dumps({"message": f"Error saving build event: {str(e)}"}),
        }


def save_job_event(event_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Save a job event to the job events DynamoDB table.

    The item stores the webhook payload as-is plus a ``dynamoKey`` attribute
    of the form ``REPO_NAME/JOB_ID``, where REPO_NAME is the last
    ``/``-separated segment of ``pipeline.repository``.

    Args:
        event_data: The job event payload from Buildkite

    Returns:
        Dict[str, Any]: Response with ``statusCode`` 200 on success, 400 when
        the repository name or job ID is missing (including when ``job`` or
        ``pipeline`` is not an object), and 500 when the write fails.
        ``body`` is a JSON string holding a ``message``.
    """
    try:
        # Treat null / non-object sections as empty so malformed payloads are
        # rejected with a 400 instead of crashing on ``.get`` and returning 500.
        job = event_data.get("job")
        if not isinstance(job, dict):
            job = {}
        pipeline = event_data.get("pipeline")
        if not isinstance(pipeline, dict):
            pipeline = {}

        repository = pipeline.get("repository", "")
        repo_name = repository.split("/")[-1] if isinstance(repository, str) else ""
        job_id = job.get("id", "")

        if not repo_name or not job_id:
            return {
                "statusCode": 400,
                "body": json.dumps({"message": "Missing repository name or job ID"}),
            }

        dynamo_key = f"{repo_name}/{job_id}"

        # dynamoKey is set last so a payload field with the same name cannot
        # override the computed partition key.
        item = {**event_data, "dynamoKey": dynamo_key}

        job_events_table.put_item(Item=item)

        return {
            "statusCode": 200,
            "body": json.dumps(
                {"message": f"Job event saved successfully with key: {dynamo_key}"}
            ),
        }

    except ClientError as e:
        return {
            "statusCode": 500,
            "body": json.dumps({"message": f"DynamoDB error: {str(e)}"}),
        }
    except Exception as e:
        return {
            "statusCode": 500,
            "body": json.dumps({"message": f"Error saving job event: {str(e)}"}),
        }


def lambda_handler(event: Dict[str, Any], context: Any) -> Dict[str, Any]:
    """
    Main Lambda handler function for Buildkite webhook events.

    The webhook payload is read from ``event["body"]`` (a JSON string) when
    that field is present and non-empty; otherwise ``event`` itself is used as
    the payload. The payload's ``event`` field selects the save function:
    ``agent.*``, ``build.*`` or ``job.*``.

    Args:
        event: Contains the webhook payload from Buildkite
        context: Provides runtime information about the Lambda function
            (unused)

    Returns:
        Dict[str, Any]: Response containing status and result information.
        400 for invalid JSON, a payload that is not a JSON object, or a
        missing or unsupported event type; 500 for unexpected errors;
        otherwise whatever the selected save function returns.
    """
    try:
        if event.get("body"):
            # The boto3 DynamoDB resource rejects Python floats, so decode
            # JSON numbers with a fractional part as Decimal.
            body = json.loads(event["body"], parse_float=Decimal)
        else:
            body = event

        # Valid JSON that is not an object (e.g. a list or a bare string) is
        # a client error, not an internal failure.
        if not isinstance(body, dict):
            return {
                "statusCode": 400,
                "body": json.dumps(
                    {"message": "Webhook payload must be a JSON object"}
                ),
            }

        event_type = body.get("event")

        if not event_type:
            return {
                "statusCode": 400,
                "body": json.dumps(
                    {"message": "Missing event type in webhook payload"}
                ),
            }

        # Only string event types can match a supported prefix; anything else
        # falls through to the "unsupported" response below.
        if isinstance(event_type, str):
            if event_type.startswith("agent."):
                return save_agent_event(body)
            if event_type.startswith("build."):
                return save_build_event(body)
            if event_type.startswith("job."):
                return save_job_event(body)

        return {
            "statusCode": 400,
            "body": json.dumps({"message": f"Unsupported event type: {event_type}"}),
        }

    except json.JSONDecodeError as e:
        return {
            "statusCode": 400,
            "body": json.dumps({"message": f"Invalid JSON payload: {str(e)}"}),
        }
    except Exception as e:
        return {
            "statusCode": 500,
            "body": json.dumps({"message": f"Unexpected error: {str(e)}"}),
        }
1 change: 1 addition & 0 deletions aws/lambda/buildkite-webhook-handler/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
boto3==1.36.21
Loading