From 7f8ca8df00bd24f875f1b9fb63812cf06ccb3516 Mon Sep 17 00:00:00 2001 From: James Wiesebron Date: Thu, 21 Aug 2025 10:34:50 -0700 Subject: [PATCH] [scheduled-job-alerts] Add alerting for failed scheduled jobs --- .../modules/github-ci-bootstrap/README.md | 3 +- terraform/modules/scheduled-job/README.md | 85 ++++++++++++ .../examples/simple-job/README.md | 20 +++ .../scheduled-job/examples/simple-job/main.tf | 16 +++ .../examples/simple-job/variables.tf | 11 ++ terraform/modules/scheduled-job/main.tf | 127 +++++++++++++++++- terraform/modules/scheduled-job/outputs.tf | 16 ++- terraform/modules/scheduled-job/variables.tf | 24 ++++ 8 files changed, 299 insertions(+), 3 deletions(-) diff --git a/terraform/modules/github-ci-bootstrap/README.md b/terraform/modules/github-ci-bootstrap/README.md index 554ccc5..196a3ac 100644 --- a/terraform/modules/github-ci-bootstrap/README.md +++ b/terraform/modules/github-ci-bootstrap/README.md @@ -266,7 +266,8 @@ With dual service accounts, you can also conditionally run different Terraform o run: terraform plan - name: Terraform Apply (Write-Enabled Branches Only) - if: contains(fromJSON('["refs/heads/main", "refs/heads/master"]'), github.ref) + if: contains(fromJSON('["refs/heads/main", "refs/heads/master"]'), + github.ref) run: terraform apply -auto-approve ``` diff --git a/terraform/modules/scheduled-job/README.md b/terraform/modules/scheduled-job/README.md index 721ed57..2cc49ca 100644 --- a/terraform/modules/scheduled-job/README.md +++ b/terraform/modules/scheduled-job/README.md @@ -13,6 +13,7 @@ Creates a complete scheduled setup: - Storage bucket with lifecycle management - Secret Manager IAM bindings - Source code change detection +- **Slack alerting** for job failures (optional) ## Quick Start @@ -88,6 +89,10 @@ module "my_data_processor" { version = "latest" } ] + + # Enable Slack alerting for job failures (enabled by default) + slack_channel = "#channel-name" + slack_mention_users = ["@group-or-user"] } ``` @@ -253,6 
+258,13 @@ module "data_processor" { - `job_args` - Command arguments ([]) - `job_image` - Container image URL (required) +### Alerting (optional) + +- `enable_alerting` - Whether to enable alerting for job failures (true) +- `slack_channel` - Slack channel to send notifications to (e.g., "#1s-and-0s") (required when alerting enabled) +- `slack_mention_users` - List of Slack users or groups to mention in alerts (e.g., ["@user", "@group"]) ([]) +- `alert_project_id` - GCP project ID where monitoring and alerting resources will be created (defaults to project_id) (null) + ## Outputs - `resource_name` - Name of deployed function or job @@ -263,6 +275,11 @@ module "data_processor" { - `storage_bucket_name` - Storage bucket name - `execution_type` - The execution type used +### Alerting Outputs (when `enable_alerting = true`) + +- `monitoring_notification_channel_name` - Name of the monitoring notification channel +- `alert_policy_names` - Names of the monitoring alert policies + ## Repository Structure ``` @@ -404,6 +421,74 @@ Or use Cloud Build directly: gcloud builds submit --tag gcr.io/YOUR_PROJECT_ID/YOUR_JOB_NAME:latest ./jobs/your-job ``` +## Alerting + +The module supports optional Slack alerting for job failures. When enabled, it creates: + +- **Monitoring policies**: Cloud Monitoring alert policies for different failure scenarios +- **Slack notification channel**: Direct integration with Slack using the Slack API token from Secret Manager + +**Note**: The module automatically fetches the Slack API token from Secret Manager in the `khan-academy` project (secret: `Slack__API_token_for_alertlib`). Ensure your Terraform service account has access to read this secret. + +### Enabling Alerting + +```hcl +module "my_job_with_alerts" { + source = "git::https://github.com/Khan/terraform-modules.git//terraform/modules/scheduled-job?ref=v1.0.0" + + # ... other configuration ... 
+ + # Alerting is enabled by default + slack_channel = "#my-team-channel" + slack_mention_users = ["@oncall", "@team-leads"] # Optional: users/groups to mention + + # Optional: Use different project for alerting resources + alert_project_id = "my-monitoring-project" +} +``` + +### What Gets Monitored + +When alerting is enabled, the module creates monitoring policies for: + +1. **Cloud Function failures** (when `execution_type = "function"`) + + - Monitors `cloudfunctions.googleapis.com/function/execution_count` with `status!="ok"` + - Alerts when any function execution finishes with a non-ok status + - Includes direct link to function logs in console + +2. **Cloud Run Job failures** (when `execution_type = "job"`) + + - Monitors `run.googleapis.com/job/completed_execution_count` with `result!="succeeded"` + - Alerts when any job execution completes without succeeding + - Includes direct link to job logs in console + +### Slack Message Format + +The Slack notifications include: + +- Alert description with job/function name +- Direct link to GCP Console logs for troubleshooting +- Optional CC mentions for users/groups (configured via `slack_mention_users`) +- Markdown-formatted for readability + +Example alert message: + +``` +The Cloud Function my-function has failed to execute. Check the function logs for more details. + +[View Function in Console](https://console.cloud.google.com/...) 
+ +CC: @oncall @team-leads +``` + +### Security + +- Slack API token is fetched from Secret Manager in the `khan-academy` project +- Token is passed to the monitoring notification channel via its sensitive labels; Terraform also records it in state, so restrict access to the state backend +- All alerting resources are created in the specified project (or same project as the job) +- Requires Secret Manager read permissions on the `Slack__API_token_for_alertlib` secret + ## Common Cron Patterns | Schedule | Description | diff --git a/terraform/modules/scheduled-job/examples/simple-job/README.md b/terraform/modules/scheduled-job/examples/simple-job/README.md index 7ec4dbc..bc611cb 100644 --- a/terraform/modules/scheduled-job/examples/simple-job/README.md +++ b/terraform/modules/scheduled-job/examples/simple-job/README.md @@ -9,6 +9,7 @@ This example demonstrates how to use the scheduled-job module to create a Cloud - Service account with appropriate permissions - Container image built automatically using Cloud Build - Secret Manager IAM bindings +- Slack alerting for job failures (enabled by default) ## Key differences from Cloud Functions @@ -25,6 +26,7 @@ This example demonstrates how to use the scheduled-job module to create a Cloud ```bash export TF_VAR_project_id="your-gcp-project" export TF_VAR_secrets_project_id="your-secrets-project" + export TF_VAR_slack_channel="#my-team-channel" ``` 2. Initialize and apply: @@ -71,6 +73,10 @@ module "daily_data_processor" { job_command = ["python", "processor.py"] job_args = [] # Additional arguments if needed + # Alerting is enabled by default + slack_channel = var.slack_channel + slack_mention_users = ["@oncall"] # Optional + # ... other configuration } ``` @@ -92,3 +98,17 @@ The job code in `job-code/processor.py` is a simple Python script that: - **Branch-based Caching**: Cloud Build caches layers based on branch names for faster builds. - Jobs are triggered via HTTP calls to the Cloud Run Jobs API, not via PubSub like Cloud Functions. 
- Jobs can run for longer periods and have more resources than Cloud Functions. + +## Alerting + +This example includes Slack alerting for job failures by default. The alerting system: + +- Monitors job execution failures and task completion issues +- Sends notifications to your specified Slack channel +- Uses Slack API token from Secret Manager (`khan-academy` project) +- Provides detailed failure information with direct links to logs +- Supports mentioning specific users/groups via `slack_mention_users` + +**Note**: Requires read access to the `Slack__API_token_for_alertlib` secret in the `khan-academy` project. + +To disable alerting, set `enable_alerting = false` in the module configuration. diff --git a/terraform/modules/scheduled-job/examples/simple-job/main.tf b/terraform/modules/scheduled-job/examples/simple-job/main.tf index 9cfc04d..74c8a26 100644 --- a/terraform/modules/scheduled-job/examples/simple-job/main.tf +++ b/terraform/modules/scheduled-job/examples/simple-job/main.tf @@ -69,6 +69,13 @@ module "daily_data_processor" { version = "latest" } ] + + # Alerting is enabled by default + slack_channel = var.slack_channel + slack_mention_users = ["@oncall"] # Optional: mention specific users/groups + + # Optional: Use different project for alerting resources + alert_project_id = var.alert_project_id } # Output the job details @@ -91,3 +98,12 @@ output "image_info" { image_tag = module.daily_data_processor_image.image_tag } } + +# Output alerting information +output "alerting_info" { + description = "Information about the alerting setup" + value = { + monitoring_notification_channel_name = module.daily_data_processor.monitoring_notification_channel_name + alert_policy_names = module.daily_data_processor.alert_policy_names + } +} diff --git a/terraform/modules/scheduled-job/examples/simple-job/variables.tf b/terraform/modules/scheduled-job/examples/simple-job/variables.tf index 22e236b..3f6f56b 100644 --- 
a/terraform/modules/scheduled-job/examples/simple-job/variables.tf
+++ b/terraform/modules/scheduled-job/examples/simple-job/variables.tf
@@ -13,3 +13,14 @@ variable "region" {
   type        = string
   default     = "us-central1"
 }
+
+variable "slack_channel" {
+  description = "Slack channel to send notifications to (e.g., '#my-team-channel')"
+  type        = string
+}
+
+variable "alert_project_id" {
+  description = "GCP project ID where monitoring and alerting resources will be created (optional, defaults to project_id)"
+  type        = string
+  default     = null
+}
diff --git a/terraform/modules/scheduled-job/main.tf b/terraform/modules/scheduled-job/main.tf
index 74d5cf9..dd60503 100644
--- a/terraform/modules/scheduled-job/main.tf
+++ b/terraform/modules/scheduled-job/main.tf
@@ -278,4 +278,143 @@ resource "google_cloud_scheduler_job" "job_scheduler" {
       scope = "https://www.googleapis.com/auth/cloud-platform"
     }
   }
-}
\ No newline at end of file
+}
+
+# Alerting resources (only created when enable_alerting is true)
+
+# Fetch Slack API token from Secret Manager
+data "google_secret_manager_secret_version" "slack_token" {
+  count = var.enable_alerting ? 1 : 0
+
+  project = "khan-academy"
+  secret  = "Slack__API_token_for_alertlib"
+}
+
+locals {
+  alert_project_id = var.alert_project_id != null ? var.alert_project_id : var.project_id
+  slack_auth_token = var.enable_alerting ? data.google_secret_manager_secret_version.slack_token[0].secret_data : null
+  slack_cc_mention = length(var.slack_mention_users) > 0 ? "\n\nCC: ${join(" ", var.slack_mention_users)}" : ""
+
+  # Console URLs for functions and jobs
+  function_console_url = "https://console.cloud.google.com/run/detail/${var.region}/${var.job_name}/observability/logs?project=${var.project_id}"
+  job_console_url      = "https://console.cloud.google.com/run/jobs/detail/${var.region}/${var.job_name}/observability/logs?project=${var.project_id}"
+}
+
+# Monitoring notification channel for Slack
+resource "google_monitoring_notification_channel" "slack_channel" {
+  count = var.enable_alerting ? 1 : 0
+
+  project      = local.alert_project_id
+  display_name = "${var.job_name} Slack Alerts"
+  type         = "slack"
+
+  labels = {
+    channel_name = var.slack_channel
+  }
+
+  sensitive_labels {
+    auth_token = local.slack_auth_token
+  }
+
+  # slack_channel defaults to null so alerting can be disabled; fail the plan early if it is missing here.
+  lifecycle {
+    precondition {
+      condition     = var.slack_channel != null && var.slack_channel != ""
+      error_message = "slack_channel must be set when enable_alerting is true."
+    }
+  }
+}
+
+# Monitoring policy for Cloud Function failures (when execution_type is "function")
+resource "google_monitoring_alert_policy" "function_failure" {
+  count = var.enable_alerting && var.execution_type == "function" ? 1 : 0
+
+  project      = local.alert_project_id
+  display_name = "${var.job_name} Function Failure Alert"
+  combiner     = "OR"
+  enabled      = true
+
+  alert_strategy {
+    auto_close = "86400s" # Auto-close after 24 hours if condition is no longer met
+  }
+
+  conditions {
+    display_name = "${var.job_name} function execution failure"
+
+    condition_threshold {
+      filter = "resource.type=\"cloud_function\" AND resource.labels.function_name=\"${var.job_name}\" AND metric.type=\"cloudfunctions.googleapis.com/function/execution_count\" AND metric.labels.status!=\"ok\""
+
+      comparison      = "COMPARISON_GT"
+      threshold_value = 0
+
+      # Zero retest window: one failed run produces a single non-zero point, which must alert by itself.
+      duration = "0s"
+
+      # Collapse the per-status series into one failure count per function.
+      aggregations {
+        alignment_period     = "60s"
+        per_series_aligner   = "ALIGN_DELTA"
+        cross_series_reducer = "REDUCE_SUM"
+        group_by_fields      = ["resource.label.function_name"]
+      }
+
+      trigger {
+        count = 1
+      }
+    }
+  }
+
+  notification_channels = [google_monitoring_notification_channel.slack_channel[0].name]
+
+  documentation {
+    content   = "The Cloud Function ${var.job_name} has failed to execute. Check the function logs for more details.\n\n[View Function in Console](${local.function_console_url})${local.slack_cc_mention}"
+    mime_type = "text/markdown"
+  }
+}
+
+# Monitoring policy for Cloud Run Job failures (when execution_type is "job")
+resource "google_monitoring_alert_policy" "job_failure" {
+  count = var.enable_alerting && var.execution_type == "job" ? 1 : 0
+
+  project      = local.alert_project_id
+  display_name = "${var.job_name} Job Failure Alert"
+  combiner     = "OR"
+  enabled      = true
+
+  alert_strategy {
+    auto_close = "86400s" # Auto-close after 24 hours if condition is no longer met
+  }
+
+  conditions {
+    display_name = "${var.job_name} job execution failure"
+
+    condition_threshold {
+      filter = "resource.type=\"cloud_run_job\" AND resource.labels.job_name=\"${var.job_name}\" AND metric.type=\"run.googleapis.com/job/completed_execution_count\" AND metric.labels.result!=\"succeeded\""
+
+      comparison      = "COMPARISON_GT"
+      threshold_value = 0
+
+      # Zero retest window: one failed run produces a single non-zero point, which must alert by itself.
+      duration = "0s"
+
+      # Collapse the per-result series into one failure count per job.
+      aggregations {
+        alignment_period     = "60s"
+        per_series_aligner   = "ALIGN_DELTA"
+        cross_series_reducer = "REDUCE_SUM"
+        group_by_fields      = ["resource.label.job_name"]
+      }
+
+      trigger {
+        count = 1
+      }
+    }
+  }
+
+  notification_channels = [google_monitoring_notification_channel.slack_channel[0].name]
+
+  documentation {
+    content   = "The Cloud Run Job ${var.job_name} has failed to execute or complete successfully. Check the job logs for more details.\n\n[View Job in Console](${local.job_console_url})${local.slack_cc_mention}"
+    mime_type = "text/markdown"
+  }
+}
diff --git a/terraform/modules/scheduled-job/outputs.tf b/terraform/modules/scheduled-job/outputs.tf
index a6d4f99..84c91d1 100644
--- a/terraform/modules/scheduled-job/outputs.tf
+++ b/terraform/modules/scheduled-job/outputs.tf
@@ -53,4 +53,18 @@ output "region" {
 output "execution_type" {
   description = "The execution type used (function or job)"
   value       = var.execution_type
-}
\ No newline at end of file
+}
+
+# Alerting outputs
+output "monitoring_notification_channel_name" {
+  description = "Name of the monitoring notification channel (when alerting is enabled)"
+  value       = var.enable_alerting ? google_monitoring_notification_channel.slack_channel[0].name : null
+}
+
+output "alert_policy_names" {
+  description = "Names of the monitoring alert policies (when alerting is enabled)"
+  value = var.enable_alerting ? {
+    function_failure = var.execution_type == "function" ? google_monitoring_alert_policy.function_failure[0].display_name : null
+    job_failure      = var.execution_type == "job" ? google_monitoring_alert_policy.job_failure[0].display_name : null
+  } : null
+}
diff --git a/terraform/modules/scheduled-job/variables.tf b/terraform/modules/scheduled-job/variables.tf
index 813165a..191ee90 100644
--- a/terraform/modules/scheduled-job/variables.tf
+++ b/terraform/modules/scheduled-job/variables.tf
@@ -194,3 +194,28 @@ variable "job_image" {
   type        = string
   default     = null
 }
+
+# Alerting configuration
+variable "enable_alerting" {
+  description = "Whether to enable alerting for job failures"
+  type        = bool
+  default     = true
+}
+
+variable "slack_channel" {
+  description = "Slack channel to send notifications to (e.g., '#1s-and-0s'). Required when enable_alerting is true."
+  type        = string
+  default     = null
+}
+
+variable "slack_mention_users" {
+  description = "List of Slack users or groups to mention in alerts (e.g., ['@user', '@group'])"
+  type        = list(string)
+  default     = []
+}
+
+variable "alert_project_id" {
+  description = "GCP project ID where monitoring and alerting resources will be created (defaults to project_id)"
+  type        = string
+  default     = null
+}