From 1df53128deb259abf315828bd6a5795848dcad21 Mon Sep 17 00:00:00 2001 From: akshaysridhar Date: Fri, 11 Jul 2025 10:11:52 -0700 Subject: [PATCH 1/3] new file: buildkite_analysis.jl new file: test_buildkite_analysis.jl --- .buildkite/buildkite_analysis.jl | 548 ++++++++++++++++++++++++++ .buildkite/test_buildkite_analysis.jl | 67 ++++ 2 files changed, 615 insertions(+) create mode 100644 .buildkite/buildkite_analysis.jl create mode 100644 .buildkite/test_buildkite_analysis.jl diff --git a/.buildkite/buildkite_analysis.jl b/.buildkite/buildkite_analysis.jl new file mode 100644 index 0000000000..a8a842d434 --- /dev/null +++ b/.buildkite/buildkite_analysis.jl @@ -0,0 +1,548 @@ +module BuildkiteAnalysis + +import HTTP +import JSON +using Dates +using Logging + +# Export public functions +export set_offline_mode, + analyze_buildkite_log, + summarize_buildkite_errors, + BuildkiteError, + format_simulation_time + +""" + BuildkiteError +Structure containing error information from a Buildkite job +""" +struct BuildkiteError + error_code::Union{Int, Nothing} + last_sim_time::Union{Float64, Nothing} + job_id::String + build_id::String + step_name::String + timestamp::DateTime + clima_job_id::String # ClimaAtmos job identifier + sypd::Union{Float64, Nothing} # Simulated Years Per Day + build_status::String # Success/Failure/Cancelled status +end + +# Add offline mode configuration +mutable struct Config + offline_mode::Bool + log_directory::String +end + +# Global configuration +const GLOBAL_CONFIG = Config(false, "buildkite_logs") + +""" + set_offline_mode(;enabled::Bool = true, log_directory::String = "buildkite_logs") +Configure the module to run in offline mode using local log files. +""" +function set_offline_mode(; + enabled::Bool = true, + log_directory::String = "buildkite_logs", +) + GLOBAL_CONFIG.offline_mode = enabled + GLOBAL_CONFIG.log_directory = log_directory +end + +""" + save_log_locally(build_id::String, job_id::String, content::String) +Save a Buildkite log to a local file for offline use. +""" +function save_log_locally(build_id::String, job_id::String, content::String) + # Create log directory if it doesn't exist + mkpath(GLOBAL_CONFIG.log_directory) + + # Create a filename based on build and job IDs + filename = joinpath( + GLOBAL_CONFIG.log_directory, + "build_$(build_id)_job_$(job_id).log", + ) + + # Save the content + write(filename, content) + @info "Saved log to $filename" +end + +""" + read_log_locally(build_id::String, job_id::String) +Read a Buildkite log from a local file. +""" +function read_log_locally(build_id::String, job_id::String) + filename = joinpath( + GLOBAL_CONFIG.log_directory, + "build_$(build_id)_job_$(job_id).log", + ) + if !isfile(filename) + error("Log file not found: $filename") + end + return read(filename, String) +end + +""" + extract_sim_time(log_text::String) +Extract the simulation time from ClimaAtmos progress reports. +Returns the time in seconds if found, nothing otherwise. +""" +function extract_sim_time(log_text::String) + # Look for the progress report format + pattern = r"simulation_time = \"(\d+) weeks?, (\d+) days?\"" + + # Find all matches and take the last one (most recent) + matches = collect(eachmatch(pattern, log_text)) + if !isempty(matches) + last_match = last(matches) + weeks = parse(Int, last_match[1]) + days = parse(Int, last_match[2]) + + # Convert to seconds + total_days = weeks * 7 + days + return total_days * 86400.0 # days to seconds + end + + return nothing +end + +""" + extract_error_code(log_text::String) +Extract error code from log text. +Returns nothing if no error code is found. +""" +function extract_error_code(log_text::String) + # Common patterns in Buildkite logs + patterns = [ + r"Error: (\d+)", + r"exit code (\d+)", + r"Exit status: (\d+)", + r"Process exited with status (\d+)", + r"ERROR: Process completed with exit code (\d+)", + r"Test Failed at .* status (\d+)", + r"\[error\] Exit code: (\d+)", + ] + + for pattern in patterns + match_result = match(pattern, log_text) + if !isnothing(match_result) + return parse(Int, match_result[1]) + end + end + + # If we find specific error patterns without codes, assign custom codes + error_patterns = [ + (r"ERROR: LoadError:", 1), + (r"signal \((\d+)\)", -1), # Capture the signal number as negative + (r"ERROR: SystemError", 2), + (r"ERROR: ArgumentError", 3), + (r"Test Failed at", 1), + (r"ERROR: AssertionError", 4), + ] + + for (pattern, code) in error_patterns + if match(pattern, log_text) !== nothing + return code + end + end + + # Check for timeout or cancellation + if contains(log_text, "Cancelling") || contains(log_text, "Canceled") + return -15 # Common signal for termination + end + + return nothing +end + +""" + fetch_buildkite_log(build_id::String, job_id::String) +Fetch log content from Buildkite API or local file system depending on mode. +""" +function fetch_buildkite_log(build_id::String, job_id::String) + if GLOBAL_CONFIG.offline_mode + log_file = joinpath( + GLOBAL_CONFIG.log_directory, + "build_$(build_id)_job_$(job_id).log", + ) + @info "Reading log for specific job" build_id job_id log_file + return read_log_locally(build_id, job_id) + else + token = get(ENV, "BUILDKITE_TOKEN", nothing) + if isnothing(token) + error( + "BUILDKITE_TOKEN environment variable must be set when not in offline mode", + ) + end + + # Note: This API endpoint is specifically for a single job's logs + api_url = "https://api.buildkite.com/v2/organizations/clima/pipelines/climaatmos-gpulongruns/builds/$build_id/jobs/$job_id/log" + @info "Fetching log for specific job" build_id job_id api_url + + headers = Dict("Authorization" => "Bearer $token") + response = HTTP.get(api_url, headers) + content = String(response.body) + + # Save the job-specific log locally for future offline use + try + save_log_locally(build_id, job_id, content) + catch e + @warn "Failed to save log locally" build_id job_id exception=e + end + + return content + end +end + +""" + extract_job_id(log_text::String) +Extract the ClimaAtmos job identifier from the log. +Returns the job_id string if found, "unknown" otherwise. +""" +function extract_job_id(log_text::String) + # Look for the job_id pattern in the log + pattern = r"job_id = \"([^\"]+)\"" + match_result = match(pattern, log_text) + + if !isnothing(match_result) + return match_result[1] + end + + return "unknown" +end + +""" + extract_sypd(log_text::String) +Extract the estimated SYPD (Simulated Years Per Day) from the log. +Returns the SYPD as a float if found, nothing otherwise. +""" +function extract_sypd(log_text::String) + # Look for the SYPD pattern in the log + pattern = r"estimated_sypd = \"([0-9.]+)\"" + matches = collect(eachmatch(pattern, log_text)) + + if !isempty(matches) + # Take the last reported SYPD value + return parse(Float64, last(matches)[1]) + end + + return nothing +end + +""" + extract_build_status(log_text::String) +Determine if the build was successful, failed, or cancelled based on log patterns. +Returns "Success", "Failed", or "Cancelled". +""" +function extract_build_status(log_text::String) + # First verify we're looking at a single job's log + if !occursin(r"job_id = \"[^\"]+\"", log_text) + @warn "Log content might not be for a specific job" + end + + # First check for cancellation patterns (highest priority) + cancel_patterns = [ + r"Cancelling", + r"Canceled", + r"Build was canceled", + r"Terminated", + r"signal \(15\)", # SIGTERM + ] + + # Check for critical failure patterns + critical_failure_patterns = [ + r"Test Failed at", + r"Process exited with status [1-9]", + r"Build Failed", + r"Error running command", + r"signal \(9\)", # SIGKILL + r"ERROR: LoadError:", + ] + + # Check for success patterns + success_patterns = [ + r"All tests pass", + r"Process exited with status 0", + r"Build Finished", + r"Simulation completed successfully", + r"percent_complete = \"100%\"", + r"percent_complete = \"[9][5-9]\.?\d*%\"", # 95-99.x% + r"estimated_finish_date", # Progress report indicates running simulation + ] + + # First check for cancellation + for pattern in cancel_patterns + if occursin(pattern, log_text) + return "Cancelled" + end + end + + # Then check for critical failures + for pattern in critical_failure_patterns + if occursin(pattern, log_text) + return "Failed" + end + end + + # Check for success indicators + for pattern in success_patterns + if occursin(pattern, log_text) + # Additional verification: if we find a success pattern, + # make sure there's no critical failure after it + match_pos = findfirst(pattern, log_text) + if !isnothing(match_pos) + # Only check the log after this success pattern + remaining_log = log_text[match_pos.stop:end] + has_failure = any( + p -> occursin(p, remaining_log), + critical_failure_patterns, + ) + if !has_failure + return "Success" + end + end + end + end + + # For ClimaAtmos, if we see progress reports with high completion percentage + # and no critical failures, consider it a success + if occursin(r"percent_complete = \"[0-9]+\.?[0-9]*%\"", log_text) && + !any(p -> occursin(p, log_text), critical_failure_patterns) + return "Success" + end + + # If we have simulation progress and no critical failures, consider it running successfully + if occursin(r"simulation_time = \".*\"", log_text) && + !any(p -> occursin(p, log_text), critical_failure_patterns) + return "Success" + end + + # Default to Failed only if we can't determine success + return "Failed" +end + +""" + analyze_buildkite_log(build_id::String, job_id::String, step_name::String) +Analyze a single Buildkite job log and return structured error information. +""" +function analyze_buildkite_log( + build_id::String, + job_id::String, + step_name::String, +) + @info "Analyzing specific job log" build_id job_id step_name + + log_content = fetch_buildkite_log(build_id, job_id) + + # Verify we have the correct job's log by checking for job_id in the content + if !isnothing(match(r"job_id = \"([^\"]+)\"", log_content)) + extracted_job_id = extract_job_id(log_content) + @info "Found job identifier in log" extracted_job_id + end + + error_code = extract_error_code(log_content) + sim_time = extract_sim_time(log_content) + clima_job_id = extract_job_id(log_content) + sypd = extract_sypd(log_content) + build_status = extract_build_status(log_content) + + @info "Analysis results for job" build_id job_id build_status error_code + + return BuildkiteError( + error_code, + sim_time, + job_id, + build_id, + step_name, + now(), + clima_job_id, + sypd, + build_status, + ) +end + +""" + format_simulation_time(seconds::Float64) +Convert simulation time to weeks and days format as used in ClimaAtmos. +""" +function format_simulation_time(seconds::Float64) + total_days = seconds / 86400 # Convert seconds to days + weeks = floor(Int, total_days / 7) + remaining_days = round(Int, total_days % 7) + + # Format string components + week_str = weeks == 1 ? "week" : "weeks" + day_str = remaining_days == 1 ? "day" : "days" + + if weeks > 0 + return "$(weeks) $(week_str), $(remaining_days) $(day_str)" + else + return "$(remaining_days) $(day_str)" + end +end + +""" + generate_build_report(current::BuildkiteError, previous::Union{BuildkiteError, Nothing}=nothing) +Generate a report comparing two builds, or just current build if previous is nothing. +""" +function generate_build_report( + current::BuildkiteError, + previous::Union{BuildkiteError, Nothing} = nothing, +) + status_emoji = Dict("Success" => "✅", "Failed" => "❌", "Cancelled" => "⚠️") + + report = String[] + + # Build Information + push!(report, "Build Information:") + push!(report, "─" * repeat("─", 30)) + push!(report, "ClimaAtmos Job ID: $(current.clima_job_id)") + push!( + report, + "Current Build: $(current.build_id) $(get(status_emoji, current.build_status, "❓")) ($(current.build_status))", + ) + push!( + report, + "Current Build Date: $(Dates.format(current.timestamp, "yyyy-mm-dd HH:MM:SS"))", + ) + push!( + report, + isnothing(previous) ? "Previous Build: Not Available" : + "Previous Build: $(previous.build_id) $(get(status_emoji, previous.build_status, "❓")) ($(previous.build_status))", + ) + push!( + report, + isnothing(previous) ? "Previous Build Date: Not Available" : + "Previous Build Date: $(Dates.format(previous.timestamp, "yyyy-mm-dd HH:MM:SS"))", + ) + push!(report, "") + + # Simulation Progress + push!(report, "Simulation Progress:") + push!(report, "─" * repeat("─", 30)) + push!( + report, + "Current Run: $(isnothing(current.last_sim_time) ? "No simulation time found" : format_simulation_time(current.last_sim_time))", + ) + + if !isnothing(previous) + push!( + report, + "Previous Run: $(isnothing(previous.last_sim_time) ? "No simulation time found" : format_simulation_time(previous.last_sim_time))", + ) + if !isnothing(current.last_sim_time) && + !isnothing(previous.last_sim_time) + time_diff = current.last_sim_time - previous.last_sim_time + diff_days = abs(time_diff) / 86400 + diff_str = + diff_days >= 7 ? + "$(floor(Int, diff_days/7)) weeks, $(round(Int, diff_days%7)) days" : + "$(round(Int, diff_days)) days" + push!( + report, + "Progress Difference: $(diff_str) ($(time_diff < 0 ? "behind" : "ahead"))", + ) + end + else + push!(report, "Previous Run: Not Available") + end + push!(report, "") + + # Performance Metrics + push!(report, "Performance Metrics:") + push!(report, "─" * repeat("─", 30)) + push!( + report, + "Current Run SYPD: $(isnothing(current.sypd) ? "Not available" : round(current.sypd, digits=3))", + ) + + if !isnothing(previous) + push!( + report, + "Previous Run SYPD: $(isnothing(previous.sypd) ? "Not available" : round(previous.sypd, digits=3))", + ) + if !isnothing(current.sypd) && !isnothing(previous.sypd) + sypd_diff = current.sypd - previous.sypd + sypd_change_pct = (sypd_diff / previous.sypd) * 100 + push!( + report, + "SYPD Change: $(round(sypd_diff, digits=3)) ($(round(sypd_change_pct, digits=1))%)", + ) + end + else + push!(report, "Previous Run SYPD: Not Available") + end + push!(report, "") + + # Error Information + push!(report, "Error Information:") + push!(report, "─" * repeat("─", 30)) + push!( + report, + "Current Run Error Code: $(isnothing(current.error_code) ? "None" : current.error_code)", + ) + if !isnothing(previous) + push!( + report, + "Previous Run Error Code: $(isnothing(previous.error_code) ? "None" : previous.error_code)", + ) + if !isnothing(current.error_code) && + !isnothing(previous.error_code) && + current.error_code != previous.error_code + push!( + report, + "Error Code Changed: $(previous.error_code) → $(current.error_code)", + ) + end + else + push!(report, "Previous Run: Not Available") + end + + return join(report, "\n") +end + +""" + summarize_buildkite_errors(current_build::String, previous_build::String, job_ids::Vector{Tuple{String,String}}) +Generate summary reports for multiple jobs, comparing current and previous builds. +""" +function summarize_buildkite_errors( + current_build::String, + previous_build::String, + job_ids::Vector{Tuple{String, String}}, +) + # Temporarily disable info logging + current_logger = Logging.global_logger() + Logging.global_logger(Logging.SimpleLogger(stderr, Logging.Warn)) + + try + summaries = String[] + for (job_id, step_name) in job_ids + try + current_error = + analyze_buildkite_log(current_build, job_id, step_name) + previous_error = try + analyze_buildkite_log(previous_build, job_id, step_name) + catch e + isa(e, SystemError) || + contains(string(e), "Log file not found") ? + nothing : rethrow(e) + end + push!( + summaries, + generate_build_report(current_error, previous_error), + ) + catch e + push!( + summaries, + "Failed to analyze $step_name: $(sprint(showerror, e))", + ) + end + end + return join(summaries, "\n\n") + finally + # Restore original logger + Logging.global_logger(current_logger) + end +end + +end # module diff --git a/.buildkite/test_buildkite_analysis.jl b/.buildkite/test_buildkite_analysis.jl new file mode 100644 index 0000000000..54daefd220 --- /dev/null +++ b/.buildkite/test_buildkite_analysis.jl @@ -0,0 +1,67 @@ +using Pkg +Pkg.activate(@__DIR__) + +# First include and import the module +include("buildkite_analysis.jl") +using .BuildkiteAnalysis: + set_offline_mode, + analyze_buildkite_log, + summarize_buildkite_errors, + format_simulation_time + +# Test configuration +current_build_id = "577" +previous_build_id = "576" +job_ids = [("computer-hydrostatic-balance", "GPU Long Run Test")] + +# Function to set up offline test environment with actual log file +function setup_offline_test() + set_offline_mode(enabled = true, log_directory = "test_logs") + mkpath("test_logs") + + # Path to the actual log file in Downloads + downloads_log = joinpath( + homedir(), + "Downloads", + "climaatmos-gpulongruns_build_577_computer-hydrostatic-balance.log", + ) + + if !isfile(downloads_log) + error("Log file not found at: $downloads_log") + end + + # Copy the log file to our test directory with the expected naming format + cp( + downloads_log, + joinpath("test_logs", "build_577_job_computer-hydrostatic-balance.log"), + ) +end + +# Function to clean up test files +function cleanup_test() + rm("test_logs", recursive = true, force = true) +end + +# Run the analysis with actual log file +function run_analysis() + setup_offline_test() + + try + summary = summarize_buildkite_errors( + current_build_id, + previous_build_id, + job_ids, + ) + println("\nBuildkite Analysis Summary:") + println("=================================") + println(summary) + catch e + println("Error during analysis: ", e) + println(sprint(showerror, e, catch_backtrace())) + finally + cleanup_test() + end +end + +# Run the analysis +run_analysis() From 266b78e53f0154b331cb107d4eff9434f2490121 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Fri, 11 Jul 2025 10:36:35 -0700 Subject: [PATCH 2/3] restructure module --- .buildkite/buildkite_analysis.jl | 590 ++++++++++++++++--------------- 1 file changed, 306 insertions(+), 284 deletions(-) diff --git a/.buildkite/buildkite_analysis.jl b/.buildkite/buildkite_analysis.jl index a8a842d434..e9b4333ced 100644 --- a/.buildkite/buildkite_analysis.jl +++ b/.buildkite/buildkite_analysis.jl @@ -38,53 +38,281 @@ end const GLOBAL_CONFIG = Config(false, "buildkite_logs") """ - set_offline_mode(;enabled::Bool = true, log_directory::String = "buildkite_logs") -Configure the module to run in offline mode using local log files. + generate_build_report(current::BuildkiteError, previous::Union{BuildkiteError, Nothing}=nothing) + +The main function of this module, used to generate a report comparing two builds. +Generate a report comparing two builds, or just current build if previous is nothing. """ -function set_offline_mode(; - enabled::Bool = true, - log_directory::String = "buildkite_logs", +function generate_build_report( + current::BuildkiteError, + previous::Union{BuildkiteError, Nothing} = nothing, ) - GLOBAL_CONFIG.offline_mode = enabled - GLOBAL_CONFIG.log_directory = log_directory + status_emoji = Dict("Success" => "✅", "Failed" => "❌", "Cancelled" => "⚠️") + + report = String[] + + # Build Information + push!(report, "Build Information:") + push!(report, "─" * repeat("─", 30)) + push!(report, "ClimaAtmos Job ID: $(current.clima_job_id)") + push!( + report, + "Current Build: $(current.build_id) $(get(status_emoji, current.build_status, "❓")) ($(current.build_status))", + ) + push!( + report, + "Current Build Date: $(Dates.format(current.timestamp, "yyyy-mm-dd HH:MM:SS"))", + ) + push!( + report, + isnothing(previous) ? "Previous Build: Not Available" : + "Previous Build: $(previous.build_id) $(get(status_emoji, previous.build_status, "❓")) ($(previous.build_status))", + ) + push!( + report, + isnothing(previous) ? "Previous Build Date: Not Available" : + "Previous Build Date: $(Dates.format(previous.timestamp, "yyyy-mm-dd HH:MM:SS"))", + ) + push!(report, "") + + # Simulation Progress + push!(report, "Simulation Progress:") + push!(report, "─" * repeat("─", 30)) + push!( + report, + "Current Run: $(isnothing(current.last_sim_time) ? "No simulation time found" : format_simulation_time(current.last_sim_time))", + ) + + if !isnothing(previous) + push!( + report, + "Previous Run: $(isnothing(previous.last_sim_time) ? "No simulation time found" : format_simulation_time(previous.last_sim_time))", + ) + if !isnothing(current.last_sim_time) && + !isnothing(previous.last_sim_time) + time_diff = current.last_sim_time - previous.last_sim_time + diff_days = abs(time_diff) / 86400 + diff_str = + diff_days >= 7 ? + "$(floor(Int, diff_days/7)) weeks, $(round(Int, diff_days%7)) days" : + "$(round(Int, diff_days)) days" + push!( + report, + "Progress Difference: $(diff_str) ($(time_diff < 0 ? "behind" : "ahead"))", + ) + end + else + push!(report, "Previous Run: Not Available") + end + push!(report, "") + + # Performance Metrics + push!(report, "Performance Metrics:") + push!(report, "─" * repeat("─", 30)) + push!( + report, + "Current Run SYPD: $(isnothing(current.sypd) ? "Not available" : round(current.sypd, digits=3))", + ) + + if !isnothing(previous) + push!( + report, + "Previous Run SYPD: $(isnothing(previous.sypd) ? "Not available" : round(previous.sypd, digits=3))", + ) + if !isnothing(current.sypd) && !isnothing(previous.sypd) + sypd_diff = current.sypd - previous.sypd + sypd_change_pct = (sypd_diff / previous.sypd) * 100 + push!( + report, + "SYPD Change: $(round(sypd_diff, digits=3)) ($(round(sypd_change_pct, digits=1))%)", + ) + end + else + push!(report, "Previous Run SYPD: Not Available") + end + push!(report, "") + + # Error Information + push!(report, "Error Information:") + push!(report, "─" * repeat("─", 30)) + push!( + report, + "Current Run Error Code: $(isnothing(current.error_code) ? "None" : current.error_code)", + ) + if !isnothing(previous) + push!( + report, + "Previous Run Error Code: $(isnothing(previous.error_code) ? "None" : previous.error_code)", + ) + if !isnothing(current.error_code) && + !isnothing(previous.error_code) && + current.error_code != previous.error_code + push!( + report, + "Error Code Changed: $(previous.error_code) → $(current.error_code)", + ) + end + else + push!(report, "Previous Run: Not Available") + end + + return join(report, "\n") end """ - save_log_locally(build_id::String, job_id::String, content::String) -Save a Buildkite log to a local file for offline use. + fetch_buildkite_log(build_id::String, job_id::String) + +Fetch log content from Buildkite API or local file system depending on mode. """ -function save_log_locally(build_id::String, job_id::String, content::String) - # Create log directory if it doesn't exist - mkpath(GLOBAL_CONFIG.log_directory) +function fetch_buildkite_log(build_id::String, job_id::String) + if GLOBAL_CONFIG.offline_mode + log_file = joinpath( + GLOBAL_CONFIG.log_directory, + "build_$(build_id)_job_$(job_id).log", + ) + @info "Reading log for specific job" build_id job_id log_file + return read_log_locally(build_id, job_id) + else + token = get(ENV, "BUILDKITE_TOKEN", nothing) + if isnothing(token) + error( + "BUILDKITE_TOKEN environment variable must be set when not in offline mode", + ) + end - # Create a filename based on build and job IDs - filename = joinpath( - GLOBAL_CONFIG.log_directory, - "build_$(build_id)_job_$(job_id).log", - ) + # Note: This API endpoint is specifically for a single job's logs + api_url = "https://api.buildkite.com/v2/organizations/clima/pipelines/climaatmos-gpulongruns/builds/$build_id/jobs/$job_id/log" + @info "Fetching log for specific job" build_id job_id api_url - # Save the content - write(filename, content) - @info "Saved log to $filename" + headers = Dict("Authorization" => "Bearer $token") + response = HTTP.get(api_url, headers) + content = String(response.body) + + # Save the job-specific log locally for future offline use + try + save_log_locally(build_id, job_id, content) + catch e + @warn "Failed to save log locally" build_id job_id exception=e + end + + return content + end end """ - read_log_locally(build_id::String, job_id::String) -Read a Buildkite log from a local file. + analyze_buildkite_log(build_id::String, job_id::String, step_name::String) + +Analyze a single Buildkite job log and return structured error information. """ -function read_log_locally(build_id::String, job_id::String) - filename = joinpath( - GLOBAL_CONFIG.log_directory, - "build_$(build_id)_job_$(job_id).log", +function analyze_buildkite_log( + build_id::String, + job_id::String, + step_name::String, +) + @info "Analyzing specific job log" build_id job_id step_name + + log_content = fetch_buildkite_log(build_id, job_id) + + # Verify we have the correct job's log by checking for job_id in the content + if !isnothing(match(r"job_id = \"([^\"]+)\"", log_content)) + extracted_job_id = extract_job_id(log_content) + @info "Found job identifier in log" extracted_job_id + end + + error_code = extract_error_code(log_content) + sim_time = extract_sim_time(log_content) + clima_job_id = extract_job_id(log_content) + sypd = extract_sypd(log_content) + build_status = extract_build_status(log_content) + + @info "Analysis results for job" build_id job_id build_status error_code + + return BuildkiteError( + error_code, + sim_time, + job_id, + build_id, + step_name, + now(), + clima_job_id, + sypd, + build_status, ) - if !isfile(filename) - error("Log file not found: $filename") +end + +""" + format_simulation_time(seconds::Float64) + +Convert simulation time to weeks and days format as used in ClimaAtmos. +""" +function format_simulation_time(seconds::Float64) + total_days = seconds / 86400 # Convert seconds to days + weeks = floor(Int, total_days / 7) + remaining_days = round(Int, total_days % 7) + + # Format string components + week_str = weeks == 1 ? "week" : "weeks" + day_str = remaining_days == 1 ? "day" : "days" + + if weeks > 0 + return "$(weeks) $(week_str), $(remaining_days) $(day_str)" + else + return "$(remaining_days) $(day_str)" + end +end + +""" + summarize_buildkite_errors(current_build::String, previous_build::String, job_ids::Vector{Tuple{String,String}}) + +Generate summary reports for multiple jobs, comparing current and previous builds. +""" +function summarize_buildkite_errors( + current_build::String, + previous_build::String, + job_ids::Vector{Tuple{String, String}}, +) + # Temporarily disable info logging + current_logger = Logging.global_logger() + Logging.global_logger(Logging.SimpleLogger(stderr, Logging.Warn)) + + try + summaries = String[] + for (job_id, step_name) in job_ids + try + current_error = + analyze_buildkite_log(current_build, job_id, step_name) + previous_error = try + analyze_buildkite_log(previous_build, job_id, step_name) + catch e + isa(e, SystemError) || + contains(string(e), "Log file not found") ? + nothing : rethrow(e) + end + push!( + summaries, + generate_build_report(current_error, previous_error), + ) + catch e + push!( + summaries, + "Failed to analyze $step_name: $(sprint(showerror, e))", + ) + end + end + return join(summaries, "\n\n") + finally + # Restore original logger + Logging.global_logger(current_logger) end - return read(filename, String) end +########### +# Helper functions for parsing information from logs +########### + """ extract_sim_time(log_text::String) + Extract the simulation time from ClimaAtmos progress reports. Returns the time in seconds if found, nothing otherwise. """ @@ -109,6 +337,7 @@ end """ extract_error_code(log_text::String) + Extract error code from log text. Returns nothing if no error code is found. """ @@ -141,61 +370,23 @@ function extract_error_code(log_text::String) (r"ERROR: AssertionError", 4), ] - for (pattern, code) in error_patterns - if match(pattern, log_text) !== nothing - return code - end - end - - # Check for timeout or cancellation - if contains(log_text, "Cancelling") || contains(log_text, "Canceled") - return -15 # Common signal for termination - end - - return nothing -end - -""" - fetch_buildkite_log(build_id::String, job_id::String) -Fetch log content from Buildkite API or local file system depending on mode. -""" -function fetch_buildkite_log(build_id::String, job_id::String) - if GLOBAL_CONFIG.offline_mode - log_file = joinpath( - GLOBAL_CONFIG.log_directory, - "build_$(build_id)_job_$(job_id).log", - ) - @info "Reading log for specific job" build_id job_id log_file - return read_log_locally(build_id, job_id) - else - token = get(ENV, "BUILDKITE_TOKEN", nothing) - if isnothing(token) - error( - "BUILDKITE_TOKEN environment variable must be set when not in offline mode", - ) - end - - # Note: This API endpoint is specifically for a single job's logs - api_url = "https://api.buildkite.com/v2/organizations/clima/pipelines/climaatmos-gpulongruns/builds/$build_id/jobs/$job_id/log" - @info "Fetching log for specific job" build_id job_id api_url - - headers = Dict("Authorization" => "Bearer $token") - response = HTTP.get(api_url, headers) - content = String(response.body) - - # Save the job-specific log locally for future offline use - try - save_log_locally(build_id, job_id, content) - catch e - @warn "Failed to save log locally" build_id job_id exception=e + for (pattern, code) in error_patterns + if match(pattern, log_text) !== nothing + return code end + end - return content + # Check for timeout or cancellation + if contains(log_text, "Cancelling") || contains(log_text, "Canceled") + return -15 # Common signal for termination end + + return nothing end """ extract_job_id(log_text::String) + Extract the ClimaAtmos job identifier from the log. Returns the job_id string if found, "unknown" otherwise. """ @@ -213,6 +404,7 @@ end """ extract_sypd(log_text::String) + Extract the estimated SYPD (Simulated Years Per Day) from the log. Returns the SYPD as a float if found, nothing otherwise. """ @@ -231,6 +423,7 @@ end """ extract_build_status(log_text::String) + Determine if the build was successful, failed, or cancelled based on log patterns. Returns "Success", "Failed", or "Cancelled". """ @@ -321,228 +514,57 @@ function extract_build_status(log_text::String) return "Failed" end +############ +# Helper functions for running local tests +############ + """ - analyze_buildkite_log(build_id::String, job_id::String, step_name::String) -Analyze a single Buildkite job log and return structured error information. + set_offline_mode(;enabled::Bool = true, log_directory::String = "buildkite_logs") + +Configure the module to run in offline mode using local log files. """ -function analyze_buildkite_log( - build_id::String, - job_id::String, - step_name::String, +function set_offline_mode(; + enabled::Bool = true, + log_directory::String = "buildkite_logs", ) - @info "Analyzing specific job log" build_id job_id step_name - - log_content = fetch_buildkite_log(build_id, job_id) - - # Verify we have the correct job's log by checking for job_id in the content - if !isnothing(match(r"job_id = \"([^\"]+)\"", log_content)) - extracted_job_id = extract_job_id(log_content) - @info "Found job identifier in log" extracted_job_id - end - - error_code = extract_error_code(log_content) - sim_time = extract_sim_time(log_content) - clima_job_id = extract_job_id(log_content) - sypd = extract_sypd(log_content) - build_status = extract_build_status(log_content) - - @info "Analysis results for job" build_id job_id build_status error_code - - return BuildkiteError( - error_code, - sim_time, - job_id, - build_id, - step_name, - now(), - clima_job_id, - sypd, - build_status, - ) + GLOBAL_CONFIG.offline_mode = enabled + GLOBAL_CONFIG.log_directory = log_directory end """ - format_simulation_time(seconds::Float64) -Convert simulation time to weeks and days format as used in ClimaAtmos. -""" -function format_simulation_time(seconds::Float64) - total_days = seconds / 86400 # Convert seconds to days - weeks = floor(Int, total_days / 7) - remaining_days = round(Int, total_days % 7) - - # Format string components - week_str = weeks == 1 ? "week" : "weeks" - day_str = remaining_days == 1 ? "day" : "days" - - if weeks > 0 - return "$(weeks) $(week_str), $(remaining_days) $(day_str)" - else - return "$(remaining_days) $(day_str)" - end -end + save_log_locally(build_id::String, job_id::String, content::String) +Save a Buildkite log to a local file for offline use. """ - generate_build_report(current::BuildkiteError, previous::Union{BuildkiteError, Nothing}=nothing) -Generate a report comparing two builds, or just current build if previous is nothing. -""" -function generate_build_report( - current::BuildkiteError, - previous::Union{BuildkiteError, Nothing} = nothing, -) - status_emoji = Dict("Success" => "✅", "Failed" => "❌", "Cancelled" => "⚠️") - - report = String[] - - # Build Information - push!(report, "Build Information:") - push!(report, "─" * repeat("─", 30)) - push!(report, "ClimaAtmos Job ID: $(current.clima_job_id)") - push!( - report, - "Current Build: $(current.build_id) $(get(status_emoji, current.build_status, "❓")) ($(current.build_status))", - ) - push!( - report, - "Current Build Date: $(Dates.format(current.timestamp, "yyyy-mm-dd HH:MM:SS"))", - ) - push!( - report, - isnothing(previous) ? "Previous Build: Not Available" : - "Previous Build: $(previous.build_id) $(get(status_emoji, previous.build_status, "❓")) ($(previous.build_status))", - ) - push!( - report, - isnothing(previous) ? "Previous Build Date: Not Available" : - "Previous Build Date: $(Dates.format(previous.timestamp, "yyyy-mm-dd HH:MM:SS"))", - ) - push!(report, "") - - # Simulation Progress - push!(report, "Simulation Progress:") - push!(report, "─" * repeat("─", 30)) - push!( - report, - "Current Run: $(isnothing(current.last_sim_time) ? "No simulation time found" : format_simulation_time(current.last_sim_time))", - ) - - if !isnothing(previous) - push!( - report, - "Previous Run: $(isnothing(previous.last_sim_time) ? "No simulation time found" : format_simulation_time(previous.last_sim_time))", - ) - if !isnothing(current.last_sim_time) && - !isnothing(previous.last_sim_time) - time_diff = current.last_sim_time - previous.last_sim_time - diff_days = abs(time_diff) / 86400 - diff_str = - diff_days >= 7 ? - "$(floor(Int, diff_days/7)) weeks, $(round(Int, diff_days%7)) days" : - "$(round(Int, diff_days)) days" - push!( - report, - "Progress Difference: $(diff_str) ($(time_diff < 0 ? "behind" : "ahead"))", - ) - end - else - push!(report, "Previous Run: Not Available") - end - push!(report, "") - - # Performance Metrics - push!(report, "Performance Metrics:") - push!(report, "─" * repeat("─", 30)) - push!( - report, - "Current Run SYPD: $(isnothing(current.sypd) ? "Not available" : round(current.sypd, digits=3))", - ) - - if !isnothing(previous) - push!( - report, - "Previous Run SYPD: $(isnothing(previous.sypd) ? "Not available" : round(previous.sypd, digits=3))", - ) - if !isnothing(current.sypd) && !isnothing(previous.sypd) - sypd_diff = current.sypd - previous.sypd - sypd_change_pct = (sypd_diff / previous.sypd) * 100 - push!( - report, - "SYPD Change: $(round(sypd_diff, digits=3)) ($(round(sypd_change_pct, digits=1))%)", - ) - end - else - push!(report, "Previous Run SYPD: Not Available") - end - push!(report, "") +function save_log_locally(build_id::String, job_id::String, content::String) + # Create log directory if it doesn't exist + mkpath(GLOBAL_CONFIG.log_directory) - # Error Information - push!(report, "Error Information:") - push!(report, "─" * repeat("─", 30)) - push!( - report, - "Current Run Error Code: $(isnothing(current.error_code) ? "None" : current.error_code)", + # Create a filename based on build and job IDs + filename = joinpath( + GLOBAL_CONFIG.log_directory, + "build_$(build_id)_job_$(job_id).log", ) - if !isnothing(previous) - push!( - report, - "Previous Run Error Code: $(isnothing(previous.error_code) ? "None" : previous.error_code)", - ) - if !isnothing(current.error_code) && - !isnothing(previous.error_code) && - current.error_code != previous.error_code - push!( - report, - "Error Code Changed: $(previous.error_code) → $(current.error_code)", - ) - end - else - push!(report, "Previous Run: Not Available") - end - return join(report, "\n") + # Save the content + write(filename, content) + @info "Saved log to $filename" end """ - summarize_buildkite_errors(current_build::String, previous_build::String, job_ids::Vector{Tuple{String,String}}) -Generate summary reports for multiple jobs, comparing current and previous builds. -""" -function summarize_buildkite_errors( - current_build::String, - previous_build::String, - job_ids::Vector{Tuple{String, String}}, -) - # Temporarily disable info logging - current_logger = Logging.global_logger() - Logging.global_logger(Logging.SimpleLogger(stderr, Logging.Warn)) + read_log_locally(build_id::String, job_id::String) - try - summaries = String[] - for (job_id, step_name) in job_ids - try - current_error = - analyze_buildkite_log(current_build, job_id, step_name) - previous_error = try - analyze_buildkite_log(previous_build, job_id, step_name) - catch e - isa(e, SystemError) || - contains(string(e), "Log file not found") ? - nothing : rethrow(e) - end - push!( - summaries, - generate_build_report(current_error, previous_error), - ) - catch e - push!( - summaries, - "Failed to analyze $step_name: $(sprint(showerror, e))", - ) - end - end - return join(summaries, "\n\n") - finally - # Restore original logger - Logging.global_logger(current_logger) +Read a Buildkite log from a local file. +""" +function read_log_locally(build_id::String, job_id::String) + filename = joinpath( + GLOBAL_CONFIG.log_directory, + "build_$(build_id)_job_$(job_id).log", + ) + if !isfile(filename) + error("Log file not found: $filename") end + return read(filename, String) end -end # module +end # module From 9a6a9681679f17eaccad3c4ee6012c2f05c0f1f3 Mon Sep 17 00:00:00 2001 From: Julia Sloan Date: Tue, 15 Jul 2025 11:26:29 -0700 Subject: [PATCH 3/3] debugging http buildkite requests --- .buildkite/buildkite_analysis.jl | 69 ++++++++++++++++++++------- .buildkite/test_buildkite_analysis.jl | 19 +++++--- 2 files changed, 65 insertions(+), 23 deletions(-) diff --git a/.buildkite/buildkite_analysis.jl b/.buildkite/buildkite_analysis.jl index e9b4333ced..afa8de7799 100644 --- a/.buildkite/buildkite_analysis.jl +++ b/.buildkite/buildkite_analysis.jl @@ -23,7 +23,7 @@ struct BuildkiteError build_id::String step_name::String timestamp::DateTime - clima_job_id::String # ClimaAtmos job identifier + clima_job_id::String # ClimaCoupler job identifier sypd::Union{Float64, Nothing} # Simulated Years Per Day build_status::String # Success/Failure/Cancelled status end @@ -54,7 +54,7 @@ function generate_build_report( # Build Information push!(report, "Build Information:") push!(report, "─" * repeat("─", 30)) - push!(report, "ClimaAtmos Job ID: $(current.clima_job_id)") + push!(report, "ClimaCoupler Job ID: $(current.clima_job_id)") push!( report, "Current Build: $(current.build_id) $(get(status_emoji, current.build_status, "❓")) ($(current.build_status))", @@ -160,11 +160,11 @@ function generate_build_report( end """ - fetch_buildkite_log(build_id::String, job_id::String) + fetch_buildkite_log(pipeline::String, build_id::String, job_id::String) Fetch log content from Buildkite API or local file system depending on mode. """ -function fetch_buildkite_log(build_id::String, job_id::String) +function fetch_buildkite_log(pipeline::String, build_id::String, job_id::String) if GLOBAL_CONFIG.offline_mode log_file = joinpath( GLOBAL_CONFIG.log_directory, @@ -180,12 +180,45 @@ function fetch_buildkite_log(build_id::String, job_id::String) ) end + # # Fetch all jobs for this build + # job_list_url = "https://api.buildkite.com/v2/organizations/clima/pipelines/climacoupler-longruns/builds/915/jobs" + # headers = headers = [ + # "Authorization" => "Bearer $token", + # "Accept" => "application/json" + # ] + + + # @info "fetching jobs list" + # response = HTTP.get(job_list_url, headers) + # jobs = JSON3.read(String(response.body)) + + # for job in jobs + # println("Job ID: ", job["id"]) + # println("Type: ", job["type"]) + # println("Command: ", get(job, "command", "N/A")) + # println("---") + # end + + # @assert false + # Note: This API endpoint is specifically for a single job's logs - api_url = "https://api.buildkite.com/v2/organizations/clima/pipelines/climaatmos-gpulongruns/builds/$build_id/jobs/$job_id/log" - @info "Fetching log for specific job" build_id job_id api_url + api_url = "https://api.buildkite.com/v2/organizations/clima/pipelines/$pipeline/builds/$build_id/jobs/$job_id/log" + @show "Fetching log for specific job" build_id job_id api_url + + headers = [ + "Authorization" => "Bearer $token", + "Accept" => "application/json" + ] + + + @info "Final API URL: $api_url" + @info "Headers: $headers" - headers = Dict("Authorization" => "Bearer $token") response = HTTP.get(api_url, headers) + + @info "Response: $(response.status) $(String(response.body))" + println("Status: ", response.status) + println("Body: ", String(response.body)) content = String(response.body) # Save the job-specific log locally for future offline use @@ -200,18 +233,19 @@ function fetch_buildkite_log(build_id::String, job_id::String) end """ - analyze_buildkite_log(build_id::String, job_id::String, step_name::String) + analyze_buildkite_log(pipeline::String, build_id::String, job_id::String, step_name::String) Analyze a single Buildkite job log and return structured error information. """ function analyze_buildkite_log( + pipeline::String, build_id::String, job_id::String, step_name::String, ) @info "Analyzing specific job log" build_id job_id step_name - log_content = fetch_buildkite_log(build_id, job_id) + log_content = fetch_buildkite_log(pipeline, build_id, job_id) # Verify we have the correct job's log by checking for job_id in the content if !isnothing(match(r"job_id = \"([^\"]+)\"", log_content)) @@ -243,7 +277,7 @@ end """ format_simulation_time(seconds::Float64) -Convert simulation time to weeks and days format as used in ClimaAtmos. +Convert simulation time to weeks and days format as used in ClimaCoupler. """ function format_simulation_time(seconds::Float64) total_days = seconds / 86400 # Convert seconds to days @@ -262,11 +296,12 @@ function format_simulation_time(seconds::Float64) end """ - summarize_buildkite_errors(current_build::String, previous_build::String, job_ids::Vector{Tuple{String,String}}) + summarize_buildkite_errors(pipeline::String, current_build::String, previous_build::String, job_ids::Vector{Tuple{String,String}}) Generate summary reports for multiple jobs, comparing current and previous builds. """ function summarize_buildkite_errors( + pipeline::String, current_build::String, previous_build::String, job_ids::Vector{Tuple{String, String}}, @@ -275,14 +310,16 @@ function summarize_buildkite_errors( current_logger = Logging.global_logger() Logging.global_logger(Logging.SimpleLogger(stderr, Logging.Warn)) + @info "Summarizing errors for pipeline" pipeline current_build previous_build + try summaries = String[] for (job_id, step_name) in job_ids try current_error = - analyze_buildkite_log(current_build, job_id, step_name) + analyze_buildkite_log(pipeline, current_build, job_id, step_name) previous_error = try - analyze_buildkite_log(previous_build, job_id, step_name) + analyze_buildkite_log(pipeline, previous_build, job_id, step_name) catch e isa(e, SystemError) || contains(string(e), "Log file not found") ? @@ -313,7 +350,7 @@ end """ extract_sim_time(log_text::String) -Extract the simulation time from ClimaAtmos progress reports. +Extract the simulation time from ClimaCoupler progress reports. Returns the time in seconds if found, nothing otherwise. """ function extract_sim_time(log_text::String) @@ -387,7 +424,7 @@ end """ extract_job_id(log_text::String) -Extract the ClimaAtmos job identifier from the log. +Extract the ClimaCoupler job identifier from the log. Returns the job_id string if found, "unknown" otherwise. """ function extract_job_id(log_text::String) @@ -497,7 +534,7 @@ function extract_build_status(log_text::String) end end - # For ClimaAtmos, if we see progress reports with high completion percentage + # For ClimaCoupler, if we see progress reports with high completion percentage # and no critical failures, consider it a success if occursin(r"percent_complete = \"[0-9]+\.?[0-9]*%\"", log_text) && !any(p -> occursin(p, log_text), critical_failure_patterns) diff --git a/.buildkite/test_buildkite_analysis.jl b/.buildkite/test_buildkite_analysis.jl index 54daefd220..f58716636a 100644 --- a/.buildkite/test_buildkite_analysis.jl +++ b/.buildkite/test_buildkite_analysis.jl @@ -10,9 +10,11 @@ using .BuildkiteAnalysis: format_simulation_time # Test configuration -current_build_id = "577" -previous_build_id = "576" -job_ids = [("computer-hydrostatic-balance", "GPU Long Run Test")] +pipeline = "climacoupler-longruns" +current_build_id = "915" +previous_build_id = "914" +job_id_step_names = [("amip_diagedmf_topo_integrated_land_gpu", "GPU AMIP + diag. EDMF + Earth topography + integrated land"), +("amip_edonly_topo_integrated_land_gpu", "GPU AMIP + ED only + Earth topography + integrated land")] # Function to set up offline test environment with actual log file function setup_offline_test() @@ -43,14 +45,17 @@ function cleanup_test() end # Run the analysis with actual log file -function run_analysis() - setup_offline_test() +function run_analysis(is_offline::Bool = true) + is_offline && setup_offline_test() + + pipeline = "climacoupler-longruns" try summary = summarize_buildkite_errors( + pipeline, current_build_id, previous_build_id, - job_ids, + job_id_step_names, ) println("\nBuildkite Analysis Summary:") println("=================================") @@ -64,4 +69,4 @@ function run_analysis() end # Run the analysis -run_analysis() +run_analysis(false)