From 4e73179ef8cd0324806f61dabb24d33d10066b16 Mon Sep 17 00:00:00 2001 From: Taher Kathanawala Date: Thu, 13 Feb 2025 14:39:24 +0530 Subject: [PATCH 1/2] update runner group in case of error --- main.sh | 54 ++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 6 deletions(-) diff --git a/main.sh b/main.sh index fb73059..ed8bc89 100755 --- a/main.sh +++ b/main.sh @@ -126,6 +126,7 @@ log_date() { #{{{ err() { #{{{ printf "%s ${C_RED_BOLD}ERROR: ${C_RESET}%s${C_BOLD} %s${C_RESET} %s\n" "$(log_date)" "${1}" "${2}" "${@:3}" >&2 + update_runner_group "${1} ${2} ${@:3}" } #}}}: err @@ -135,6 +136,7 @@ log_err() { #{{{ msg="$(tail -n1 "$LOG_FILE" | cut -d":" -f2-)" err="$(tail -n1 "$LOG_FILE" | cut -d":" -f1)" printf "%s ${C_RED_BOLD}ERROR: ${C_RESET}%s${C_BOLD} %s${C_RESET}\n" "$(log_date)" "$err" "$msg" >&2 + update_runner_group "${err} ${msg}" } #}}}: log_err @@ -339,6 +341,7 @@ check_fluentbit_status() { #{{{ if ignore_fluentbit_errors; then debug "Ignoring Fluentbit error(s) $err_msg" else + # TODO: update runner group with error err "Fluentbit encountered error(s)" "$err_msg" if ! no_clean_on_fail; then clean_local_setup & spinner "$!" "Starting cleanup" @@ -348,6 +351,7 @@ check_fluentbit_status() { #{{{ info "WARNING:" "If retrying a new registration, do not use --no-clean-on-fail as it leaves the system in an inconsistent state only useful for debugging purposes" fi info "Use --ignore-fluentbit-errors to ignore errors and proceed with the registration process" + # TODO: update runner group with error exit 1 fi else @@ -491,24 +495,37 @@ cgroupsv2() { #{{{ } #}}}: cgroupsv2 +update_runner_group(){ + url="${SG_BASE_API}/orgs/${ORGANIZATION_ID}/runnergroups/${RUNNER_GROUP_ID}/" + + payload="{ \"RegistrationErrors\": { \"${RUNNER_ID}\": \"$(echo -n "$1" | sed 's/[][\\.^$*\/|(){}?&"\'']//g')\" } }" + + if api_call "PATCH" "$payload"; then + debug "updated runner group with error msg" + else + debug "failed to update runner group with error msg" + fi +} + api_call() { #{{{ # TODO: Support draining of instance - if [[ -n "$1" ]]; then + if [[ -n "$2" ]]; then response=$(curl --max-time 10 -i -s \ - -X POST \ + -X "$1" \ -H "Authorization: apikey ${SG_NODE_TOKEN}" \ -H "Content-Type: application/json" \ - -d "$1" \ + -d "$2" \ "${url}") else response=$(curl --max-time 10 -i -s \ - -X POST \ + -X "$1" \ -H "Authorization: apikey ${SG_NODE_TOKEN}" \ -H "Content-Type: application/json" \ "${url}") fi if [[ -z "$response" ]]; then + # TODO: update runner group with error exit 1 else full_response="$response" @@ -538,6 +555,7 @@ api_call() { #{{{ if [[ -z "$status_code" ]]; then err "Unknown status code." + # TODO: update runner group with error exit 1 elif [ "$status_code" != "200" ] && [ "$status_code" != "201" ] && [ "$status_code" != "100" ]; then return 1 @@ -694,6 +712,8 @@ clean_local_setup() { #{{{ clean_cron + [[ -e "/tmp/env_variables.sh" ]] && source /tmp/env_variables.sh || : + # Wait for AWS SSM Managed Instance to deregister on AWS side sleep 10s @@ -703,6 +723,7 @@ clean_local_setup() { #{{{ check_variable_value() { #{{{ local variable_name=$1 + # TODO: update runner group with error [[ -z "${!variable_name}" ]] && \ err "Variable can't be empty" "$variable_name" && exit 1 return 0 @@ -991,18 +1012,21 @@ fetch_organization_info() { #{{{ debug "Calling URL:" "${url}" - if api_call; then + if api_call "POST"; then spinner_msg "Trying to fetch registration data" 0 spinner_wait "Preparing environment.." metadata="$(echo "${response}" | jq -r '.data.RegistrationMetadata[0]')" if [[ "$metadata" == "null" || -z "$metadata" ]]; then spinner_msg "Preparing environment.." 1 + # TODO: update runner group with error err "API data missing registration metadata." exit 1 fi else spinner_msg "Trying to fetch registration data" 1 + # TODO: update runner group with error err "Could not fetch data from API." "$status_code" "$message" + # TODO: update runner group with error exit 1 fi spinner_msg "Preparing environment" 0 @@ -1049,6 +1073,7 @@ fetch_organization_info() { #{{{ info "AWS role is used for S3 Storage Backend auth" "$S3_AWS_ROLE_ARN" else err "Auth for storage backend is not correctly configured, neither AWS_STATIC nor AWS_RBAC integration is provided" + # TODO: update runner group with error exit 1 fi elif [[ "$STORAGE_BACKEND_TYPE" == "azure_blob_storage" ]]; then @@ -1057,6 +1082,7 @@ fetch_organization_info() { #{{{ done else err "Unsupported storage backend type!" + # TODO: update runner group with error exit 1 fi @@ -1200,6 +1226,7 @@ register_instance() { #{{{ debug "Response:" "$(cat $LOG_FILE)" spinner_msg "Downloading support files" 1 err "Unable to download" "ecs-anywhere-install.sh" "script" + # TODO: update runner group with error exit 1 fi spinner_msg "Downloading support files" 0 @@ -1248,6 +1275,7 @@ register_instance() { #{{{ kill "$ecs_anywhere_pid" >&/dev/null sleep 2 echo "${err}:${msg}" >> "$LOG_FILE" + # TODO: update runner group with error exit 1 fi done & spinner "$!" "Verifying registration of this runner" @@ -1278,6 +1306,7 @@ deregister_instance() { #{{{ | jq -r '.sg_runner_group_id')" if [[ "$RUNNER_GROUP_ID_ECS_CONFIG" != "$RUNNER_GROUP_ID" ]]; then err "Different configured and provided --runner-group. Configured: $RUNNER_GROUP_ID_ECS_CONFIG, Provided: $RUNNER_GROUP_ID" + # TODO: update runner group with error exit 1 fi RUNNER_ID="$(grep ECS_INSTANCE_ATTRIBUTES /etc/ecs/ecs.config \ @@ -1287,6 +1316,7 @@ deregister_instance() { #{{{ if ! force_exec; then err "Instance probably deregistered" cmd_example "Try rerunning with" "-f/--force" "to force local cleanup" + # TODO: update runner group with error exit 1 fi fi @@ -1301,7 +1331,7 @@ deregister_instance() { #{{{ spinner_wait "Trying to deregister instance.." if [[ "$LOG_DEBUG" =~ true|True ]]; then printf "\n"; fi - if api_call "$payload"; then + if api_call "POST" "$payload"; then spinner_msg "Trying to deregister instance" 0 # if deregister_ssm_instance_response is [], remove /var/lib/amazon/ssm dir as the MI does not exist anymore, the below logic fails if the ssm deregister api does not return a response but the instance is still alive # echo $data @@ -1321,6 +1351,7 @@ deregister_instance() { #{{{ else info "Deregister with -f/--force to force local cleanup. Needed if you are registering this machine again." fi + # TODO: update runner group with error exit 1 fi } @@ -1429,9 +1460,11 @@ check_arg_value() { #{{{ ## TODO: make sure to validate double parameter input if [[ "${2:0:2}" == "--" ]]; then err "Argument" "${1}" "has invalid value: $2" + # TODO: update runner group with error exit 1 elif [[ -z "${2}" ]]; then err "Argument" "${1}" "can't be empty" + # TODO: update runner group with error exit 1 fi return 0 @@ -1441,6 +1474,7 @@ check_arg_value() { #{{{ is_root() { #{{{ if (( $(id -u) != 0 )); then err "This script must be run as" "root" + # TODO: update runner group with error exit 1 fi return 0 @@ -1450,15 +1484,18 @@ is_root() { #{{{ init_args_are_valid() { #{{{ if [[ ! "$1" =~ ^register$|^deregister$|^status$|^info$|^prune$|^cgroupsv2|^clean$ ]]; then err "Provided option" "${1}" "is invalid" + # TODO: update runner group with error exit 1 elif [[ "$1" == "cgroupsv2" && ! "$2" =~ ^enable$|^disable$ ]]; then err "Arguments:" "enable, disable" "are required." + # TODO: update runner group with error exit 1 elif [[ "$1" =~ register|deregister && \ ( ! "$*" =~ --sg-node-token || \ ! "$*" =~ --organization || \ ! "$*" =~ --runner-group ) ]]; then err "Arguments:" "--sg-node-token, --organization, --runner-group" "are required" + # TODO: update runner group with error exit 1 fi return 0 @@ -1470,6 +1507,7 @@ check_sg_args() { #{{{ || -z "${ORGANIZATION_ID}" \ || -z "${RUNNER_GROUP_ID}" ]]; then err "Arguments: " "--sg-node-token, --organization, --runner-group" "are required" + # TODO: update runner group with error exit 1 fi return 0 @@ -1523,6 +1561,7 @@ parse_arguments() { #{{{ *) [[ -z "${1}" ]] && break err "Invalid argument:" "${1}" + # TODO: update runner group with error exit 1 ;; esac @@ -1598,6 +1637,7 @@ main() { #{{{ if [[ ! -d /run/systemd/system ]]; then err "Private runner is only available for" "systemd-based" "systems" + # TODO: update runner group with error exit 1 fi @@ -1624,6 +1664,7 @@ main() { #{{{ cmds+=( "$cmd" ) fi done + # TODO: update runner group with error (( ${#cmds[@]}>0 )) && \ err "Commands" "${cmds[*]}" "not installed" && exit 1 @@ -1643,6 +1684,7 @@ main() { #{{{ if [[ -z $CONTAINER_ORCHESTRATOR ]]; then err "One of following container orchestrators required:" "${CONTAINER_ORCHESTRATORS[*]}" + # TODO: update runner group with error exit 1 fi From 1aab2ee0a026c8753e846872ee031d749746e734 Mon Sep 17 00:00:00 2001 From: Taher Kathanawala Date: Wed, 19 Feb 2025 18:05:31 +0530 Subject: [PATCH 2/2] change error dict fields --- main.sh | 32 +++++--------------------------- 1 file changed, 5 insertions(+), 27 deletions(-) diff --git a/main.sh b/main.sh index ed8bc89..4fa5668 100755 --- a/main.sh +++ b/main.sh @@ -341,7 +341,6 @@ check_fluentbit_status() { #{{{ if ignore_fluentbit_errors; then debug "Ignoring Fluentbit error(s) $err_msg" else - # TODO: update runner group with error err "Fluentbit encountered error(s)" "$err_msg" if ! no_clean_on_fail; then clean_local_setup & spinner "$!" "Starting cleanup" @@ -351,7 +350,6 @@ check_fluentbit_status() { #{{{ info "WARNING:" "If retrying a new registration, do not use --no-clean-on-fail as it leaves the system in an inconsistent state only useful for debugging purposes" fi info "Use --ignore-fluentbit-errors to ignore errors and proceed with the registration process" - # TODO: update runner group with error exit 1 fi else @@ -498,7 +496,11 @@ cgroupsv2() { #{{{ update_runner_group(){ url="${SG_BASE_API}/orgs/${ORGANIZATION_ID}/runnergroups/${RUNNER_GROUP_ID}/" - payload="{ \"RegistrationErrors\": { \"${RUNNER_ID}\": \"$(echo -n "$1" | sed 's/[][\\.^$*\/|(){}?&"\'']//g')\" } }" + err_msg=$(echo -n "$1" | tr -cd "[:print:]") + + debug "Error message ${err_msg}" + + payload="{ \"RunnerRegistrationErrors\": { \"$(ip route | grep default | cut -d" " -f9)\" : { \"RunnerId\": \"${RUNNER_ID}\" , \"error\": \"${err_msg}\", \"timestamp\": \"$( date -u -Iseconds )\", \"command\": \"${0} ${@}\" } } }" if api_call "PATCH" "$payload"; then debug "updated runner group with error msg" @@ -525,7 +527,6 @@ api_call() { #{{{ fi if [[ -z "$response" ]]; then - # TODO: update runner group with error exit 1 else full_response="$response" @@ -555,7 +556,6 @@ api_call() { #{{{ if [[ -z "$status_code" ]]; then err "Unknown status code." - # TODO: update runner group with error exit 1 elif [ "$status_code" != "200" ] && [ "$status_code" != "201" ] && [ "$status_code" != "100" ]; then return 1 @@ -723,7 +723,6 @@ clean_local_setup() { #{{{ check_variable_value() { #{{{ local variable_name=$1 - # TODO: update runner group with error [[ -z "${!variable_name}" ]] && \ err "Variable can't be empty" "$variable_name" && exit 1 return 0 @@ -1018,15 +1017,12 @@ fetch_organization_info() { #{{{ metadata="$(echo "${response}" | jq -r '.data.RegistrationMetadata[0]')" if [[ "$metadata" == "null" || -z "$metadata" ]]; then spinner_msg "Preparing environment.." 1 - # TODO: update runner group with error err "API data missing registration metadata." exit 1 fi else spinner_msg "Trying to fetch registration data" 1 - # TODO: update runner group with error err "Could not fetch data from API." "$status_code" "$message" - # TODO: update runner group with error exit 1 fi spinner_msg "Preparing environment" 0 @@ -1073,7 +1069,6 @@ fetch_organization_info() { #{{{ info "AWS role is used for S3 Storage Backend auth" "$S3_AWS_ROLE_ARN" else err "Auth for storage backend is not correctly configured, neither AWS_STATIC nor AWS_RBAC integration is provided" - # TODO: update runner group with error exit 1 fi elif [[ "$STORAGE_BACKEND_TYPE" == "azure_blob_storage" ]]; then @@ -1082,7 +1077,6 @@ fetch_organization_info() { #{{{ done else err "Unsupported storage backend type!" - # TODO: update runner group with error exit 1 fi @@ -1226,7 +1220,6 @@ register_instance() { #{{{ debug "Response:" "$(cat $LOG_FILE)" spinner_msg "Downloading support files" 1 err "Unable to download" "ecs-anywhere-install.sh" "script" - # TODO: update runner group with error exit 1 fi spinner_msg "Downloading support files" 0 @@ -1275,7 +1268,6 @@ register_instance() { #{{{ kill "$ecs_anywhere_pid" >&/dev/null sleep 2 echo "${err}:${msg}" >> "$LOG_FILE" - # TODO: update runner group with error exit 1 fi done & spinner "$!" "Verifying registration of this runner" @@ -1306,7 +1298,6 @@ deregister_instance() { #{{{ | jq -r '.sg_runner_group_id')" if [[ "$RUNNER_GROUP_ID_ECS_CONFIG" != "$RUNNER_GROUP_ID" ]]; then err "Different configured and provided --runner-group. Configured: $RUNNER_GROUP_ID_ECS_CONFIG, Provided: $RUNNER_GROUP_ID" - # TODO: update runner group with error exit 1 fi RUNNER_ID="$(grep ECS_INSTANCE_ATTRIBUTES /etc/ecs/ecs.config \ @@ -1316,7 +1307,6 @@ deregister_instance() { #{{{ if ! force_exec; then err "Instance probably deregistered" cmd_example "Try rerunning with" "-f/--force" "to force local cleanup" - # TODO: update runner group with error exit 1 fi fi @@ -1351,7 +1341,6 @@ deregister_instance() { #{{{ else info "Deregister with -f/--force to force local cleanup. Needed if you are registering this machine again." fi - # TODO: update runner group with error exit 1 fi } @@ -1460,11 +1449,9 @@ check_arg_value() { #{{{ ## TODO: make sure to validate double parameter input if [[ "${2:0:2}" == "--" ]]; then err "Argument" "${1}" "has invalid value: $2" - # TODO: update runner group with error exit 1 elif [[ -z "${2}" ]]; then err "Argument" "${1}" "can't be empty" - # TODO: update runner group with error exit 1 fi return 0 @@ -1474,7 +1461,6 @@ check_arg_value() { #{{{ is_root() { #{{{ if (( $(id -u) != 0 )); then err "This script must be run as" "root" - # TODO: update runner group with error exit 1 fi return 0 @@ -1484,18 +1470,15 @@ is_root() { #{{{ init_args_are_valid() { #{{{ if [[ ! "$1" =~ ^register$|^deregister$|^status$|^info$|^prune$|^cgroupsv2|^clean$ ]]; then err "Provided option" "${1}" "is invalid" - # TODO: update runner group with error exit 1 elif [[ "$1" == "cgroupsv2" && ! "$2" =~ ^enable$|^disable$ ]]; then err "Arguments:" "enable, disable" "are required." - # TODO: update runner group with error exit 1 elif [[ "$1" =~ register|deregister && \ ( ! "$*" =~ --sg-node-token || \ ! "$*" =~ --organization || \ ! "$*" =~ --runner-group ) ]]; then err "Arguments:" "--sg-node-token, --organization, --runner-group" "are required" - # TODO: update runner group with error exit 1 fi return 0 @@ -1507,7 +1490,6 @@ check_sg_args() { #{{{ || -z "${ORGANIZATION_ID}" \ || -z "${RUNNER_GROUP_ID}" ]]; then err "Arguments: " "--sg-node-token, --organization, --runner-group" "are required" - # TODO: update runner group with error exit 1 fi return 0 @@ -1561,7 +1543,6 @@ parse_arguments() { #{{{ *) [[ -z "${1}" ]] && break err "Invalid argument:" "${1}" - # TODO: update runner group with error exit 1 ;; esac @@ -1637,7 +1618,6 @@ main() { #{{{ if [[ ! -d /run/systemd/system ]]; then err "Private runner is only available for" "systemd-based" "systems" - # TODO: update runner group with error exit 1 fi @@ -1664,7 +1644,6 @@ main() { #{{{ cmds+=( "$cmd" ) fi done - # TODO: update runner group with error (( ${#cmds[@]}>0 )) && \ err "Commands" "${cmds[*]}" "not installed" && exit 1 @@ -1684,7 +1663,6 @@ main() { #{{{ if [[ -z $CONTAINER_ORCHESTRATOR ]]; then err "One of following container orchestrators required:" "${CONTAINER_ORCHESTRATORS[*]}" - # TODO: update runner group with error exit 1 fi