diff --git a/01_install_requirements.sh b/01_install_requirements.sh
index 8fe354c05..714f6051e 100755
--- a/01_install_requirements.sh
+++ b/01_install_requirements.sh
@@ -39,8 +39,8 @@ sudo dnf -y clean all
 
 old_version=$(sudo dnf info NetworkManager | grep Version | cut -d ':' -f 2)
 
-# Update to latest packages first
-sudo dnf -y upgrade --nobest
+dnf_with_retries -y upgrade --nobest
+echo "System upgraded successfully."
 
 new_version=$(sudo dnf info NetworkManager | grep Version | cut -d ':' -f 2)
 # If NetworkManager was upgraded it needs to be restarted
@@ -57,7 +57,7 @@ source /etc/os-release
 # All of those are needed because we're still behind for OS support.
 # passlib needs to be installed as system dependency
 if [[ -x "/usr/libexec/platform-python" ]]; then
-  sudo /usr/libexec/platform-python -m pip install passlib || sudo dnf -y install python3-pip && sudo /usr/libexec/platform-python -m pip install passlib
+  sudo /usr/libexec/platform-python -m pip install passlib || { dnf_with_retries -y install --nobest python3-pip && sudo /usr/libexec/platform-python -m pip install passlib; }
 fi
 
 # Install ansible, other packages are installed via
@@ -65,16 +65,16 @@ fi
 case $DISTRO in
   "centos8"|"rhel8"|"almalinux8"|"rocky8")
     # install network-scripts package to be able to use legacy network commands
-    sudo dnf install -y network-scripts
+    dnf_with_retries install -y --nobest network-scripts
     if [[ $DISTRO == "centos8" ]] && [[ "$NAME" != *"Stream"* ]]; then
       echo "CentOS is not supported, please switch to CentOS Stream / RHEL / Rocky / Alma"
       exit 1
     fi
     if [[ $DISTRO == "centos8" || $DISTRO == "almalinux8" || $DISTRO == "rocky8" ]]; then
-      sudo dnf -y install epel-release dnf --enablerepo=extras
+      dnf_with_retries -y install --nobest epel-release dnf --enablerepo=extras
     elif [[ $DISTRO == "rhel8" ]]; then
       # Enable EPEL for python3-passlib and python3-bcrypt required by metal3-dev-env
-      sudo dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
+      dnf_with_retries -y install --nobest https://dl.fedoraproject.org/pub/epel/epel-release-latest-8.noarch.rpm
       if sudo subscription-manager repos --list-enabled 2>&1 | grep "ansible-2-for-rhel-8-$(uname -m)-rpms"; then
         # The packaged 2.x ansible is too old for compatibility with metal3-dev-env
         sudo dnf erase -y ansible
@@ -82,17 +82,17 @@ case $DISTRO in
       fi
     fi
     # Note recent ansible needs python >= 3.8 so we install 3.9 here
-    sudo dnf -y install python39
+    dnf_with_retries -y install --nobest python39
     sudo alternatives --set python /usr/bin/python3.9
     sudo alternatives --set python3 /usr/bin/python3.9
     sudo update-alternatives --install /usr/bin/pip3 pip3 /usr/bin/pip3.9 1
     PYTHON_DEVEL="python39-devel"
     ;;
   "centos9"|"rhel9"|"almalinux9"|"rocky9")
-    sudo dnf -y install python3-pip
+    dnf_with_retries -y install --nobest python3-pip
     if [[ $DISTRO == "centos9" || $DISTRO == "almalinux9" || $DISTRO == "rocky9" ]] ; then
       sudo dnf config-manager --set-enabled crb
-      sudo dnf -y install epel-release
+      dnf_with_retries -y install --nobest epel-release
     elif [[ $DISTRO == "rhel9" ]]; then
       # NOTE(raukadah): If a system is subscribed to RHEL subscription then
       # sudo subscription-manager identity will return exit 0 else 1.
@@ -101,7 +101,7 @@ case $DISTRO in
         # enable the CRB repository
         sudo subscription-manager repos --enable codeready-builder-for-rhel-9-$(arch)-rpms
       fi
-      sudo dnf -y install https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
+      dnf_with_retries -y install --nobest https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm
     fi
     sudo ln -s /usr/bin/python3 /usr/bin/python || true
     PYTHON_DEVEL="python3-devel"
@@ -127,7 +127,7 @@ GO_VERSION=${GO_VERSION:-1.22.3}
 GOARCH=$(uname -m)
 if [[ $GOARCH == "aarch64" ]]; then
   GOARCH="arm64"
-  sudo dnf -y install $PYTHON_DEVEL libxml2-devel libxslt-devel
+  dnf_with_retries -y install --nobest $PYTHON_DEVEL libxml2-devel libxslt-devel
 elif [[ $GOARCH == "x86_64" ]]; then
   GOARCH="amd64"
 fi
@@ -155,16 +155,16 @@ popd
 
 if [ -n "${KNI_INSTALL_FROM_GIT}" ]; then
   # zip is required for building the installer from source
-  sudo dnf -y install zip
+  dnf_with_retries -y --nobest install zip
 fi
 
 # Install nfs for persistent volumes
 if [ "${PERSISTENT_IMAGEREG}" == true ] ; then
-  sudo dnf -y install nfs-utils
+  dnf_with_retries -y --nobest install nfs-utils
 fi
 
 if [[ "${NODES_PLATFORM}" == "baremetal" ]] ; then
-  sudo dnf -y install ipmitool
+  dnf_with_retries -y --nobest install ipmitool
 fi
 
 # needed if we are using locally built images
diff --git a/agent/01_agent_requirements.sh b/agent/01_agent_requirements.sh
index 1206e8d43..a2db79149 100755
--- a/agent/01_agent_requirements.sh
+++ b/agent/01_agent_requirements.sh
@@ -51,9 +51,9 @@ fi
 
 if [[ "${AGENT_E2E_TEST_BOOT_MODE}" == "ISCSI" ]]; then
   # Install shell to administer local storage
-  sudo dnf -y install targetcli
+  dnf_with_retries -y --nobest install targetcli
 fi
 
 if [[ "${AGENT_E2E_TEST_BOOT_MODE}" == "ISO_NO_REGISTRY" ]]; then
-  sudo dnf -y install xorriso coreos-installer syslinux skopeo
+  dnf_with_retries -y --nobest install xorriso coreos-installer syslinux skopeo
 fi
diff --git a/ocp_install_env.sh b/ocp_install_env.sh
index 241c62054..521ca000d 100644
--- 
a/ocp_install_env.sh
+++ b/ocp_install_env.sh
@@ -19,15 +19,35 @@ function extract_command() {
     local cmd
     local outdir
     local extract_dir
+    local MAX_RETRIES=5
+    local SLEEP_BETWEEN=10
+    local attempt
 
     cmd="$1"
     release_image="$2"
     outdir="$3"
 
-    extract_dir=$(mktemp --tmpdir -d "installer--XXXXXXXXXX")
-    _tmpfiles="$_tmpfiles $extract_dir"
+    # Retry loop for oc adm release extract to handle quay.io blips
+    for attempt in $(seq 1 $MAX_RETRIES); do
+        extract_dir=$(mktemp --tmpdir -d "installer--XXXXXXXXXX")
 
-    oc adm release extract --registry-config "${PULL_SECRET_FILE}" --command=$cmd --to "${extract_dir}" ${release_image}
+        if oc adm release extract --registry-config "${PULL_SECRET_FILE}" --command="$cmd" --to "${extract_dir}" "${release_image}"; then
+            echo "Successfully extracted $cmd"
+            break
+        fi
+
+        # Remove the directory of every failed attempt (including the last one) so it is not leaked
+        rm -rf "${extract_dir}"
+        if [[ $attempt -lt $MAX_RETRIES ]]; then
+            echo "Extraction failed, retrying in ${SLEEP_BETWEEN}s..."
+            sleep "${SLEEP_BETWEEN}"
+        else
+            echo "Failed to extract $cmd from ${release_image} after $MAX_RETRIES attempts"
+            return 1
+        fi
+    done
+
+    _tmpfiles="$_tmpfiles $extract_dir"
 
     if [[ $cmd == "oc.rhel8" ]]; then
         cmd="oc"
diff --git a/utils.sh b/utils.sh
index 454c28cf2..2c5cd6e4f 100755
--- a/utils.sh
+++ b/utils.sh
@@ -28,6 +28,37 @@ function retry_with_timeout() {
     return $(( exit_code ))
 }
 
+# Run a dnf command with retries and cache cleaning
+# Arguments: the dnf arguments only (e.g. "-y install zip"); "sudo dnf" is
+# prepended here, so callers must not prefix the call with sudo or dnf.
+dnf_with_retries() {
+  local max_retries=5
+  local delay=15
+  local attempt=1
+
+  while (( attempt <= max_retries )); do
+    echo "Attempt $attempt of $max_retries: sudo dnf $*"
+
+    if sudo dnf "$@"; then
+      echo "sudo dnf $* succeeded."
+      return 0
+    fi
+
+    echo "sudo dnf $* failed on attempt $attempt."
+    if (( attempt < max_retries )); then
+      echo "Cleaning DNF cache and retrying after $delay seconds..."
+      sudo dnf clean all || true
+      sudo rm -rf /var/cache/dnf/* || true
+      sleep "$delay"
+    fi
+
+    (( attempt++ ))
+  done
+
+  echo "ERROR: sudo dnf $* failed after $max_retries attempts." >&2
+  return 1
+}
+
 function generate_assets() {
   rm -rf assets/generated && mkdir assets/generated
   for file in $(find assets/templates/ -iname '*.yaml' -type f -printf "%P\n"); do
@@ -617,6 +648,25 @@ EOF
     if [[ "$reg_state" != "running" || $restart_registry -eq 1 ]]; then
         sudo podman rm registry -f || true
 
+        MAX_RETRIES=5
+        _PULL_RETRY_DELAY=10
+
+        # Try pulling the image first to tolerate quay.io errors like 504s.
+        for attempt in $(seq 1 $MAX_RETRIES); do
+            if sudo podman pull "${DOCKER_REGISTRY_IMAGE}"; then
+                echo "Successfully pulled ${DOCKER_REGISTRY_IMAGE}"
+                break
+            fi
+
+            if [[ $attempt -lt $MAX_RETRIES ]]; then
+                echo "Pull failed, retrying in ${_PULL_RETRY_DELAY}s..."
+                sleep "${_PULL_RETRY_DELAY}"
+            else
+                echo "Failed to pull ${DOCKER_REGISTRY_IMAGE} after $MAX_RETRIES attempts"
+                exit 1
+            fi
+        done
+
         sudo podman run -d --name registry --net=host --privileged \
              -v ${REGISTRY_DIR}/data:/var/lib/registry:z \
              -v ${REGISTRY_DIR}/auth:/auth:z \