Skip to content

Commit bf41d14

Browse files
jlebon authored and dustymabe committed
buildupload: bump retry period to 5 minutes
It's incredibly expensive when we flake on something at the very end of the pipeline when uploading S3 artifacts; all the created artifacts are lost and we have to rerun a whole new build. We currently only retry for 10 seconds, which makes sense for truly transient flakes, but for uploads, given the stakes, let's be more resilient to flakes that could take a bit longer to resolve as well, like DNS resolution issues. Retry for 5 minutes, with an exponential backoff of up to 10 seconds.
1 parent 4a443c2 commit bf41d14

File tree

2 files changed

+15
-4
lines changed

2 files changed

+15
-4
lines changed

src/cmd-buildupload

+11-3
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,13 @@ CACHE_MAX_AGE_ARTIFACT = 60 * 60 * 24 * 365
2222
# set metadata caching to 5m
2323
CACHE_MAX_AGE_METADATA = 60 * 5
2424
from cosalib.builds import Builds, BUILDFILES
25-
from cosalib.cmdlib import load_json, retry_stop, retry_boto_exception, retry_callback # noqa: E402
25+
from cosalib.cmdlib import (
26+
load_json,
27+
retry_stop_long,
28+
retry_wait_long,
29+
retry_boto_exception,
30+
retry_callback
31+
)
2632

2733

2834
def main():
@@ -188,7 +194,8 @@ def s3_upload_build(s3_client, args, builddir, bucket, prefix):
188194
dry_run=args.dry_run)
189195

190196

191-
@retry(stop=retry_stop, retry=retry_boto_exception, before_sleep=retry_callback)
197+
@retry(stop=retry_stop_long, wait=retry_wait_long,
198+
retry=retry_boto_exception, before_sleep=retry_callback)
192199
def s3_check_exists(s3_client, bucket, key, dry_run=False):
193200
print(f"Checking if bucket '{bucket}' has key '{key}'")
194201
try:
@@ -205,7 +212,8 @@ def s3_check_exists(s3_client, bucket, key, dry_run=False):
205212
return True
206213

207214

208-
@retry(stop=retry_stop, retry=retry_boto_exception, retry_error_callback=retry_callback)
215+
@retry(stop=retry_stop_long, wait=retry_wait_long,
216+
retry=retry_boto_exception, retry_error_callback=retry_callback)
209217
def s3_copy(s3_client, src, bucket, key, max_age, acl, extra_args={}, dry_run=False):
210218
extra_args = dict(extra_args)
211219
if 'ContentType' not in extra_args:

src/cosalib/cmdlib.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from flufl.lock import Lock
2525

2626
from tenacity import (
27-
stop_after_delay, stop_after_attempt, retry_if_exception_type)
27+
stop_after_delay, stop_after_attempt, retry_if_exception_type, wait_exponential)
2828

2929
gi.require_version("RpmOstree", "1.0")
3030
from gi.repository import RpmOstree
@@ -39,6 +39,9 @@
3939
LOCK_DEFAULT_LIFETIME = datetime.timedelta(weeks=52)
4040

4141
retry_stop = (stop_after_delay(10) | stop_after_attempt(5))
42+
# for operations that want to be more persistent
43+
retry_stop_long = (stop_after_delay(60 * 5)) # 5 minutes
44+
retry_wait_long = (wait_exponential(max=10))
4245
retry_boto_exception = (retry_if_exception_type(ConnectionClosedError) |
4346
retry_if_exception_type(ConnectTimeoutError) |
4447
retry_if_exception_type(IncompleteReadError) |

0 commit comments

Comments (0)