diff --git a/metaflow/metaflow_config.py b/metaflow/metaflow_config.py
index 415a934cbe4..9bb44d2b871 100644
--- a/metaflow/metaflow_config.py
+++ b/metaflow/metaflow_config.py
@@ -507,7 +507,7 @@
 #
 # Note also that DataStoreSet resolves the latest attempt_id using
 # lexicographic ordering of attempts. This won't work if MAX_ATTEMPTS > 99.
-MAX_ATTEMPTS = 6
+MAX_ATTEMPTS = from_conf("MAX_ATTEMPTS", 6)
 
 
 # Feature flag (experimental features that are *explicitly* unsupported)
diff --git a/metaflow/plugins/airflow/airflow.py b/metaflow/plugins/airflow/airflow.py
index 304fa9f3bd9..4c20c7f9531 100644
--- a/metaflow/plugins/airflow/airflow.py
+++ b/metaflow/plugins/airflow/airflow.py
@@ -33,6 +33,7 @@
     SERVICE_HEADERS,
     SERVICE_INTERNAL_URL,
     AZURE_KEY_VAULT_PREFIX,
+    MAX_ATTEMPTS,
 )
 from metaflow.metaflow_config_funcs import config_values
 
@@ -385,6 +386,7 @@ def _to_job(self, node):
             "METAFLOW_KUBERNETES_WORKLOAD": str(
                 1
             ),  # This is used by kubernetes decorator.
+            "METAFLOW_MAX_ATTEMPTS": MAX_ATTEMPTS,
             "METAFLOW_RUNTIME_ENVIRONMENT": "kubernetes",
             "METAFLOW_CARD_S3ROOT": CARD_S3ROOT,
             "METAFLOW_RUN_ID": AIRFLOW_MACROS.RUN_ID,
diff --git a/metaflow/plugins/argo/argo_workflows.py b/metaflow/plugins/argo/argo_workflows.py
index 336d880da0c..d9978c66d72 100644
--- a/metaflow/plugins/argo/argo_workflows.py
+++ b/metaflow/plugins/argo/argo_workflows.py
@@ -46,6 +46,7 @@
     SERVICE_HEADERS,
     SERVICE_INTERNAL_URL,
     UI_URL,
+    MAX_ATTEMPTS,
 )
 from metaflow.metaflow_config_funcs import config_values
 from metaflow.mflog import BASH_SAVE_LOGS, bash_capture_logs, export_mflog_env_vars
@@ -1733,6 +1734,7 @@ def _container_templates(self):
                         "METAFLOW_KUBERNETES_FETCH_EC2_METADATA": KUBERNETES_FETCH_EC2_METADATA,
                         "METAFLOW_RUNTIME_ENVIRONMENT": "kubernetes",
                         "METAFLOW_OWNER": self.username,
+                        "METAFLOW_MAX_ATTEMPTS": MAX_ATTEMPTS,
                     },
                     **{
                         # Configuration for Argo Events. Keep these in sync with the
diff --git a/metaflow/plugins/aws/batch/batch.py b/metaflow/plugins/aws/batch/batch.py
index 16ce9a06cef..f6dbf53ef35 100644
--- a/metaflow/plugins/aws/batch/batch.py
+++ b/metaflow/plugins/aws/batch/batch.py
@@ -23,6 +23,7 @@
     DEFAULT_SECRETS_BACKEND_TYPE,
     AWS_SECRETS_MANAGER_DEFAULT_REGION,
     S3_SERVER_SIDE_ENCRYPTION,
+    MAX_ATTEMPTS,
 )
 from metaflow.metaflow_config_funcs import config_values
 
@@ -264,6 +265,7 @@ def create_job(
             .environment_variable("METAFLOW_CARD_S3ROOT", CARD_S3ROOT)
             .environment_variable("METAFLOW_OTEL_ENDPOINT", OTEL_ENDPOINT)
             .environment_variable("METAFLOW_RUNTIME_ENVIRONMENT", "aws-batch")
+            .environment_variable("METAFLOW_MAX_ATTEMPTS", MAX_ATTEMPTS)
         )
 
         # Temporary passing of *some* environment variables. Do not rely on this
diff --git a/metaflow/plugins/kubernetes/kubernetes.py b/metaflow/plugins/kubernetes/kubernetes.py
index 6625047395a..71eb7d79cdc 100644
--- a/metaflow/plugins/kubernetes/kubernetes.py
+++ b/metaflow/plugins/kubernetes/kubernetes.py
@@ -39,6 +39,7 @@
     SERVICE_HEADERS,
     KUBERNETES_SECRETS,
     SERVICE_INTERNAL_URL,
+    MAX_ATTEMPTS,
 )
 from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
 from metaflow.metaflow_config_funcs import config_values
@@ -299,6 +300,7 @@ def create_jobset(
             # assumes metadata is stored in DATASTORE_LOCAL_DIR on the Kubernetes
             # pod; this happens when METAFLOW_DATASTORE_SYSROOT_LOCAL is NOT set (
             # see get_datastore_root_from_config in datastore/local.py).
+            .environment_variable("METAFLOW_MAX_ATTEMPTS", MAX_ATTEMPTS)
         )
 
         for k in list(
@@ -602,6 +604,7 @@ def create_job_object(
             # assumes metadata is stored in DATASTORE_LOCAL_DIR on the Kubernetes
             # pod; this happens when METAFLOW_DATASTORE_SYSROOT_LOCAL is NOT set (
             # see get_datastore_root_from_config in datastore/local.py).
+            .environment_variable("METAFLOW_MAX_ATTEMPTS", MAX_ATTEMPTS)
         )
 
         # Temporary passing of *some* environment variables. Do not rely on this