From 6b4ca59e88a6957f0bc17cb7d3380d96f1277ae5 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Thu, 29 Jan 2026 23:39:06 +0800 Subject: [PATCH 1/5] fix(pt/pd): fix incompatibility between AutoBatchSize and eval hooks --- deepmd/pd/infer/deep_eval.py | 8 ++++++ deepmd/pt/infer/deep_eval.py | 8 ++++++ deepmd/utils/batch_size.py | 50 ++++++++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+) diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py index 6c0ffed7ec..384869c2a7 100644 --- a/deepmd/pd/infer/deep_eval.py +++ b/deepmd/pd/infer/deep_eval.py @@ -823,6 +823,8 @@ def eval_descriptor( model = ( self.dp.model["Default"] if isinstance(self.dp, ModelWrapper) else self.dp ) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_descriptor_hook(True) self.eval( coords, @@ -835,6 +837,8 @@ def eval_descriptor( ) descriptor = model.eval_descriptor() model.set_eval_descriptor_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(descriptor) def eval_fitting_last_layer( @@ -878,6 +882,8 @@ def eval_fitting_last_layer( Fitting output before last layer. """ model = self.dp.model["Default"] + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_fitting_last_layer_hook(True) self.eval( coords, @@ -890,4 +896,6 @@ def eval_fitting_last_layer( ) fitting_net = model.eval_fitting_last_layer() model.set_eval_fitting_last_layer_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(fitting_net) diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py index 6e63ecb2fc..50fce3ccd2 100644 --- a/deepmd/pt/infer/deep_eval.py +++ b/deepmd/pt/infer/deep_eval.py @@ -793,6 +793,8 @@ def eval_descriptor( Descriptors. """ model = self.dp.model["Default"] + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_descriptor_hook(True) self.eval( coords, @@ -805,6 +807,8 @@ def eval_descriptor( ) descriptor = model.eval_descriptor() model.set_eval_descriptor_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(descriptor) def eval_fitting_last_layer( @@ -848,6 +852,8 @@ def eval_fitting_last_layer( Fitting output before last layer. """ model = self.dp.model["Default"] + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_fitting_last_layer_hook(True) self.eval( coords, @@ -860,4 +866,6 @@ def eval_fitting_last_layer( ) fitting_net = model.eval_fitting_last_layer() model.set_eval_fitting_last_layer_hook(False) + if self.auto_batch_size is not None: + self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(fitting_net) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index e701e82ec6..289b8b83a8 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -22,6 +22,39 @@ log = logging.getLogger(__name__) + +class RetrySignal(Exception): + """Signal to retry execution after OOM error.""" + + +# originally copied from dpdispatcher +# https://github.com/deepmodeling/dpdispatcher/blob/9a76542311a02e84c4ae62f15b7edcd30850a64e/dpdispatcher/utils/utils.py#L161-L213 +# license: LGPL-3.0-or-later +def retry(func: Any) -> Callable: + """Decorator to retry the function until it succeeds or fails for certain times. 
+ + Returns + ------- + wrapper: Callable + The wrapper. + + Examples + -------- + >>> @retry + ... def func(): + ... raise RetrySignal("Failed") + """ + + def wrapper(*args: Any, **kwargs: Any) -> Any: + while True: + try: + return func(*args, **kwargs) + except RetrySignal: + log.info("Retry the entire method") + + return wrapper + + class AutoBatchSize(ABC): """This class allows DeePMD-kit to automatically decide the maximum batch size that will not cause an OOM error. @@ -75,6 +108,7 @@ def __init__(self, initial_batch_size: int = 1024, factor: float = 2.0) -> None: ) self.factor = factor + self.oom_retry_mode = False def execute( self, callable: Callable, start_index: int, natoms: int @@ -125,6 +159,8 @@ def execute( ) from e # adjust the next batch size self._adjust_batch_size(1.0 / self.factor) + if self.set_oom_retry_mode: + raise RetrySignal from e return 0, None else: n_tot = n_batch * natoms @@ -147,6 +183,7 @@ def _adjust_batch_size(self, factor: float) -> None: f"Adjust batch size from {old_batch_size} to {self.current_batch_size}" ) + @retry def execute_all( self, callable: Callable, @@ -281,3 +318,16 @@ def is_oom_error(self, e: Exception) -> bool: bool True if the exception is an OOM error """ + + def set_oom_retry_mode(self, enable: bool) -> None: + """Set OOM retry mode. + + In OOM retry mode, all data will be re-executed. + + Parameters + ---------- + enable : bool + True to enable OOM retry mode + """ + self.oom_retry_mode = enable + From 404d1ac71d49a339ebfaadd372d14973580ca92e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Jan 2026 15:42:01 +0000 Subject: [PATCH 2/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/utils/batch_size.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index 289b8b83a8..293f6aa92a 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -22,7 +22,6 @@ log = logging.getLogger(__name__) - class RetrySignal(Exception): """Signal to retry execution after OOM error.""" @@ -330,4 +329,3 @@ def set_oom_retry_mode(self, enable: bool) -> None: True to enable OOM retry mode """ self.oom_retry_mode = enable - From 01de666bee46632fa57bb66c006cbf388ef155b6 Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 30 Jan 2026 00:19:48 +0800 Subject: [PATCH 3/5] apply Copilot's suggestions --- deepmd/pd/infer/deep_eval.py | 75 +++++++++++++++++++++++------------- deepmd/pt/infer/deep_eval.py | 75 +++++++++++++++++++++++------------- deepmd/utils/batch_size.py | 30 +-------------- 3 files changed, 99 insertions(+), 81 deletions(-) diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py index 384869c2a7..10973e254a 100644 --- a/deepmd/pd/infer/deep_eval.py +++ b/deepmd/pd/infer/deep_eval.py @@ -10,6 +10,7 @@ ) import numpy as np +from deepmd.utils.batch_size import RetrySignal import paddle from paddle import inference as paddle_inference @@ -826,19 +827,30 @@ def eval_descriptor( if self.auto_batch_size is not None: self.auto_batch_size.set_oom_retry_mode(True) model.set_eval_descriptor_hook(True) - self.eval( - coords, - cells, - atom_types, - atomic=False, - fparam=fparam, - aparam=aparam, - **kwargs, - ) - descriptor = model.eval_descriptor() - model.set_eval_descriptor_hook(False) - if self.auto_batch_size is not None: - self.auto_batch_size.set_oom_retry_mode(False) + try: + self.eval( + coords, + cells, + atom_types, + 
atomic=False,
+                fparam=fparam,
+                aparam=aparam,
+                **kwargs,
+            )
+        except RetrySignal:
+            return self.eval_descriptor(
+                coords,
+                cells,
+                atom_types,
+                fparam=fparam,
+                aparam=aparam,
+                **kwargs,
+            )
+        finally:
+            descriptor = model.eval_descriptor()
+            model.set_eval_descriptor_hook(False)
+            if self.auto_batch_size is not None:
+                self.auto_batch_size.set_oom_retry_mode(False)
         return to_numpy_array(descriptor)
 
     def eval_fitting_last_layer(
@@ -885,17 +897,28 @@ def eval_fitting_last_layer(
         if self.auto_batch_size is not None:
             self.auto_batch_size.set_oom_retry_mode(True)
         model.set_eval_fitting_last_layer_hook(True)
-        self.eval(
-            coords,
-            cells,
-            atom_types,
-            atomic=False,
-            fparam=fparam,
-            aparam=aparam,
-            **kwargs,
-        )
-        fitting_net = model.eval_fitting_last_layer()
-        model.set_eval_fitting_last_layer_hook(False)
-        if self.auto_batch_size is not None:
-            self.auto_batch_size.set_oom_retry_mode(False)
+        try:
+            self.eval(
+                coords,
+                cells,
+                atom_types,
+                atomic=False,
+                fparam=fparam,
+                aparam=aparam,
+                **kwargs,
+            )
+        except RetrySignal:
+            return self.eval_fitting_last_layer(
+                coords,
+                cells,
+                atom_types,
+                fparam=fparam,
+                aparam=aparam,
+                **kwargs,
+            )
+        finally:
+            fitting_net = model.eval_fitting_last_layer()
+            model.set_eval_fitting_last_layer_hook(False)
+            if self.auto_batch_size is not None:
+                self.auto_batch_size.set_oom_retry_mode(False)
         return to_numpy_array(fitting_net)
diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py
index 50fce3ccd2..1895919732 100644
--- a/deepmd/pt/infer/deep_eval.py
+++ b/deepmd/pt/infer/deep_eval.py
@@ -67,6 +67,7 @@
     to_numpy_array,
     to_torch_tensor,
 )
+from deepmd.utils.batch_size import RetrySignal
 from deepmd.utils.econf_embd import (
     sort_element_type,
 )
@@ -796,19 +797,30 @@ def eval_descriptor(
         if self.auto_batch_size is not None:
             self.auto_batch_size.set_oom_retry_mode(True)
         model.set_eval_descriptor_hook(True)
-        self.eval(
-            coords,
-            cells,
-            atom_types,
-            atomic=False,
-            fparam=fparam,
-            aparam=aparam,
-            **kwargs,
-        )
-        descriptor = model.eval_descriptor()
-        model.set_eval_descriptor_hook(False)
-        if self.auto_batch_size is not None:
-            self.auto_batch_size.set_oom_retry_mode(False)
+        try:
+            self.eval(
+                coords,
+                cells,
+                atom_types,
+                atomic=False,
+                fparam=fparam,
+                aparam=aparam,
+                **kwargs,
+            )
+        except RetrySignal:
+            return self.eval_descriptor(
+                coords,
+                cells,
+                atom_types,
+                fparam=fparam,
+                aparam=aparam,
+                **kwargs,
+            )
+        finally:
+            descriptor = model.eval_descriptor()
+            model.set_eval_descriptor_hook(False)
+            if self.auto_batch_size is not None:
+                self.auto_batch_size.set_oom_retry_mode(False)
         return to_numpy_array(descriptor)
 
     def eval_fitting_last_layer(
@@ -855,17 +867,28 @@ def eval_fitting_last_layer(
         if self.auto_batch_size is not None:
             self.auto_batch_size.set_oom_retry_mode(True)
         model.set_eval_fitting_last_layer_hook(True)
-        self.eval(
-            coords,
-            cells,
-            atom_types,
-            atomic=False,
-            fparam=fparam,
-            aparam=aparam,
-            **kwargs,
-        )
-        fitting_net = model.eval_fitting_last_layer()
-        model.set_eval_fitting_last_layer_hook(False)
-        if self.auto_batch_size is not None:
-            self.auto_batch_size.set_oom_retry_mode(False)
+        try:
+            self.eval(
+                coords,
+                cells,
+                atom_types,
+                atomic=False,
+                fparam=fparam,
+                aparam=aparam,
+                **kwargs,
+            )
+        except RetrySignal:
+            return self.eval_fitting_last_layer(
+                coords,
+                cells,
+                atom_types,
+                fparam=fparam,
+                aparam=aparam,
+                **kwargs,
+            )
+        finally:
+            fitting_net = model.eval_fitting_last_layer()
+            model.set_eval_fitting_last_layer_hook(False)
+            if self.auto_batch_size is not None:
+ self.auto_batch_size.set_oom_retry_mode(False) return to_numpy_array(fitting_net) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index 289b8b83a8..f8f96bc1ef 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -27,34 +27,6 @@ class RetrySignal(Exception): """Signal to retry execution after OOM error.""" -# originally copied from dpdispatcher -# https://github.com/deepmodeling/dpdispatcher/blob/9a76542311a02e84c4ae62f15b7edcd30850a64e/dpdispatcher/utils/utils.py#L161-L213 -# license: LGPL-3.0-or-later -def retry(func: Any) -> Callable: - """Decorator to retry the function until it succeeds or fails for certain times. - - Returns - ------- - wrapper: Callable - The wrapper. - - Examples - -------- - >>> @retry - ... def func(): - ... raise RetrySignal("Failed") - """ - - def wrapper(*args: Any, **kwargs: Any) -> Any: - while True: - try: - return func(*args, **kwargs) - except RetrySignal: - log.info("Retry the entire method") - - return wrapper - - class AutoBatchSize(ABC): """This class allows DeePMD-kit to automatically decide the maximum batch size that will not cause an OOM error. @@ -159,7 +131,7 @@ def execute( ) from e # adjust the next batch size self._adjust_batch_size(1.0 / self.factor) - if self.set_oom_retry_mode: + if self.oom_retry_mode: raise RetrySignal from e return 0, None else: From 72c4e367af614d9f5eaad52e0a1a5444ee0fda1f Mon Sep 17 00:00:00 2001 From: Jinzhe Zeng Date: Fri, 30 Jan 2026 00:21:03 +0800 Subject: [PATCH 4/5] rm retry --- deepmd/utils/batch_size.py | 1 - 1 file changed, 1 deletion(-) diff --git a/deepmd/utils/batch_size.py b/deepmd/utils/batch_size.py index 57dbd34585..82de03695c 100644 --- a/deepmd/utils/batch_size.py +++ b/deepmd/utils/batch_size.py @@ -154,7 +154,6 @@ def _adjust_batch_size(self, factor: float) -> None: f"Adjust batch size from {old_batch_size} to {self.current_batch_size}" ) - @retry def execute_all( self, callable: Callable, From f785d619397efa24b6fb1b3ed38252322edce81c Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 29 Jan 2026 16:22:15 +0000 Subject: [PATCH 5/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- deepmd/pd/infer/deep_eval.py | 4 +++- deepmd/pt/infer/deep_eval.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/deepmd/pd/infer/deep_eval.py b/deepmd/pd/infer/deep_eval.py index 10973e254a..2465a2b0de 100644 --- a/deepmd/pd/infer/deep_eval.py +++ b/deepmd/pd/infer/deep_eval.py @@ -10,7 +10,6 @@ ) import numpy as np -from deepmd.utils.batch_size import RetrySignal import paddle from paddle import inference as paddle_inference @@ -65,6 +64,9 @@ to_numpy_array, to_paddle_tensor, ) +from deepmd.utils.batch_size import ( + RetrySignal, +) from deepmd.utils.econf_embd import ( sort_element_type, ) diff --git a/deepmd/pt/infer/deep_eval.py b/deepmd/pt/infer/deep_eval.py index 1895919732..be90fcea78 100644 --- a/deepmd/pt/infer/deep_eval.py +++ b/deepmd/pt/infer/deep_eval.py @@ -67,7 +67,9 @@ to_numpy_array, to_torch_tensor, ) -from deepmd.utils.batch_size import RetrySignal +from deepmd.utils.batch_size import ( + RetrySignal, +) from deepmd.utils.econf_embd import ( sort_element_type, )
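
The following is a minimal, self-contained sketch (not DeePMD-kit code) of the control flow the patches above introduce. ToyAutoBatchSize, toy_model, and eval_with_hook are hypothetical stand-ins for AutoBatchSize, the model evaluation, and the hook-based eval_descriptor/eval_fitting_last_layer methods. The point it illustrates: when an eval hook is caching per-batch outputs, an OOM in the middle of execute_all cannot simply shrink the batch and continue, because the cache would then hold results for only part of the frames; with OOM retry mode enabled, the OOM is surfaced as a RetrySignal so the caller can clear the hook state and redo the whole evaluation at the reduced batch size.

class RetrySignal(Exception):
    """Signal that the whole evaluation should be redone after an OOM error."""


class ToyAutoBatchSize:
    def __init__(self, initial_batch_size=4, factor=2.0):
        self.current_batch_size = initial_batch_size
        self.factor = factor
        self.oom_retry_mode = False

    def set_oom_retry_mode(self, enable):
        self.oom_retry_mode = enable

    def execute_all(self, fn, data):
        results = []
        i = 0
        while i < len(data):
            batch = data[i : i + self.current_batch_size]
            try:
                results.extend(fn(batch))
            except MemoryError:
                # Shrink the batch size on a (simulated) OOM.
                if len(batch) == 1:
                    raise  # cannot shrink further
                self.current_batch_size = max(
                    1, int(self.current_batch_size / self.factor)
                )
                if self.oom_retry_mode:
                    # A hook is active: partial per-batch results are unusable,
                    # so ask the caller to redo everything.
                    raise RetrySignal("OOM while an eval hook is active")
                continue  # no hook: just redo this slice with a smaller batch
            i += len(batch)
        return results


def toy_model(batch, cache):
    """Pretend evaluator whose 'hook' caches per-batch outputs."""
    if len(batch) > 2 and batch[0] >= 4:
        # Simulate an OOM that only shows up partway through the data.
        raise MemoryError
    out = [x * x for x in batch]
    cache.extend(out)  # the eval hook: accumulate per-batch outputs
    return out


def eval_with_hook(batch_sizer, data, cache):
    """Evaluate with the hook enabled, restarting from scratch on RetrySignal."""
    batch_sizer.set_oom_retry_mode(True)
    try:
        return batch_sizer.execute_all(lambda b: toy_model(b, cache), data)
    except RetrySignal:
        cache.clear()  # drop the partially filled hook cache
        return eval_with_hook(batch_sizer, data, cache)
    finally:
        batch_sizer.set_oom_retry_mode(False)


if __name__ == "__main__":
    cache = []
    print(eval_with_hook(ToyAutoBatchSize(), list(range(10)), cache))
    print(cache)  # exactly one entry per frame, despite the mid-run OOM

In the patches themselves the same restart is achieved by catching RetrySignal inside eval_descriptor and eval_fitting_last_layer and calling the method again recursively, with the finally block resetting the hook and the OOM retry mode.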