449 changes: 391 additions & 58 deletions deepmd/dpmodel/utils/learning_rate.py

Large diffs are not rendered by default.
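Because this file's diff is collapsed, the actual implementation of the new schedule is not visible here. Judging only from how it is used later in this diff (constructed as BaseLR(**lr_params) with a num_steps entry, and queried through .start_lr and .value(step)), a minimal exponential-decay sketch might look like the following; the class name ExpDecayLRSketch, its defaults, and the decay formula are illustrative assumptions, not the code in this PR.

import numpy as np


class ExpDecayLRSketch:
    """Illustrative stand-in for a schedule used as BaseLR(**lr_params)."""

    def __init__(
        self,
        start_lr: float = 1e-3,
        stop_lr: float = 1e-8,
        decay_steps: int = 5000,
        num_steps: int = 1000000,
    ) -> None:
        self.start_lr = start_lr
        self.stop_lr = stop_lr
        self.decay_steps = decay_steps
        self.num_steps = num_steps
        # Choose the decay rate so that value(num_steps) lands on stop_lr.
        self.decay_rate = np.exp(
            np.log(stop_lr / start_lr) / (num_steps / decay_steps)
        )

    def value(self, step: int) -> float:
        """Learning rate at a given training step."""
        return self.start_lr * self.decay_rate ** (step // self.decay_steps)


lr = ExpDecayLRSketch(start_lr=1e-3, stop_lr=1e-8, decay_steps=500, num_steps=10000)
print(lr.value(0), lr.value(10000))  # 1e-3 at step 0, ~1e-8 at the last step

Presumably the warmup bookkeeping removed from the trainers below now lives inside this schedule, but that cannot be confirmed from the rendered diff.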

20 changes: 4 additions & 16 deletions deepmd/pd/train/training.py
@@ -243,7 +243,7 @@ def get_sample() -> dict[str, Any]:
return get_sample

def get_lr(lr_params: dict[str, Any]) -> BaseLR:
lr_params["stop_steps"] = self.num_steps - self.warmup_steps
lr_params["num_steps"] = self.num_steps
lr_schedule = BaseLR(**lr_params)
return lr_schedule

@@ -391,11 +391,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
)

# Learning rate
self.warmup_steps = training_params.get("warmup_steps", 0)
self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0)
assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, (
"Warm up steps must be less than total training steps!"
)
if self.multi_task and config.get("learning_rate_dict", None) is not None:
self.lr_exp = {}
for model_key in self.model_keys:
@@ -584,18 +580,13 @@ def single_model_finetune(

# TODO add lr warmups for multitask
# author: iProzd
def warm_up_linear(step: int, warmup_steps: int) -> float:
if step < warmup_steps:
return step / warmup_steps
else:
return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr

# TODO add optimizers for multitask
# author: iProzd
if self.opt_type == "Adam":
self.scheduler = paddle.optimizer.lr.LambdaDecay(
learning_rate=self.lr_exp.start_lr,
lr_lambda=lambda step: warm_up_linear(step, self.warmup_steps),
lr_lambda=lambda step: self.lr_exp.value(step + self.start_step)
/ self.lr_exp.start_lr,
)
self.optimizer = paddle.optimizer.Adam(
learning_rate=self.scheduler, parameters=self.wrapper.parameters()
@@ -759,10 +750,7 @@ def step(_step_id: int, task_key: str = "Default") -> None:
fout1.flush()
if self.opt_type == "Adam":
cur_lr = self.scheduler.get_lr()
if _step_id < self.warmup_steps:
pref_lr = _lr.start_lr
else:
pref_lr = cur_lr
pref_lr = cur_lr

# disable synchronization in forward-backward manually
# as derivatives exist in model forward
4 changes: 3 additions & 1 deletion deepmd/pd/utils/utils.py
@@ -34,6 +34,7 @@

from .env import (
DEVICE,
GLOBAL_NP_FLOAT_PRECISION,
)
from .env import PRECISION_DICT as PD_PRECISION_DICT

@@ -257,7 +258,8 @@ def to_numpy_array(
):
if xx is None:
return None
assert xx is not None
if isinstance(xx, (float, int)):
return np.array(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
# Create a reverse mapping of PD_PRECISION_DICT
Comment on lines 259 to 263
Contributor:

⚠️ Potential issue | 🟡 Minor

🧩 Analysis chain

🏁 Scripts executed against deepmodeling/deepmd-kit (outputs collapsed in the original view and omitted here):

find . -name "utils.py" -path "*/deepmd/pd/*" | head -20
cat -n deepmd/pd/utils/utils.py | sed -n '250,280p'
cat -n deepmd/pd/utils/utils.py | sed -n '259,300p'
rg -A 5 -B 5 "to_numpy_array" deepmd/pd --type py | head -60
fd -e py "test.*utils" deepmd/pd --exec rg "to_numpy_array" {} \;
find . -path "*/test*" -name "*utils*" -path "*/pd/*" -type f | grep -E "\.py$" | head -10
rg "to_numpy_array" . --type py -l | grep -i test
python3 -c "import numpy as np; scalar = np.float64(1.0); print('has dtype:', hasattr(scalar, 'dtype')); print('has numpy():', hasattr(scalar, 'numpy')); print('is np.generic:', isinstance(scalar, np.generic))"
cat -n source/tests/pd/test_utils.py | head -100
rg -A 10 -B 2 "to_numpy_array" source/tests/pd/test_utils.py
rg -A 15 "test_to_numpy" source/tests/pd/test_utils.py
cat -n source/tests/pd/test_utils.py | tail -20
rg "to_numpy_array.*np\." deepmd/pd --type py -A 2
rg "np\.generic" deepmd --type py
rg "isinstance.*np\." deepmd/pd --type py -A 1 | head -40
cat -n deepmd/pd/utils/utils.py | sed -n '1,50p'
rg "GLOBAL_NP_FLOAT_PRECISION" deepmd/pd/utils/utils.py -B 5 -A 5

Two standalone Python scripts were also executed to confirm that NumPy scalars (np.float64, np.float32, np.int32) are instances of np.generic, expose a dtype attribute but no numpy() method, and are handled cleanly by np.asarray(..., dtype=np.float64), whereas plain Python float and int are not np.generic and have no dtype attribute.


Handle NumPy scalar inputs in the scalar fast-path.
The isinstance check at line 261 excludes NumPy scalar types (e.g., np.float32, np.float64), so such inputs fall through to xx.numpy() at line 274 and raise an AttributeError. Add np.generic to the tuple to catch all NumPy scalar types.

🛠️ Suggested fix
-    if isinstance(xx, (float, int)):
-        return np.array(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
+    if isinstance(xx, (float, int, np.generic)):
+        return np.asarray(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if xx is None:
return None
assert xx is not None
if isinstance(xx, (float, int)):
return np.array(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
# Create a reverse mapping of PD_PRECISION_DICT
if xx is None:
return None
if isinstance(xx, (float, int, np.generic)):
return np.asarray(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
# Create a reverse mapping of PD_PRECISION_DICT
🤖 Prompt for AI Agents
In `deepmd/pd/utils/utils.py` around lines 259-263, the scalar fast-path currently checks isinstance(xx, (float, int)) and misses NumPy scalar types, causing attribute errors later when the code assumes tensor-like objects (e.g., xx.numpy()). Update the check to include NumPy scalars by adding np.generic to the isinstance tuple (i.e., isinstance(xx, (float, int, np.generic))) so that NumPy scalar inputs are converted to np.array(xx, dtype=GLOBAL_NP_FLOAT_PRECISION) early; ensure numpy is imported as np if it is not already, and keep the existing return behavior for None and numeric scalars.
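To make the failure mode concrete, here is a small self-contained sketch of the scalar fast-path with the suggested np.generic check. The function name to_numpy_array_sketch and the GLOBAL_NP_FLOAT_PRECISION stand-in are placeholders for illustration, not the actual helpers in deepmd/pd/utils/utils.py.

import numpy as np

# Stand-in for deepmd's GLOBAL_NP_FLOAT_PRECISION (assumed double-precision build).
GLOBAL_NP_FLOAT_PRECISION = np.float64


def to_numpy_array_sketch(xx):
    """Minimal sketch of the scalar fast-path discussed above."""
    if xx is None:
        return None
    # np.generic covers NumPy scalars such as np.float32(1.0) or np.int32(5),
    # which are not instances of the built-in float/int and would otherwise
    # fall through to tensor-only code (e.g. xx.numpy()) and raise AttributeError.
    if isinstance(xx, (float, int, np.generic)):
        return np.asarray(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
    raise TypeError(f"unsupported input type {type(xx)}")


print(to_numpy_array_sketch(1.5))              # Python float  -> array(1.5)
print(to_numpy_array_sketch(np.float32(2.5)))  # NumPy scalar  -> array(2.5)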

reverse_precision_dict = {v: k for k, v in PD_PRECISION_DICT.items()}
# Use the reverse mapping to find keys with the desired value
41 changes: 6 additions & 35 deletions deepmd/pt/train/training.py
@@ -279,7 +279,7 @@ def get_sample() -> Any:
return get_sample

def get_lr(lr_params: dict[str, Any]) -> BaseLR:
lr_params["stop_steps"] = self.num_steps - self.warmup_steps
lr_params["num_steps"] = self.num_steps
lr_schedule = BaseLR(**lr_params)
return lr_schedule

@@ -437,27 +437,7 @@ def get_lr(lr_params: dict[str, Any]) -> BaseLR:
)

# Learning rate
warmup_steps = training_params.get("warmup_steps", None)
warmup_ratio = training_params.get("warmup_ratio", None)
if warmup_steps is not None:
self.warmup_steps = warmup_steps
elif warmup_ratio is not None:
if not 0 <= warmup_ratio < 1:
raise ValueError(f"warmup_ratio must be in [0, 1), got {warmup_ratio}")
self.warmup_steps = int(warmup_ratio * self.num_steps)
if self.warmup_steps == 0 and warmup_ratio > 0:
log.warning(
f"warmup_ratio {warmup_ratio} results in 0 warmup steps "
f"due to truncation. Consider using a larger ratio or "
f"specify warmup_steps directly."
)
else:
self.warmup_steps = 0
self.warmup_start_factor = training_params.get("warmup_start_factor", 0.0)
self.gradient_max_norm = training_params.get("gradient_max_norm", 0.0)
assert self.num_steps - self.warmup_steps > 0 or self.warmup_steps == 0, (
"Warm up steps must be less than total training steps!"
)
if self.multi_task and config.get("learning_rate_dict", None) is not None:
self.lr_exp = {}
for model_key in self.model_keys:
@@ -702,14 +682,6 @@ def single_model_finetune(

# TODO add lr warmups for multitask
# author: iProzd
def warm_up_linear(step: int, warmup_steps: int) -> float:
if step < warmup_steps:
return self.warmup_start_factor + (1.0 - self.warmup_start_factor) * (
step / warmup_steps
)
else:
return self.lr_exp.value(step - warmup_steps) / self.lr_exp.start_lr

# TODO add optimizers for multitask
# author: iProzd
if self.opt_type in ["Adam", "AdamW"]:
@@ -730,7 +702,8 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
self.optimizer.load_state_dict(optimizer_state_dict)
self.scheduler = torch.optim.lr_scheduler.LambdaLR(
self.optimizer,
lambda step: warm_up_linear(step + self.start_step, self.warmup_steps),
lambda step: self.lr_exp.value(step + self.start_step)
/ self.lr_exp.start_lr,
)
elif self.opt_type == "LKF":
self.optimizer = LKFOptimizer(
@@ -768,7 +741,8 @@ def warm_up_linear(step: int, warmup_steps: int) -> float:
self.optimizer.load_state_dict(optimizer_state_dict)
self.scheduler = torch.optim.lr_scheduler.LambdaLR(
self.optimizer,
lambda step: warm_up_linear(step + self.start_step, self.warmup_steps),
lambda step: self.lr_exp.value(step + self.start_step)
/ self.lr_exp.start_lr,
)
else:
raise ValueError(f"Not supported optimizer type '{self.opt_type}'")
@@ -883,10 +857,7 @@ def step(_step_id: int, task_key: str = "Default") -> None:
fout1.flush()
if self.opt_type in ["Adam", "AdamW", "AdaMuon", "HybridMuon"]:
cur_lr = self.scheduler.get_last_lr()[0]
if _step_id < self.warmup_steps:
pref_lr = _lr.start_lr
else:
pref_lr = cur_lr
pref_lr = cur_lr
model_pred, loss, more_loss = self.wrapper(
**input_dict, cur_lr=pref_lr, label=label_dict, task_key=task_key
)
9 changes: 7 additions & 2 deletions deepmd/pt/utils/utils.py
@@ -16,6 +16,7 @@

from .env import (
DEVICE,
GLOBAL_NP_FLOAT_PRECISION,
)
from .env import PRECISION_DICT as PT_PRECISION_DICT

@@ -227,18 +228,22 @@ def to_numpy_array(xx: None) -> None: ...


def to_numpy_array(
xx: torch.Tensor | None,
xx: torch.Tensor | np.ndarray | float | None,
Member:
Need to update the overload method in line 223.
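For reference, a minimal sketch of what the updated overloads could look like, mirroring the widened runtime signature in this hunk; the exact overload set chosen in the repository may differ.

from __future__ import annotations

from typing import overload

import numpy as np
import torch


@overload
def to_numpy_array(xx: None) -> None: ...
@overload
def to_numpy_array(xx: torch.Tensor | np.ndarray | float) -> np.ndarray: ...


def to_numpy_array(xx):
    # Runtime implementation elided; see the actual function body in this diff.
    ...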

) -> np.ndarray | None:
if xx is None:
return None
assert xx is not None
if isinstance(xx, (float, int)):
return np.array(xx, dtype=GLOBAL_NP_FLOAT_PRECISION)
if isinstance(xx, np.ndarray):
return xx.astype(GLOBAL_NP_FLOAT_PRECISION)
# Create a reverse mapping of PT_PRECISION_DICT
reverse_precision_dict = {v: k for k, v in PT_PRECISION_DICT.items()}
# Use the reverse mapping to find keys with the desired value
prec = reverse_precision_dict.get(xx.dtype, None)
prec = NP_PRECISION_DICT.get(prec, None)
if prec is None:
raise ValueError(f"unknown precision {xx.dtype}")
assert isinstance(xx, torch.Tensor)
if xx.dtype == torch.bfloat16:
# https://github.com/pytorch/pytorch/issues/109873
xx = xx.float()
2 changes: 1 addition & 1 deletion deepmd/tf/fit/dipole.py
@@ -388,7 +388,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
the learning rate

Returns
2 changes: 1 addition & 1 deletion deepmd/tf/fit/dos.py
@@ -655,7 +655,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
the learning rate

Returns
2 changes: 1 addition & 1 deletion deepmd/tf/fit/ener.py
@@ -856,7 +856,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
----------
loss : dict
The loss function parameters.
lr : LearningRateExp
lr : LearningRateSchedule
The learning rate.

Returns
2 changes: 1 addition & 1 deletion deepmd/tf/fit/fitting.py
@@ -73,7 +73,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
the learning rate

Returns
2 changes: 1 addition & 1 deletion deepmd/tf/fit/polar.py
@@ -863,7 +863,7 @@ def get_loss(self, loss: dict, lr) -> Loss:
----------
loss : dict
the loss dict
lr : LearningRateExp
lr : LearningRateSchedule
the learning rate

Returns
52 changes: 32 additions & 20 deletions deepmd/tf/train/trainer.py
@@ -4,6 +4,9 @@
import os
import shutil
import time
from typing import (
Any,
)

import google.protobuf.message
import numpy as np
@@ -52,7 +55,7 @@
load_graph_def,
)
from deepmd.tf.utils.learning_rate import (
LearningRateExp,
LearningRateSchedule,
)
from deepmd.tf.utils.sess import (
run_sess,
@@ -100,21 +103,18 @@ def _init_param(self, jdata) -> None:
self.model = Model(**model_param)
self.fitting = self.model.get_fitting()

def get_lr_and_coef(lr_param):
def get_lr_and_coef(
lr_param: dict[str, Any],
) -> tuple[LearningRateSchedule, float]:
scale_by_worker = lr_param.get("scale_by_worker", "linear")
if scale_by_worker == "linear":
scale_lr_coef = float(self.run_opt.world_size)
elif scale_by_worker == "sqrt":
scale_lr_coef = np.sqrt(self.run_opt.world_size).real
else:
scale_lr_coef = 1.0
lr_type = lr_param.get("type", "exp")
if lr_type == "exp":
lr = LearningRateExp(
lr_param["start_lr"], lr_param["stop_lr"], lr_param["decay_steps"]
)
else:
raise RuntimeError("unknown learning_rate type " + lr_type)
lr_params = {k: v for k, v in lr_param.items() if k != "scale_by_worker"}
lr = LearningRateSchedule(lr_params)
return lr, scale_lr_coef

# learning rate
@@ -242,8 +242,13 @@ def build(self, data=None, stop_batch=0, origin_type_map=None, suffix="") -> Non
def _build_lr(self) -> None:
self._extra_train_ops = []
self.global_step = tf.train.get_or_create_global_step()
self.learning_rate = self.lr.build(self.global_step, self.stop_batch)
log.info("built lr")
if self.stop_batch == 0:
# Use constant start_lr when stop_batch is zero (no training)
self.learning_rate = tf.cast(self.lr.start_lr(), GLOBAL_TF_FLOAT_PRECISION)
log.info("built lr (constant start_lr for stop_batch=0)")
else:
self.learning_rate = self.lr.build(self.global_step, self.stop_batch)
log.info("built lr")
Comment on lines +245 to +251
Member:
Why does it need an if...else block?


def _build_loss(self):
if self.stop_batch == 0:
@@ -426,14 +431,21 @@ def train(self, train_data=None, valid_data=None) -> None:
elapsed_batch = stop_batch - start_batch
is_first_step = True
self.cur_batch = cur_batch
log.info(
"start training at lr %.2e (== %.2e), decay_step %d, decay_rate %f, final lr will be %.2e",
run_sess(self.sess, self.learning_rate),
self.lr.value(cur_batch),
self.lr.decay_steps_,
self.lr.decay_rate_,
self.lr.value(stop_batch),
)
if stop_batch == 0:
lr0 = self.lr.start_lr()
log.info(
"start training at lr %.2e (== %.2e), final lr will be %.2e",
run_sess(self.sess, self.learning_rate),
lr0,
lr0,
)
else:
log.info(
"start training at lr %.2e (== %.2e), final lr will be %.2e",
run_sess(self.sess, self.learning_rate),
self.lr.value(cur_batch),
self.lr.value(stop_batch),
)
Comment on lines +434 to +448
Member:
Why does it need an if...else block?


prf_options = None
prf_run_metadata = None
@@ -797,7 +809,7 @@ def _get_place_holders(self, data_dict) -> None:
prec = GLOBAL_ENER_FLOAT_PRECISION
self.place_holders[kk] = tf.placeholder(prec, [None], name="t_" + kk)
self.place_holders["find_" + kk] = tf.placeholder(
tf.float32, name="t_find_" + kk
GLOBAL_TF_FLOAT_PRECISION, name="t_find_" + kk
Member:
What is the reason for making this change in this PR?

)

def _init_from_frz_model(self) -> None:
4 changes: 2 additions & 2 deletions deepmd/tf/utils/__init__.py
@@ -7,7 +7,7 @@
DeepmdDataSystem,
)
from .learning_rate import (
LearningRateExp,
LearningRateSchedule,
)
from .pair_tab import (
PairTab,
@@ -20,7 +20,7 @@
__all__ = [
"DeepmdData",
"DeepmdDataSystem",
"LearningRateExp",
"LearningRateSchedule",
"PairTab",
"Plugin",
"PluginVariant",