Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion src/transformers/models/sapiens2/modeling_sapiens2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1151,6 +1151,7 @@ def forward(
pixel_values: torch.FloatTensor,
flip_pairs: torch.Tensor | None = None,
labels: torch.FloatTensor | None = None,
target_weights: torch.FloatTensor | None = None,
**kwargs: Unpack[TransformersKwargs],
Comment thread
guarin marked this conversation as resolved.
Outdated
) -> Sapiens2PoseEstimatorOutput:
r"""
Expand All @@ -1161,6 +1162,9 @@ def forward(
original orientation.
labels (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`, *optional*):
Heatmap ground truth for computing the loss.
target_weights (`torch.FloatTensor` of shape `(batch_size, num_keypoints)` or `(batch_size, num_keypoints, height, width)`, *optional*):
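Loss weights given per keypoint (2D) or per heatmap pixel (4D); 2D weights are broadcast over height and width. Set the weight to 0.0 for occluded or
invisible keypoints so that they do not contribute to the loss. The weighted squared error is averaged over all heatmap elements, including zero-weighted ones. If `None`, the standard unweighted MSE loss is computed.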

Example:

Expand Down Expand Up @@ -1200,7 +1204,29 @@ def forward(

loss = None
if labels is not None:
raise NotImplementedError("Training is not yet supported")
if labels.shape != heatmaps.shape:
raise ValueError(f"Expected labels shape {heatmaps.shape}, got {labels.shape}")

if target_weights is None:
loss = torch.nn.functional.mse_loss(heatmaps, labels)
else:
if target_weights.ndim not in (2, 4):
raise ValueError(f"Expected target_weights to have 2 or 4 dimensions, got {target_weights.ndim}")

if target_weights.shape != labels.shape[: target_weights.ndim]:
raise ValueError(
f"Expected target_weights shape to match {labels.shape[: target_weights.ndim]}, "
f"got {target_weights.shape}"
)

per_pixel_loss = torch.nn.functional.mse_loss(heatmaps, labels, reduction="none")

ndim_pad = labels.ndim - target_weights.ndim
mask = target_weights.view(target_weights.shape + (1,) * ndim_pad)

mask = mask.to(heatmaps.dtype)

loss = (per_pixel_loss * mask).mean()
Comment thread
guarin marked this conversation as resolved.
Outdated

return Sapiens2PoseEstimatorOutput(
loss=loss,
Expand Down
28 changes: 27 additions & 1 deletion src/transformers/models/sapiens2/modular_sapiens2.py
Original file line number Diff line number Diff line change
Expand Up @@ -1661,6 +1661,7 @@ def forward(
pixel_values: torch.FloatTensor,
flip_pairs: torch.Tensor | None = None,
labels: torch.FloatTensor | None = None,
target_weights: torch.FloatTensor | None = None,
**kwargs: Unpack[TransformersKwargs],
) -> Sapiens2PoseEstimatorOutput:
r"""
Expand All @@ -1671,6 +1672,9 @@ def forward(
original orientation.
labels (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`, *optional*):
Heatmap ground truth for computing the loss.
target_weights (`torch.FloatTensor` of shape `(batch_size, num_keypoints)` or `(batch_size, num_keypoints, height, width)`, *optional*):
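Loss weights given per keypoint (2D) or per heatmap pixel (4D); 2D weights are broadcast over height and width. Set the weight to 0.0 for occluded or
invisible keypoints so that they do not contribute to the loss. The weighted squared error is averaged over all heatmap elements, including zero-weighted ones. If `None`, the standard unweighted MSE loss is computed.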

Example:

Expand Down Expand Up @@ -1710,7 +1714,29 @@ def forward(

loss = None
if labels is not None:
raise NotImplementedError("Training is not yet supported")
if labels.shape != heatmaps.shape:
raise ValueError(f"Expected labels shape {heatmaps.shape}, got {labels.shape}")

if target_weights is None:
loss = torch.nn.functional.mse_loss(heatmaps, labels)
else:
if target_weights.ndim not in (2, 4):
raise ValueError(f"Expected target_weights to have 2 or 4 dimensions, got {target_weights.ndim}")

if target_weights.shape != labels.shape[: target_weights.ndim]:
raise ValueError(
f"Expected target_weights shape to match {labels.shape[: target_weights.ndim]}, "
f"got {target_weights.shape}"
)

per_pixel_loss = torch.nn.functional.mse_loss(heatmaps, labels, reduction="none")

ndim_pad = labels.ndim - target_weights.ndim
mask = target_weights.view(target_weights.shape + (1,) * ndim_pad)

mask = mask.to(heatmaps.dtype)

loss = (per_pixel_loss * mask).mean()

return Sapiens2PoseEstimatorOutput(
loss=loss,
Expand Down
28 changes: 28 additions & 0 deletions tests/models/sapiens2/test_modeling_sapiens2.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,15 +187,43 @@ def create_and_check_for_pose_estimation(self, config, pixel_values, labels):
model = Sapiens2ForPoseEstimation(config)
model.to(torch_device)
model.eval()

with torch.no_grad():
result = model(pixel_values)

patch_height = self.image_size // self.patch_size
expected_h = patch_height * (2 ** len(config.head_config.upsample_out_channels))

self.parent.assertEqual(
result.heatmaps.shape,
(self.batch_size, config.num_labels, expected_h, expected_h),
)

pose_labels = torch.randn_like(result.heatmaps)

with torch.no_grad():
result_with_loss = model(
pixel_values,
labels=pose_labels,
)
Comment thread
guarin marked this conversation as resolved.
Outdated

self.parent.assertIsNotNone(result_with_loss.loss)

target_weights = torch.ones(
self.batch_size,
config.num_labels,
device=pixel_values.device,
)

with torch.no_grad():
result_with_weights = model(
pixel_values,
labels=pose_labels,
target_weights=target_weights,
)

self.parent.assertIsNotNone(result_with_weights.loss)

def create_and_check_for_normal_estimation(self, config, pixel_values, labels):
model = Sapiens2ForNormalEstimation(config)
model.to(torch_device)
Expand Down
Loading