huggingface · Sainava · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026 · Jun 19, 2026
diff --git a/src/transformers/models/sapiens2/modeling_sapiens2.py b/src/transformers/models/sapiens2/modeling_sapiens2.py
@@ -1151,6 +1151,7 @@ def forward(
         pixel_values: torch.FloatTensor,
         flip_pairs: torch.Tensor | None = None,
         labels: torch.FloatTensor | None = None,
+        target_weights: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> Sapiens2PoseEstimatorOutput:
         r"""
@@ -1161,6 +1162,9 @@ def forward(
             original orientation.
         labels (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`, *optional*):
             Heatmap ground truth for computing the loss.
+        target_weights (`torch.FloatTensor` of shape `(batch_size, num_keypoints)` or `(batch_size, num_keypoints, height, width)`, *optional*):
+            Per-keypoint (2D) or per-pixel (4D) weights multiplied into the squared error before averaging over all
+            heatmap elements. Use 0.0 for occluded or invisible keypoints. If `None`, plain MSE loss is used.
 
         Example:
 
@@ -1200,7 +1204,29 @@ def forward(
 
         loss = None
         if labels is not None:
-            raise NotImplementedError("Training is not yet supported")
+            if labels.shape != heatmaps.shape:
+                raise ValueError(f"Expected labels shape {heatmaps.shape}, got {labels.shape}")
+
+            if target_weights is None:
+                loss = torch.nn.functional.mse_loss(heatmaps, labels)
+            else:
+                if target_weights.ndim not in (2, 4):
+                    raise ValueError(f"Expected target_weights to have 2 or 4 dimensions, got {target_weights.ndim}")
+
+                if target_weights.shape != labels.shape[: target_weights.ndim]:
+                    raise ValueError(
+                        f"Expected target_weights shape to match {labels.shape[: target_weights.ndim]}, "
+                        f"got {target_weights.shape}"
+                    )
+
+                per_pixel_loss = torch.nn.functional.mse_loss(heatmaps, labels, reduction="none")
+
+                ndim_pad = labels.ndim - target_weights.ndim
+                mask = target_weights.view(target_weights.shape + (1,) * ndim_pad)
+
+                mask = mask.to(heatmaps.dtype)
+
+                loss = (per_pixel_loss * mask).mean()
 
         return Sapiens2PoseEstimatorOutput(
             loss=loss,

diff --git a/src/transformers/models/sapiens2/modular_sapiens2.py b/src/transformers/models/sapiens2/modular_sapiens2.py
@@ -1661,6 +1661,7 @@ def forward(
         pixel_values: torch.FloatTensor,
         flip_pairs: torch.Tensor | None = None,
         labels: torch.FloatTensor | None = None,
+        target_weights: torch.FloatTensor | None = None,
         **kwargs: Unpack[TransformersKwargs],
     ) -> Sapiens2PoseEstimatorOutput:
         r"""
@@ -1671,6 +1672,9 @@ def forward(
             original orientation.
         labels (`torch.FloatTensor` of shape `(batch_size, num_keypoints, height, width)`, *optional*):
             Heatmap ground truth for computing the loss.
+        target_weights (`torch.FloatTensor` of shape `(batch_size, num_keypoints)` or `(batch_size, num_keypoints, height, width)`, *optional*):
+            Per-keypoint (2D) or per-pixel (4D) weights multiplied into the squared error before averaging over all
+            heatmap elements. Use 0.0 for occluded or invisible keypoints. If `None`, plain MSE loss is used.
 
         Example:
 
@@ -1710,7 +1714,29 @@ def forward(
 
         loss = None
         if labels is not None:
-            raise NotImplementedError("Training is not yet supported")
+            if labels.shape != heatmaps.shape:
+                raise ValueError(f"Expected labels shape {heatmaps.shape}, got {labels.shape}")
+
+            if target_weights is None:
+                loss = torch.nn.functional.mse_loss(heatmaps, labels)
+            else:
+                if target_weights.ndim not in (2, 4):
+                    raise ValueError(f"Expected target_weights to have 2 or 4 dimensions, got {target_weights.ndim}")
+
+                if target_weights.shape != labels.shape[: target_weights.ndim]:
+                    raise ValueError(
+                        f"Expected target_weights shape to match {labels.shape[: target_weights.ndim]}, "
+                        f"got {target_weights.shape}"
+                    )
+
+                per_pixel_loss = torch.nn.functional.mse_loss(heatmaps, labels, reduction="none")
+
+                ndim_pad = labels.ndim - target_weights.ndim
+                mask = target_weights.view(target_weights.shape + (1,) * ndim_pad)
+
+                mask = mask.to(heatmaps.dtype)
+
+                loss = (per_pixel_loss * mask).mean()
 
         return Sapiens2PoseEstimatorOutput(
             loss=loss,

diff --git a/tests/models/sapiens2/test_modeling_sapiens2.py b/tests/models/sapiens2/test_modeling_sapiens2.py
@@ -187,15 +187,43 @@ def create_and_check_for_pose_estimation(self, config, pixel_values, labels):
         model = Sapiens2ForPoseEstimation(config)
         model.to(torch_device)
         model.eval()
+
         with torch.no_grad():
             result = model(pixel_values)
+
         patch_height = self.image_size // self.patch_size
         expected_h = patch_height * (2 ** len(config.head_config.upsample_out_channels))
+
         self.parent.assertEqual(
             result.heatmaps.shape,
             (self.batch_size, config.num_labels, expected_h, expected_h),
         )
 
+        pose_labels = torch.randn_like(result.heatmaps)
+
+        with torch.no_grad():
+            result_with_loss = model(
+                pixel_values,
+                labels=pose_labels,
+            )
+
+        self.parent.assertIsNotNone(result_with_loss.loss)
+
+        target_weights = torch.ones(
+            self.batch_size,
+            config.num_labels,
+            device=pixel_values.device,
+        )
+
+        with torch.no_grad():
+            result_with_weights = model(
+                pixel_values,
+                labels=pose_labels,
+                target_weights=target_weights,
+            )
+
+        self.parent.assertIsNotNone(result_with_weights.loss)
+
     def create_and_check_for_normal_estimation(self, config, pixel_values, labels):
         model = Sapiens2ForNormalEstimation(config)
         model.to(torch_device)