Merge branch 4.x

asmorkalov · asmorkalov · commit d031ffdd8943 · 2025-08-13T10:53:16.000+03:00
diff --git a/modules/cudafilters/include/opencv2/cudafilters.hpp b/modules/cudafilters/include/opencv2/cudafilters.hpp
@@ -97,6 +97,9 @@ center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa boxFilter
  */
 CV_EXPORTS_W Ptr<Filter> createBoxFilter(int srcType, int dstType, Size ksize, Point anchor = Point(-1, -1),
@@ -115,6 +118,9 @@ center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa filter2D
  */
 CV_EXPORTS_W Ptr<Filter> createLinearFilter(int srcType, int dstType, InputArray kernel, Point anchor = Point(-1, -1),
@@ -134,6 +140,9 @@ applied (see getDerivKernels ).
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa Laplacian
  */
 CV_EXPORTS_W Ptr<Filter> createLaplacianFilter(int srcType, int dstType, int ksize = 1, double scale = 1,
@@ -156,6 +165,9 @@ the aperture center.
 borderInterpolate.
 @param columnBorderMode Pixel extrapolation method in the horizontal direction.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa sepFilter2D
  */
 CV_EXPORTS_W Ptr<Filter> createSeparableLinearFilter(int srcType, int dstType, InputArray rowKernel, InputArray columnKernel,
@@ -178,6 +190,9 @@ applied. For details, see getDerivKernels .
 @param rowBorderMode Pixel extrapolation method in the vertical direction. For details, see
 borderInterpolate.
 @param columnBorderMode Pixel extrapolation method in the horizontal direction.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createDerivFilter(int srcType, int dstType, int dx, int dy,
                                          int ksize, bool normalize = false, double scale = 1,
@@ -196,6 +211,9 @@ applied. For details, see getDerivKernels .
 borderInterpolate.
 @param columnBorderMode Pixel extrapolation method in the horizontal direction.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa Sobel
  */
 CV_EXPORTS_W Ptr<Filter> createSobelFilter(int srcType, int dstType, int dx, int dy, int ksize = 3,
@@ -213,6 +231,9 @@ applied. See getDerivKernels for details.
 borderInterpolate.
 @param columnBorderMode Pixel extrapolation method in the horizontal direction.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa Scharr
  */
 CV_EXPORTS_W Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, int dy,
@@ -233,6 +254,9 @@ CV_EXPORTS_W Ptr<Filter> createScharrFilter(int srcType, int dstType, int dx, in
 borderInterpolate.
 @param columnBorderMode Pixel extrapolation method in the horizontal direction.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa GaussianBlur
  */
 CV_EXPORTS_W Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksize,
@@ -258,6 +282,9 @@ CV_EXPORTS_W Ptr<Filter> createGaussianFilter(int srcType, int dstType, Size ksi
 is at the center.
 @param iterations Number of times erosion and dilation to be applied.
 
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
+
 @sa morphologyEx
  */
 CV_EXPORTS_W Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray kernel, Point anchor = Point(-1, -1), int iterations = 1);
@@ -272,6 +299,9 @@ CV_EXPORTS_W Ptr<Filter> createMorphologyFilter(int op, int srcType, InputArray
 @param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
                                           Point anchor = Point(-1, -1),
@@ -284,6 +314,9 @@ CV_EXPORTS_W Ptr<Filter> createBoxMaxFilter(int srcType, Size ksize,
 @param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
                                           Point anchor = Point(-1, -1),
@@ -300,6 +333,9 @@ CV_EXPORTS_W Ptr<Filter> createBoxMinFilter(int srcType, Size ksize,
 @param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createRowSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
@@ -311,6 +347,9 @@ CV_EXPORTS_W Ptr<Filter> createRowSumFilter(int srcType, int dstType, int ksize,
 @param anchor Anchor point. The default value (-1) means that the anchor is at the kernel center.
 @param borderMode Pixel extrapolation method. For details, see borderInterpolate .
 @param borderVal Default border value.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createColumnSumFilter(int srcType, int dstType, int ksize, int anchor = -1, int borderMode = BORDER_DEFAULT, Scalar borderVal = Scalar::all(0));
 
@@ -329,6 +368,9 @@ Outputs an image that has been filtered using a median-filtering formulation.
 Details on this algorithm can be found in:
 Green, O., 2017. "Efficient scalable median filtering using histogram-based operations",
                   IEEE Transactions on Image Processing, 27(5), pp.2217-2228.
+
+@note
+If applied in a CUDA Stream, a distinct filter instance must be created for each Stream. Sharing a single instance across multiple streams is unsupported and may lead to undefined behavior due to stream-specific internal state.
  */
 CV_EXPORTS_W Ptr<Filter> createMedianFilter(int srcType, int windowSize, int partition = 128);
 
diff --git a/modules/cudawarping/include/opencv2/cudawarping.hpp b/modules/cudawarping/include/opencv2/cudawarping.hpp
@@ -118,6 +118,7 @@ CV_EXPORTS_W void resize(InputArray src, OutputArray dst, Size dsize, double fx=
 @param src Source image. CV_8U , CV_16U , CV_32S , or CV_32F depth and 1, 3, or 4 channels are
 supported.
 @param dst Destination image with the same type as src . The size is dsize .
+    **In-place operation (src == dst) is not supported and will result in an error.**
 @param M *2x3* Mat or UMat transformation matrix.
 @param dsize Size of the destination image.
 @param flags Combination of interpolation methods (see resize) and the optional flag
@@ -127,6 +128,7 @@ INTER_NEAREST , INTER_LINEAR , and INTER_CUBIC interpolation methods are support
 @param borderValue
 @param stream Stream for the asynchronous version.
 
+@note In-place operation is not supported: when src and dst share the same data pointer a cv::Exception is raised. Other overlaps between src and dst are not detected and give undefined results.
 @sa warpAffine
  */
 CV_EXPORTS void warpAffine(InputArray src, OutputArray dst, InputArray M, Size dsize, int flags = INTER_LINEAR,
diff --git a/modules/cudawarping/src/warp.cpp b/modules/cudawarping/src/warp.cpp
@@ -208,6 +208,8 @@ void cv::cuda::warpAffine(InputArray _src, OutputArray _dst, InputArray _M, Size
     _dst.create(dsize, src.type());
     GpuMat dst = _dst.getGpuMat();
 
+    CV_Assert( src.data != dst.data && "In-place operation not supported for cv::cuda::warpAffine" );
+
     Size wholeSize;
     Point ofs;
     src.locateROI(wholeSize, ofs);
diff --git a/modules/cudawarping/test/test_warp_affine.cpp b/modules/cudawarping/test/test_warp_affine.cpp
@@ -222,6 +222,36 @@ CUDA_TEST_P(WarpAffine, Accuracy)
     EXPECT_MAT_NEAR(dst_gold, dst, src.depth() == CV_32F ? 1e-1 : 1.0);
 }
 
+CUDA_TEST_P(WarpAffine, OverlapDetection)
+{
+    // Upload a random source image; stop the test if the generated matrix is empty.
+    const cv::Mat hostSrc = randomMat(size, type);
+    ASSERT_FALSE(hostSrc.empty());
+    cv::cuda::GpuMat gpuSrc;
+    gpuSrc.upload(hostSrc);
+
+    // Identity 2x3 transform; the fixture's `inverse` parameter is folded into the warp flags.
+    const cv::Mat M = cv::Mat::eye(2, 3, CV_64FC1);
+    int flags = interpolation;
+    if (inverse) { flags |= cv::WARP_INVERSE_MAP; }
+    const cv::Scalar borderValue = cv::Scalar::all(0);
+
+    {
+        // dst is a full-size ROI header over src's buffer, so both share one data pointer.
+        cv::cuda::GpuMat aliasedDst(gpuSrc, cv::Rect(0, 0, size.width, size.height));
+        EXPECT_THROW(cv::cuda::warpAffine(gpuSrc, aliasedDst, M, size, flags, borderType, borderValue),
+                     cv::Exception);
+    }
+
+    {
+        // dst has its own allocation, so the very same call has to succeed.
+        cv::cuda::GpuMat separateDst(size, gpuSrc.type());
+        ASSERT_NE(gpuSrc.data, separateDst.data);
+        EXPECT_NO_THROW(
+            cv::cuda::warpAffine(gpuSrc, separateDst, M, size, flags, borderType, borderValue));
+    }
+}
+
 INSTANTIATE_TEST_CASE_P(CUDA_Warping, WarpAffine, testing::Combine(
     ALL_DEVICES,
     DIFFERENT_SIZES,
diff --git a/modules/gapi/include/opencv2/gapi/infer/ov.hpp b/modules/gapi/include/opencv2/gapi/infer/ov.hpp
@@ -66,6 +66,8 @@ struct ParamDesc {
         LayerVariantAttr<std::vector<float>> scale_values;
 
         LayerVariantAttr<int> interpolation;
+
+        bool clamp_outputs = false;
     };
 
     struct CompiledModel {
@@ -356,6 +358,24 @@ template<typename Net> struct Params {
         return *this;
     }
 
+    /** @brief Enables or disables clamping of model outputs in the PrePostProcessor.
+
+    By default, output values are clamped to the valid range for the output precision
+    by the device or plugin. Enabling this option moves clamping to the PrePostProcessor stage.
+
+    @note This feature is only available with OpenVINO 2025.2 and newer.
+
+    @param flag If true, clamping is performed in the PrePostProcessor;
+    otherwise, it is handled by the device or plugin.
+    @return reference to this parameter structure.
+    */
+    Params<Net>&
+    cfgClampOutputs(bool flag = true) {
+        // Stored on the model description; the OV backend reads it when it sets up output post-processing.
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "clamp outputs").clamp_outputs = flag;
+        return *this;
+    }
+
     /** @brief Specifies the new shape for input layers.
 
     The function is used to set new shape for input layers.
@@ -625,6 +645,14 @@ class Params<cv::gapi::Generic> {
         return *this;
     }
 
+    /** @see ov::Params::cfgClampOutputs. */
+    Params&
+    cfgClampOutputs(bool flag = true) {
+        detail::getModelToSetAttrOrThrow(m_desc.kind, "clamp outputs")
+            .clamp_outputs = flag;
+        return *this;
+    }
+
     /** @see ov::Params::cfgReshape. */
     Params& cfgReshape(std::vector<size_t> new_shape) {
         detail::getModelToSetAttrOrThrow(m_desc.kind, "reshape")
diff --git a/modules/gapi/src/backends/ov/govbackend.cpp b/modules/gapi/src/backends/ov/govbackend.cpp
@@ -147,6 +147,25 @@ static int toCV(const ov::element::Type &type) {
     return -1;
 }
 
+// Maps an OpenCV depth constant to the {lowest, highest} finite values of that type, as doubles.
+static inline std::pair<double, double> get_CV_type_range(int cv_type) {
+    if (cv_type == CV_8U) {
+        using Limits = std::numeric_limits<uint8_t>;
+        return { static_cast<double>(Limits::min()), static_cast<double>(Limits::max()) };
+    }
+    if (cv_type == CV_32S) {
+        using Limits = std::numeric_limits<int32_t>;
+        return { static_cast<double>(Limits::min()), static_cast<double>(Limits::max()) };
+    }
+    if (cv_type == CV_32F) {
+        using Limits = std::numeric_limits<float>;  // lowest(), not min(): min() is the smallest positive float
+        return { static_cast<double>(Limits::lowest()), static_cast<double>(Limits::max()) };
+    }
+    if (cv_type == CV_16F) { return { -65504.0, 65504.0 }; }  // largest finite half-precision magnitude
+    GAPI_Error("OV Backend: Unsupported data type");  // every other depth is rejected
+    return {0.0, 0.0};  // only reached if GAPI_Error returns; presumably it does not - TODO confirm
+}
+
 static void copyFromOV(const ov::Tensor &tensor, cv::Mat &mat) {
     const auto total = mat.total() * mat.channels();
     if (toCV(tensor.get_element_type()) != mat.depth() ||
@@ -1052,6 +1071,20 @@ class PrePostProcWrapper {
             if (explicit_out_tensor_prec) {
                 m_ppp.output(output_name).tensor()
                     .set_element_type(toOV(*explicit_out_tensor_prec));
+
+                if (m_model_info.clamp_outputs) {
+                    #if INF_ENGINE_RELEASE >= 2025020000
+                    auto clamp_range = get_CV_type_range(*explicit_out_tensor_prec);
+                    m_ppp.output(output_name).postprocess()
+                        .clamp(clamp_range.first, clamp_range.second);
+                    #else
+                    static bool warned = false;
+                    if (!warned) {
+                        GAPI_LOG_WARNING(NULL, "cfgClampOutputs is enabled, but not supported in this OpenVINO version. Clamping will be ignored.");
+                        warned = true;
+                    }
+                    #endif // INF_ENGINE_RELEASE >= 2025020000
+                }
             }
         }
     }
diff --git a/modules/videostab/src/cuda/global_motion.cu b/modules/videostab/src/cuda/global_motion.cu
@@ -52,6 +52,11 @@ namespace cv { namespace cuda { namespace device { namespace globmotion {
 __constant__ float cml[9];
 __constant__ float cmr[9];
 
+struct is_zero  // stencil predicate handed to thrust::remove_if: true where the mask byte equals zero
+{
+    __host__ __device__ bool operator()(uchar maskByte) const { return maskByte == 0; }
+};
+
 int compactPoints(int N, float *points0, float *points1, const uchar *mask)
 {
     thrust::device_ptr<float2> dpoints0((float2*)points0);
@@ -60,7 +65,7 @@ int compactPoints(int N, float *points0, float *points1, const uchar *mask)
 
     return (int)(thrust::remove_if(thrust::make_zip_iterator(thrust::make_tuple(dpoints0, dpoints1)),
                              thrust::make_zip_iterator(thrust::make_tuple(dpoints0 + N, dpoints1 + N)),
-                             dmask, thrust::not1(thrust::identity<uchar>()))
+                             dmask, is_zero())
            - thrust::make_zip_iterator(make_tuple(dpoints0, dpoints1)));
 }