Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions common/gst-zed-recovery.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// /////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2026, STEREOLABS.
//
// All rights reserved.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// /////////////////////////////////////////////////////////////////////////

#pragma once

#include <cuda_runtime.h>
#include <gst/base/gstbasesrc.h>
#include <gst/gst.h>
#include <sl/Camera.hpp>

///
/// \brief Shared recovery-aware grab loop for ZED GStreamer source plugins.
///
/// During multi-camera Argus recovery, grab() returns CAMERA_REBOOTING (-1)
/// for 10-30s while the ProviderGuardian coordinates provider destruction
/// and recreation. This helper retries instead of killing the pipeline.
///
/// grab_fn is called until it returns something other than CAMERA_REBOOTING
/// or CUDA_ERROR. Between two attempts the helper sleeps one second (in 100ms
/// slices so a flushing pad is noticed quickly) and then clears any pending
/// CUDA runtime error with cudaGetLastError().
///
/// \note The wait budget only counts the one-second sleeps between attempts;
///       time spent inside grab_fn itself is not included.
///
/// \param element      GstElement pointer (for GST_*_OBJECT logging macros)
/// \param basesrc      GstBaseSrc pointer (for flushing check)
/// \param grab_fn      Callable returning sl::ERROR_CODE (e.g., grab())
/// \param max_wait_sec Maximum seconds to wait before declaring timeout
///                     (0 or negative: a single attempt, no retry)
/// \param[out] waited  Set to the number of whole seconds slept while waiting
///                     (0 if no recovery), or -1 if the pad was flushing.
///                     May be nullptr when the caller does not need it.
///
/// \return The final sl::ERROR_CODE from grab_fn.
///         On timeout: the last recovery error code (caller should post GST_ELEMENT_ERROR).
///         On flushing: sl::ERROR_CODE::FAILURE with *waited set to -1 as sentinel.
///
/// Usage:
/// \code
///   int waited = 0;
///   ret = zed_gst_grab_with_recovery(GST_ELEMENT(src), GST_BASE_SRC(src),
///       [&]() { return src->zed.grab(rtParams); }, 60, &waited);
///   if (waited == -1) { /* pipeline flushing */ }
///   if (ret == sl::ERROR_CODE::SUCCESS && waited > 0)
///       GST_INFO_OBJECT(src, "recovered after %ds", waited);
/// \endcode
///
template <typename GrabFn>
static inline sl::ERROR_CODE zed_gst_grab_with_recovery(GstElement *element, GstBaseSrc *basesrc,
                                                        GrabFn grab_fn, int max_wait_sec,
                                                        int *waited) {
    // Callers that do not care about the duration may pass nullptr; count
    // into a local in that case instead of dereferencing a null pointer.
    int waited_fallback = 0;
    int &slept_sec = (waited != nullptr) ? *waited : waited_fallback;
    slept_sec = 0;

    while (true) {
        const sl::ERROR_CODE ret = grab_fn();

        // Anything other than the two "recovery in progress" codes is final
        // and is handed back to the caller untouched.
        if (ret != sl::ERROR_CODE::CAMERA_REBOOTING && ret != sl::ERROR_CODE::CUDA_ERROR) {
            return ret;
        }

        // Recovery path. Warn only once, on the first failed attempt.
        if (slept_sec == 0) {
            GST_WARNING_OBJECT(element, "Camera recovering (error: %s), waiting...",
                               sl::toString(ret).c_str());
        }

        // Budget exhausted. slept_sec is only incremented after a full second
        // has been slept, so the value reported to the caller is the time
        // really waited (and stays 0 when max_wait_sec disables retrying).
        if (slept_sec >= max_wait_sec) {
            GST_ERROR_OBJECT(element, "Camera recovery timeout after %ds (last error: %s)",
                             max_wait_sec, sl::toString(ret).c_str());
            return ret;   // caller posts GST_ELEMENT_ERROR
        }

        // Sleep 1s in 100ms chunks — check for pipeline flushing so that
        // gst_element_set_state(NULL) isn't blocked.
        for (int ms = 0; ms < 1000; ms += 100) {
            if (GST_PAD_IS_FLUSHING(GST_BASE_SRC_PAD(basesrc))) {
                slept_sec = -1;   // sentinel: flushing
                return sl::ERROR_CODE::FAILURE;
            }
            g_usleep(100000);
        }
        ++slept_sec;

        // cudaGetLastError() returns and resets the pending runtime error, so
        // the next grab attempt does not start with a stale error recorded.
        const cudaError_t cu = cudaGetLastError();
        if (cu != cudaSuccess) {
            GST_DEBUG_OBJECT(element, "Cleared CUDA error %d during recovery", (int) cu);
        }
    }
}
138 changes: 102 additions & 36 deletions gst-zed-src/gstzedsrc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
#endif

#include "gst-zed-meta/gstzedmeta.h"
#include "common/gst-zed-recovery.hpp"
#include "gstzedsrc.h"

// AI Module
Expand Down Expand Up @@ -312,6 +313,7 @@ enum {
PROP_SVO_REC_ENABLE,
PROP_SVO_REC_FILENAME,
PROP_SVO_REC_COMPRESSION,
PROP_RECOVERY_TIMEOUT,
N_PROPERTIES
};

Expand Down Expand Up @@ -534,6 +536,7 @@ typedef enum {
#define DEFAULT_PROP_SVO_REC_ENABLE FALSE
#define DEFAULT_PROP_SVO_REC_FILENAME ""
#define DEFAULT_PROP_SVO_REC_COMPRESSION GST_ZEDSRC_SVO_COMPRESSION_H265
#define DEFAULT_PROP_RECOVERY_TIMEOUT 60
//////////////////////////////////////////////////////////////////////////////////////////////////////////////

typedef enum {
Expand Down Expand Up @@ -1843,6 +1846,13 @@ static void gst_zedsrc_class_init(GstZedSrcClass *klass) {
"Object Detection Custom ONNX Dynamic Input Shape Height", 0, 10000,
DEFAULT_PROP_OD_CUSTOM_ONNX_DYNAMIC_INPUT_SHAPE_H,
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));

g_object_class_install_property(
gobject_class, PROP_RECOVERY_TIMEOUT,
g_param_spec_int("recovery-timeout", "Recovery Timeout",
"Maximum seconds to wait during camera recovery before failing (0 = no retry)",
0, 300, DEFAULT_PROP_RECOVERY_TIMEOUT,
(GParamFlags) (G_PARAM_READWRITE | G_PARAM_STATIC_STRINGS)));
}

static void gst_zedsrc_reset(GstZedSrc *src) {
Expand Down Expand Up @@ -1996,6 +2006,7 @@ static void gst_zedsrc_init(GstZedSrc *src) {
src->svo_rec_filename = g_string_new(DEFAULT_PROP_SVO_REC_FILENAME);
src->svo_rec_compression = DEFAULT_PROP_SVO_REC_COMPRESSION;
src->svo_rec_active = FALSE;
src->recovery_timeout = DEFAULT_PROP_RECOVERY_TIMEOUT;
// <---- Parameters initialization

src->stop_requested = FALSE;
Expand Down Expand Up @@ -2409,6 +2420,9 @@ void gst_zedsrc_set_property(GObject *object, guint property_id, const GValue *v
case PROP_OD_CUSTOM_ONNX_DYNAMIC_INPUT_SHAPE_H:
src->od_custom_onnx_dynamic_input_shape_h = g_value_get_int(value);
break;
case PROP_RECOVERY_TIMEOUT:
src->recovery_timeout = g_value_get_int(value);
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, property_id, pspec);
break;
Expand Down Expand Up @@ -2760,6 +2774,9 @@ void gst_zedsrc_get_property(GObject *object, guint property_id, GValue *value,
case PROP_OD_CUSTOM_ONNX_DYNAMIC_INPUT_SHAPE_H:
g_value_set_int(value, src->od_custom_onnx_dynamic_input_shape_h);
break;
case PROP_RECOVERY_TIMEOUT:
g_value_set_int(value, src->recovery_timeout);
break;
default:
G_OBJECT_WARN_INVALID_PROPERTY_ID(object, property_id, pspec);
break;
Expand Down Expand Up @@ -3997,21 +4014,31 @@ static GstFlowReturn gst_zedsrc_fill(GstPushSrc *psrc, GstBuffer *buf) {
// <---- Set runtime parameters

/// Push zed cuda context as current
int cu_err = (int) cudaGetLastError();
if (cu_err > 0) {
GST_ELEMENT_ERROR(src, RESOURCE, FAILED, ("Cuda ERROR trigger before ZED SDK : %d", cu_err),
(NULL));
return GST_FLOW_ERROR;
// Clear stale CUDA errors — during camera recovery the previous frame's
// CUDA state may be corrupted. We log it but do NOT fail the pipeline;
// grab() below will return CAMERA_REBOOTING and we'll retry.
{
int cu_err = (int) cudaGetLastError();
if (cu_err > 0) {
// cudaGetLastError() already cleared the error above; log it for diagnostics
GST_WARNING_OBJECT(src, "CUDA error %d detected before grab — cleared (camera may be recovering)", cu_err);
}
}

zctx = src->zed.getCUDAContext();
cuCtxPushCurrent_v2(zctx);

/// Utils for check ret value and send to out
/// Utils for check ret value and send to out — skip during recovery
#define CHECK_RET_OR_GOTO(_ret_expr) \
do { \
ret = (_ret_expr); \
if (ret != sl::ERROR_CODE::SUCCESS) { \
if (ret == sl::ERROR_CODE::CAMERA_REBOOTING || ret == sl::ERROR_CODE::CUDA_ERROR) { \
GST_WARNING_OBJECT(src, "Retrieve failed during recovery: %s — returning empty frame", \
sl::toString(ret).c_str()); \
flow_ret = GST_FLOW_OK; \
goto out; \
} \
GST_ELEMENT_ERROR(src, RESOURCE, FAILED, \
("Grabbing failed with error: '%s' - %s", sl::toString(ret).c_str(), \
sl::toVerbose(ret).c_str()), \
Expand All @@ -4021,15 +4048,27 @@ static GstFlowReturn gst_zedsrc_fill(GstPushSrc *psrc, GstBuffer *buf) {
} \
} while (0)

// ----> ZED grab
ret = src->zed.grab(zedRtParams);
if (ret > sl::ERROR_CODE::SUCCESS) {
GST_ELEMENT_ERROR(src, RESOURCE, FAILED,
("Grabbing failed with error: '%s' - %s", sl::toString(ret).c_str(),
sl::toVerbose(ret).c_str()),
(NULL));
flow_ret = GST_FLOW_ERROR;
goto out;
// ----> ZED grab with recovery retry loop
{
int waited = 0;
ret = zed_gst_grab_with_recovery(GST_ELEMENT(src), GST_BASE_SRC(src),
[&]() { return src->zed.grab(zedRtParams); }, src->recovery_timeout, &waited);

if (waited == -1) { flow_ret = GST_FLOW_FLUSHING; goto out; }
if (waited > 0) GST_INFO_OBJECT(src, "Camera recovered after %ds", waited);

if (ret == sl::ERROR_CODE::CAMERA_REBOOTING || ret == sl::ERROR_CODE::CUDA_ERROR) {
GST_ELEMENT_ERROR(src, RESOURCE, FAILED,
("Camera recovery timeout (last error: %s)", sl::toString(ret).c_str()), (NULL));
flow_ret = GST_FLOW_ERROR; goto out;
}
if (ret == sl::ERROR_CODE::END_OF_SVOFILE_REACHED) { flow_ret = GST_FLOW_EOS; goto out; }
if (ret != sl::ERROR_CODE::SUCCESS) {
GST_ELEMENT_ERROR(src, RESOURCE, FAILED,
("Grabbing failed with error: '%s' - %s", sl::toString(ret).c_str(),
sl::toVerbose(ret).c_str()), (NULL));
flow_ret = GST_FLOW_ERROR; goto out;
}
}
// <---- ZED grab

Expand Down Expand Up @@ -4207,15 +4246,54 @@ static GstFlowReturn gst_zedsrc_create(GstPushSrc *psrc, GstBuffer **outbuf) {
return GST_FLOW_ERROR;
}

ret = src->zed.grab(zedRtParams);
if (ret == sl::ERROR_CODE::END_OF_SVOFILE_REACHED) {
GST_INFO_OBJECT(src, "End of SVO file");
cuCtxPopCurrent_v2(NULL);
return GST_FLOW_EOS;
} else if (ret > sl::ERROR_CODE::SUCCESS) {
GST_ERROR_OBJECT(src, "grab() failed: %s", sl::toString(ret).c_str());
cuCtxPopCurrent_v2(NULL);
return GST_FLOW_ERROR;
// Grab + retrieve with recovery retry.
// Both grab() and retrieveImage() can return CAMERA_REBOOTING during the
// Guardian recovery window. The helper handles the grab retry; if
// retrieveImage also fails during recovery we retry the whole cycle.
sl::RawBuffer *raw_buffer = nullptr;
{
int waited = 0;
bool retrieve_ok = false;

while (!retrieve_ok) {
ret = zed_gst_grab_with_recovery(GST_ELEMENT(src), GST_BASE_SRC(src),
[&]() { return src->zed.grab(zedRtParams); }, src->recovery_timeout, &waited);

if (waited == -1) { cuCtxPopCurrent_v2(NULL); return GST_FLOW_FLUSHING; }
if (ret == sl::ERROR_CODE::CAMERA_REBOOTING || ret == sl::ERROR_CODE::CUDA_ERROR) {
GST_ELEMENT_ERROR(src, RESOURCE, FAILED,
("Camera recovery timeout (last error: %s)", sl::toString(ret).c_str()), (NULL));
cuCtxPopCurrent_v2(NULL); return GST_FLOW_ERROR;
}
if (ret == sl::ERROR_CODE::END_OF_SVOFILE_REACHED) {
cuCtxPopCurrent_v2(NULL); return GST_FLOW_EOS;
}
if (ret != sl::ERROR_CODE::SUCCESS) {
GST_ERROR_OBJECT(src, "grab() failed: %s", sl::toString(ret).c_str());
cuCtxPopCurrent_v2(NULL); return GST_FLOW_ERROR;
}

// Grab succeeded — try retrieve.
raw_buffer = new sl::RawBuffer();
ret = src->zed.retrieveImage(*raw_buffer);
if (ret == sl::ERROR_CODE::SUCCESS) {
retrieve_ok = true;
} else {
delete raw_buffer;
raw_buffer = nullptr;
if (ret == sl::ERROR_CODE::CAMERA_REBOOTING || ret == sl::ERROR_CODE::CUDA_ERROR) {
GST_WARNING_OBJECT(src, "RawBuffer retrieve failed during recovery: %s — retrying",
sl::toString(ret).c_str());
continue; // retry whole grab+retrieve
}
GST_ELEMENT_ERROR(src, RESOURCE, FAILED,
("Failed to retrieve RawBuffer: '%s'", sl::toString(ret).c_str()), (NULL));
cuCtxPopCurrent_v2(NULL); return GST_FLOW_ERROR;
}
}

if (waited > 0)
GST_INFO_OBJECT(src, "Camera recovered after %ds (NVMM path)", waited);
}

// Get clock for timestamp
Expand All @@ -4225,18 +4303,6 @@ static GstFlowReturn gst_zedsrc_create(GstPushSrc *psrc, GstBuffer **outbuf) {
gst_object_unref(clock);
}

// Retrieve RawBuffer - allocate on heap for GstBuffer lifecycle
sl::RawBuffer *raw_buffer = new sl::RawBuffer();
ret = src->zed.retrieveImage(*raw_buffer);
if (ret != sl::ERROR_CODE::SUCCESS) {
GST_ELEMENT_ERROR(src, RESOURCE, FAILED,
("Failed to retrieve RawBuffer: '%s'", sl::toString(ret).c_str()),
(NULL));
delete raw_buffer;
cuCtxPopCurrent_v2(NULL);
return GST_FLOW_ERROR;
}

// Get NvBufSurface for left eye
NvBufSurface *nvbuf = static_cast<NvBufSurface *>(raw_buffer->getRawBuffer());
if (!nvbuf) {
Expand Down
2 changes: 2 additions & 0 deletions gst-zed-src/gstzedsrc.h
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,8 @@ struct _GstZedSrc {
GString *svo_rec_filename;
gint svo_rec_compression;
gboolean svo_rec_active; // Internal state: is recording currently active

gint recovery_timeout; // Max seconds to wait during camera recovery (0 = no retry)
// <---- Properties

GstClockTime acq_start_time;
Expand Down
Loading