Skip to content

Commit 7be58c8

Browse files
watchdog: Touch Watchdog before executing most event loop callbacks to avoid spurious misses when there are many queued callbacks. (envoyproxy#13104)
This change assumes that the primary purposes of the watchdog is to terminate the proxy if worker threads are deadlocked, and aid in the debugging of long-running operations that end up scheduled in worker threads through monitoring for miss/megamiss events and auxiliary watchdog actions like the recently added watchdog profile action. Signed-off-by: Antonio Vicente <[email protected]>
1 parent fa59006 commit 7be58c8

24 files changed

+221
-97
lines changed

docs/root/version_history/current.rst

+1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ Bug Fixes
2929
* http: sending CONNECT_ERROR for HTTP/2 where appropriate during CONNECT requests.
3030
* proxy_proto: fixed a bug where the wrong downstream address got sent to upstream connections.
3131
* tls: fix read resumption after triggering buffer high-watermark and all remaining request/response bytes are stored in the SSL connection's internal buffers.
32+
* watchdog: touch the watchdog before most event loop operations to avoid misses when handling bursts of callbacks.
3233

3334
Removed Config or Runtime
3435
-------------------------

include/envoy/event/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ envoy_cc_library(
3131
"//include/envoy/network:listen_socket_interface",
3232
"//include/envoy/network:listener_interface",
3333
"//include/envoy/network:transport_socket_interface",
34+
"//include/envoy/server:watchdog_interface",
3435
"//include/envoy/thread:thread_interface",
3536
],
3637
)

include/envoy/event/dispatcher.h

+10
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include "envoy/network/listen_socket.h"
2020
#include "envoy/network/listener.h"
2121
#include "envoy/network/transport_socket.h"
22+
#include "envoy/server/watchdog.h"
2223
#include "envoy/stats/scope.h"
2324
#include "envoy/stats/stats_macros.h"
2425
#include "envoy/stream_info/stream_info.h"
@@ -63,6 +64,15 @@ class Dispatcher {
6364
*/
6465
virtual const std::string& name() PURE;
6566

67+
/**
68+
* Register a watchdog for this dispatcher. The dispatcher is responsible for touching the
69+
* watchdog at least once per touch interval. Dispatcher implementations may choose to touch more
70+
* often to avoid spurious miss events when processing long callback queues.
71+
* @param min_touch_interval Touch interval for the watchdog.
72+
*/
73+
virtual void registerWatchdog(const Server::WatchDogSharedPtr& watchdog,
74+
std::chrono::milliseconds min_touch_interval) PURE;
75+
6676
/**
6777
* Returns a time-source to use with this dispatcher.
6878
*/

include/envoy/server/BUILD

+1-2
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ envoy_cc_library(
6868
name = "guarddog_interface",
6969
hdrs = ["guarddog.h"],
7070
deps = [
71+
"//include/envoy/event:dispatcher_interface",
7172
"//include/envoy/server:watchdog_interface",
7273
"//include/envoy/stats:stats_interface",
7374
"//include/envoy/thread:thread_interface",
@@ -159,8 +160,6 @@ envoy_cc_library(
159160
name = "watchdog_interface",
160161
hdrs = ["watchdog.h"],
161162
deps = [
162-
"//include/envoy/event:dispatcher_interface",
163-
"//include/envoy/network:address_interface",
164163
"//include/envoy/thread:thread_interface",
165164
],
166165
)

include/envoy/server/guarddog.h

+4-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
#pragma once
22

33
#include "envoy/common/pure.h"
4+
#include "envoy/event/dispatcher.h"
45
#include "envoy/server/watchdog.h"
56

67
namespace Envoy {
@@ -28,9 +29,11 @@ class GuardDog {
2829
*
2930
* @param thread_id a Thread::ThreadId containing the system thread id
3031
* @param thread_name supplies the name of the thread which is used for per-thread miss stats.
32+
* @param dispatcher dispatcher responsible for petting the watchdog.
3133
*/
3234
virtual WatchDogSharedPtr createWatchDog(Thread::ThreadId thread_id,
33-
const std::string& thread_name) PURE;
35+
const std::string& thread_name,
36+
Event::Dispatcher& dispatcher) PURE;
3437

3538
/**
3639
* Tell the GuardDog to forget about this WatchDog.

include/envoy/server/watchdog.h

+3-13
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#include <memory>
44

55
#include "envoy/common/pure.h"
6-
#include "envoy/event/dispatcher.h"
76
#include "envoy/thread/thread.h"
87

98
namespace Envoy {
@@ -19,21 +18,12 @@ class WatchDog {
1918
public:
2019
virtual ~WatchDog() = default;
2120

22-
/**
23-
* Start a recurring touch timer in the dispatcher passed as argument.
24-
*
25-
* This will automatically call the touch() method at the interval specified
26-
* during construction.
27-
*
28-
* The timer object is stored within the WatchDog object. It will go away if
29-
* the object goes out of scope and stop the timer.
30-
*/
31-
virtual void startWatchdog(Event::Dispatcher& dispatcher) PURE;
32-
3321
/**
3422
* Manually indicate that you are still alive by calling this.
3523
*
36-
* This can be used if this is later used on a thread where there is no dispatcher.
24+
* When the watchdog is registered with a dispatcher, the dispatcher will periodically call this
25+
* method to indicate the thread is still alive. It should be called directly by the application
26+
* code in cases where the watchdog is not registered with a dispatcher.
3727
*/
3828
virtual void touch() PURE;
3929
virtual Thread::ThreadId threadId() const PURE;

source/common/event/dispatcher_impl.cc

+30-3
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,13 @@ DispatcherImpl::DispatcherImpl(const std::string& name, Buffer::WatermarkFactory
5858

5959
DispatcherImpl::~DispatcherImpl() { FatalErrorHandler::removeFatalErrorHandler(*this); }
6060

61+
void DispatcherImpl::registerWatchdog(const Server::WatchDogSharedPtr& watchdog,
62+
std::chrono::milliseconds min_touch_interval) {
63+
ASSERT(!watchdog_registration_, "Each dispatcher can have at most one registered watchdog.");
64+
watchdog_registration_ =
65+
std::make_unique<WatchdogRegistration>(watchdog, *scheduler_, min_touch_interval, *this);
66+
}
67+
6168
void DispatcherImpl::initializeStats(Stats::Scope& scope,
6269
const absl::optional<std::string>& prefix) {
6370
const std::string effective_prefix = prefix.has_value() ? *prefix : absl::StrCat(name_, ".");
@@ -150,7 +157,13 @@ Network::DnsResolverSharedPtr DispatcherImpl::createDnsResolver(
150157
FileEventPtr DispatcherImpl::createFileEvent(os_fd_t fd, FileReadyCb cb, FileTriggerType trigger,
151158
uint32_t events) {
152159
ASSERT(isThreadSafe());
153-
return FileEventPtr{new FileEventImpl(*this, fd, cb, trigger, events)};
160+
return FileEventPtr{new FileEventImpl(
161+
*this, fd,
162+
[this, cb](uint32_t events) {
163+
touchWatchdog();
164+
cb(events);
165+
},
166+
trigger, events)};
154167
}
155168

156169
Filesystem::WatcherPtr DispatcherImpl::createFilesystemWatcher() {
@@ -179,11 +192,19 @@ TimerPtr DispatcherImpl::createTimer(TimerCb cb) {
179192

180193
Event::SchedulableCallbackPtr DispatcherImpl::createSchedulableCallback(std::function<void()> cb) {
181194
ASSERT(isThreadSafe());
182-
return base_scheduler_.createSchedulableCallback(cb);
195+
return base_scheduler_.createSchedulableCallback([this, cb]() {
196+
touchWatchdog();
197+
cb();
198+
});
183199
}
184200

185201
TimerPtr DispatcherImpl::createTimerInternal(TimerCb cb) {
186-
return scheduler_->createTimer(cb, *this);
202+
return scheduler_->createTimer(
203+
[this, cb]() {
204+
touchWatchdog();
205+
cb();
206+
},
207+
*this);
187208
}
188209

189210
void DispatcherImpl::deferredDelete(DeferredDeletablePtr&& to_delete) {
@@ -257,5 +278,11 @@ void DispatcherImpl::runPostCallbacks() {
257278
}
258279
}
259280

281+
void DispatcherImpl::touchWatchdog() {
282+
if (watchdog_registration_) {
283+
watchdog_registration_->touchWatchdog();
284+
}
285+
}
286+
260287
} // namespace Event
261288
} // namespace Envoy

source/common/event/dispatcher_impl.h

+30
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ class DispatcherImpl : Logger::Loggable<Logger::Id::main>,
4242

4343
// Event::Dispatcher
4444
const std::string& name() override { return name_; }
45+
void registerWatchdog(const Server::WatchDogSharedPtr& watchdog,
46+
std::chrono::milliseconds min_touch_interval) override;
4547
TimeSource& timeSource() override { return api_.timeSource(); }
4648
void initializeStats(Stats::Scope& scope, const absl::optional<std::string>& prefix) override;
4749
void clearDeferredDeleteList() override;
@@ -93,9 +95,36 @@ class DispatcherImpl : Logger::Loggable<Logger::Id::main>,
9395
}
9496

9597
private:
98+
// Holds a reference to the watchdog registered with this dispatcher and the timer used to ensure
99+
// that the dog is touched periodically.
100+
class WatchdogRegistration {
101+
public:
102+
WatchdogRegistration(const Server::WatchDogSharedPtr& watchdog, Scheduler& scheduler,
103+
std::chrono::milliseconds timer_interval, Dispatcher& dispatcher)
104+
: watchdog_(watchdog), timer_interval_(timer_interval) {
105+
touch_timer_ = scheduler.createTimer(
106+
[this]() -> void {
107+
watchdog_->touch();
108+
touch_timer_->enableTimer(timer_interval_);
109+
},
110+
dispatcher);
111+
touch_timer_->enableTimer(timer_interval_);
112+
}
113+
114+
void touchWatchdog() { watchdog_->touch(); }
115+
116+
private:
117+
Server::WatchDogSharedPtr watchdog_;
118+
const std::chrono::milliseconds timer_interval_;
119+
TimerPtr touch_timer_;
120+
};
121+
using WatchdogRegistrationPtr = std::unique_ptr<WatchdogRegistration>;
122+
96123
TimerPtr createTimerInternal(TimerCb cb);
97124
void updateApproximateMonotonicTimeInternal();
98125
void runPostCallbacks();
126+
// Helper used to touch the watchdog after most schedulable, fd, and timer callbacks.
127+
void touchWatchdog();
99128

100129
// Validate that an operation is thread safe, i.e. it's invoked on the same thread that the
101130
// dispatcher run loop is executing on. We allow run_tid_ to be empty for tests where we don't
@@ -122,6 +151,7 @@ class DispatcherImpl : Logger::Loggable<Logger::Id::main>,
122151
const ScopeTrackedObject* current_object_{};
123152
bool deferred_deleting_{};
124153
MonotonicTime approximate_monotonic_time_;
154+
WatchdogRegistrationPtr watchdog_registration_;
125155
};
126156

127157
} // namespace Event

source/server/BUILD

+1-2
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,7 @@ envoy_cc_library(
113113
":watchdog_lib",
114114
"//include/envoy/api:api_interface",
115115
"//include/envoy/common:time_interface",
116+
"//include/envoy/event:dispatcher_interface",
116117
"//include/envoy/event:timer_interface",
117118
"//include/envoy/server:configuration_interface",
118119
"//include/envoy/server:guarddog_config_interface",
@@ -473,11 +474,9 @@ envoy_cc_library(
473474

474475
envoy_cc_library(
475476
name = "watchdog_lib",
476-
srcs = ["watchdog_impl.cc"],
477477
hdrs = ["watchdog_impl.h"],
478478
deps = [
479479
"//include/envoy/common:time_interface",
480-
"//include/envoy/event:dispatcher_interface",
481480
"//include/envoy/server:watchdog_interface",
482481
"//source/common/common:assert_lib",
483482
],

source/server/guarddog_impl.cc

+5-2
Original file line numberDiff line numberDiff line change
@@ -185,19 +185,22 @@ void GuardDogImpl::step() {
185185
}
186186

187187
WatchDogSharedPtr GuardDogImpl::createWatchDog(Thread::ThreadId thread_id,
188-
const std::string& thread_name) {
188+
const std::string& thread_name,
189+
Event::Dispatcher& dispatcher) {
189190
// Timer started by WatchDog will try to fire at 1/2 of the interval of the
190191
// minimum timeout specified. loop_interval_ is const so all shared state
191192
// accessed out of the locked section below is const (time_source_ has no
192193
// state).
193194
const auto wd_interval = loop_interval_ / 2;
194-
auto new_watchdog = std::make_shared<WatchDogImpl>(std::move(thread_id), wd_interval);
195+
auto new_watchdog = std::make_shared<WatchDogImpl>(std::move(thread_id));
195196
WatchedDogPtr watched_dog = std::make_unique<WatchedDog>(stats_scope_, thread_name, new_watchdog);
196197
new_watchdog->touch();
197198
{
198199
Thread::LockGuard guard(wd_lock_);
199200
watched_dogs_.push_back(std::move(watched_dog));
200201
}
202+
dispatcher.registerWatchdog(new_watchdog, wd_interval);
203+
new_watchdog->touch();
201204
return new_watchdog;
202205
}
203206

source/server/guarddog_impl.h

+2-2
Original file line numberDiff line numberDiff line change
@@ -92,8 +92,8 @@ class GuardDogImpl : public GuardDog {
9292
}
9393

9494
// Server::GuardDog
95-
WatchDogSharedPtr createWatchDog(Thread::ThreadId thread_id,
96-
const std::string& thread_name) override;
95+
WatchDogSharedPtr createWatchDog(Thread::ThreadId thread_id, const std::string& thread_name,
96+
Event::Dispatcher& dispatcher) override;
9797
void stopWatching(WatchDogSharedPtr wd) override;
9898

9999
private:

source/server/server.cc

+1-2
Original file line numberDiff line numberDiff line change
@@ -682,8 +682,7 @@ void InstanceImpl::run() {
682682
// Run the main dispatch loop waiting to exit.
683683
ENVOY_LOG(info, "starting main dispatch loop");
684684
auto watchdog = main_thread_guard_dog_->createWatchDog(api_->threadFactory().currentThreadId(),
685-
"main_thread");
686-
watchdog->startWatchdog(*dispatcher_);
685+
"main_thread", *dispatcher_);
687686
dispatcher_->post([this] { notifyCallbacksForStage(Stage::Startup); });
688687
dispatcher_->run(Event::Dispatcher::RunType::Block);
689688
ENVOY_LOG(info, "main dispatch loop exited");

source/server/watchdog_impl.cc

-19
This file was deleted.

source/server/watchdog_impl.h

+2-9
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,7 @@
11
#pragma once
22

33
#include <atomic>
4-
#include <chrono>
54

6-
#include "envoy/common/time.h"
7-
#include "envoy/event/dispatcher.h"
85
#include "envoy/server/watchdog.h"
96

107
namespace Envoy {
@@ -17,17 +14,15 @@ namespace Server {
1714
class WatchDogImpl : public WatchDog {
1815
public:
1916
/**
20-
* @param interval WatchDog timer interval (used after startWatchdog())
17+
* @param thread_id ThreadId of the monitored thread
2118
*/
22-
WatchDogImpl(Thread::ThreadId thread_id, std::chrono::milliseconds interval)
23-
: thread_id_(thread_id), timer_interval_(interval) {}
19+
WatchDogImpl(Thread::ThreadId thread_id) : thread_id_(thread_id) {}
2420

2521
Thread::ThreadId threadId() const override { return thread_id_; }
2622
// Used by GuardDogImpl determine if the watchdog was touched recently and reset the touch status.
2723
bool getTouchedAndReset() { return touched_.exchange(false, std::memory_order_relaxed); }
2824

2925
// Server::WatchDog
30-
void startWatchdog(Event::Dispatcher& dispatcher) override;
3126
void touch() override {
3227
// Set touched_ if not already set.
3328
bool expected = false;
@@ -37,8 +32,6 @@ class WatchDogImpl : public WatchDog {
3732
private:
3833
const Thread::ThreadId thread_id_;
3934
std::atomic<bool> touched_{false};
40-
Event::TimerPtr timer_;
41-
const std::chrono::milliseconds timer_interval_;
4235
};
4336

4437
} // namespace Server

source/server/worker_impl.cc

+2-3
Original file line numberDiff line numberDiff line change
@@ -128,9 +128,8 @@ void WorkerImpl::threadRoutine(GuardDog& guard_dog) {
128128
// The watch dog must be created after the dispatcher starts running and has post events flushed,
129129
// as this is when TLS stat scopes start working.
130130
dispatcher_->post([this, &guard_dog]() {
131-
watch_dog_ =
132-
guard_dog.createWatchDog(api_.threadFactory().currentThreadId(), dispatcher_->name());
133-
watch_dog_->startWatchdog(*dispatcher_);
131+
watch_dog_ = guard_dog.createWatchDog(api_.threadFactory().currentThreadId(),
132+
dispatcher_->name(), *dispatcher_);
134133
});
135134
dispatcher_->run(Event::Dispatcher::RunType::Block);
136135
ENVOY_LOG(debug, "worker exited dispatch loop");

test/common/event/BUILD

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ envoy_cc_test(
1919
"//source/common/event:dispatcher_lib",
2020
"//source/common/stats:isolated_store_lib",
2121
"//test/mocks:common_lib",
22+
"//test/mocks/server:watch_dog_mocks",
2223
"//test/mocks/stats:stats_mocks",
2324
"//test/test_common:simulated_time_system_lib",
2425
"//test/test_common:test_runtime_lib",

0 commit comments

Comments
 (0)