Tolerate Redis errors while waiting for the orchestrator lock.

lib/dynflow/executors/sidekiq/redis_locking.rb
  The SET NX EX call is moved out of wait_for_orchestrator_lock into the new
  try_acquire_orchestrator_lock, which rescues ::Redis::BaseError, logs it at
  error level and returns nil, so a failed attempt does not satisfy the
  `break if active` check in the polling loop.

test/bats/sidekiq-orchestrator.bats
  teardown now uses `kill -9` instead of `kill -15` on every recorded pid, and
  three tests are added that stop and restart Redis under an active
  orchestrator (brief drop, longer drop, fail-over to a second orchestrator).

diff --git a/lib/dynflow/executors/sidekiq/redis_locking.rb b/lib/dynflow/executors/sidekiq/redis_locking.rb
index 5a4e668b..6bda278b 100644
--- a/lib/dynflow/executors/sidekiq/redis_locking.rb
+++ b/lib/dynflow/executors/sidekiq/redis_locking.rb
@@ -41,9 +41,7 @@ def release_orchestrator_lock
         def wait_for_orchestrator_lock
           mode = nil
           loop do
-            active = ::Sidekiq.redis do |conn|
-              conn.set(REDIS_LOCK_KEY, @world.id, :ex => REDIS_LOCK_TTL, :nx => true)
-            end
+            active = try_acquire_orchestrator_lock
             break if active
             if mode.nil?
               mode = :passive
@@ -54,6 +52,15 @@ def wait_for_orchestrator_lock
           @logger.info('Acquired orchestrator lock, entering active mode.')
         end
 
+        def try_acquire_orchestrator_lock
+          ::Sidekiq.redis do |conn|
+            conn.set(REDIS_LOCK_KEY, @world.id, :ex => REDIS_LOCK_TTL, :nx => true)
+          end
+        rescue ::Redis::BaseError => e
+          @logger.error("Could not acquire orchestrator lock: #{e}")
+          nil
+        end
+
         def reacquire_orchestrator_lock
           case ::Sidekiq.redis { |conn| conn.eval REACQUIRE_SCRIPT, [REDIS_LOCK_KEY], [@world.id] }
           when ACQUIRE_MISSING
diff --git a/test/bats/sidekiq-orchestrator.bats b/test/bats/sidekiq-orchestrator.bats
index 13e1d716..b75ae5f3 100644
--- a/test/bats/sidekiq-orchestrator.bats
+++ b/test/bats/sidekiq-orchestrator.bats
@@ -23,7 +23,7 @@ teardown() {
     cd "$TEST_PIDDIR" || return 1
     shopt -s nullglob
     for pidfile in * ; do
-      kill -15 "$(cat "$pidfile")"
+      kill -9 "$(cat "$pidfile")"
     done
   )
   cleanup_containers 1
@@ -116,3 +116,63 @@ teardown() {
   timeout 30 bundle exec ruby examples/remote_executor.rb client 1
   wait_for 1 1 grep -P 'dynflow: ExecutionPlan.*running >>.*stopped' "$(bg_output_file o1)"
 }
+
+@test "active orchestrator can survive a brief redis connection drop" {
+  cd "$(get_project_root)"
+
+  run_background 'o1' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
+  wait_for 30 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o1)"
+
+  run_background 'w1' bundle exec sidekiq -r ./examples/remote_executor.rb -q default
+  wait_for 5 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o1)"
+
+  stop_redis
+  wait_for 30 1 grep 'Error connecting to Redis' "$(bg_output_file o1)"
+  start_redis
+
+  timeout 10 bundle exec ruby examples/remote_executor.rb client 1
+  wait_for 1 1 grep -P 'dynflow: ExecutionPlan.*running >>.*stopped' "$(bg_output_file o1)"
+}
+
+@test "active orchestrator can survive a longer redis connection drop" {
+  cd "$(get_project_root)"
+
+  run_background 'o1' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
+  wait_for 30 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o1)"
+
+  run_background 'w1' bundle exec sidekiq -r ./examples/remote_executor.rb -q default
+  wait_for 5 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o1)"
+
+  stop_redis 1
+  wait_for 30 1 grep 'Error connecting to Redis' "$(bg_output_file o1)"
+  start_redis
+
+  wait_for 30 1 grep 'The orchestrator lock was lost, reacquired' "$(bg_output_file o1)"
+
+  timeout 10 bundle exec ruby examples/remote_executor.rb client 1
+  wait_for 1 1 grep -P 'dynflow: ExecutionPlan.*running >>.*stopped' "$(bg_output_file o1)"
+}
+
+@test "orchestrators can fail over if active one goes away during downtime" {
+  cd "$(get_project_root)"
+
+  run_background 'o1' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
+  wait_for 30 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o1)"
+
+  run_background 'o2' bundle exec sidekiq -r ./examples/remote_executor.rb -q dynflow_orchestrator -c 1
+  wait_for 30 1 grep 'dynflow: Orchestrator lock already taken, entering passive mode.' "$(bg_output_file o2)"
+
+  run_background 'w1' bundle exec sidekiq -r ./examples/remote_executor.rb -q default
+  wait_for 5 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o1)"
+
+  stop_redis 1
+  wait_for 30 1 grep 'Error connecting to Redis' "$(bg_output_file o1)"
+  kill -15 "$(cat "$TEST_PIDDIR/o1.pid")"
+  start_redis
+
+  wait_for 120 1 grep 'dynflow: Acquired orchestrator lock, entering active mode.' "$(bg_output_file o2)"
+  wait_for 120 1 grep 'dynflow: Finished performing validity checks' "$(bg_output_file o2)"
+
+  timeout 10 bundle exec ruby examples/remote_executor.rb client 1
+  wait_for 1 1 grep -P 'dynflow: ExecutionPlan.*running >>.*stopped' "$(bg_output_file o2)"
+}