fix: manual retry blocked by idempotency after first success (#670)

alexluong · web-flow · commit d0fa4ce161a4 · 2026-02-02T21:58:27.000+07:00
* chore: deliverymq log skipped task

* test: duplicate manual retry (failing tests/tdd)

* fix: include DeliveryTask.Nonce for manual retry

* chore: remove unnecessary backward compat handling

* chore: comment explain idempotency key context

* test: redis testcontainer & improve test stability
diff --git a/.env.test b/.env.test
@@ -1,8 +1,6 @@
 # DBs
 TEST_POSTGRES_URL="localhost:35432"
 TEST_CLICKHOUSE_URL="localhost:39000"
-TEST_REDIS_URL="localhost:36379"
-TEST_DRAGONFLY_URL="localhost:37379"
 # MQs
 TEST_RABBITMQ_URL="localhost:35672"
 TEST_LOCALSTACK_URL="localhost:34566"
diff --git a/build/test/compose.yml b/build/test/compose.yml
@@ -8,15 +8,6 @@ services:
       dockerfile: ./build/test/Dockerfile.mock
     ports:
       - 35555:5555
-  redis:
-    image: redis/redis-stack-server:latest
-    ports:
-      - 36379:6379
-  dragonfly:
-    image: docker.dragonflydb.io/dragonflydb/dragonfly
-    command: ["--proactor_threads=1", "--maxmemory=256mb"]
-    ports:
-      - 37379:6379
   clickhouse:
     image: clickhouse/clickhouse-server:24-alpine
     ports:
diff --git a/cmd/e2e/retry_test.go b/cmd/e2e/retry_test.go
@@ -93,6 +93,45 @@ func (s *basicSuite) TestRetry_ManualRetryCreatesNewAttempt() {
 	s.Equal(1, manualCount, "should have exactly one manual retry attempt")
 }
 
+func (s *basicSuite) TestRetry_DuplicateManualRetryExecutesBoth() {
+	tenant := s.createTenant()
+	dest := s.createWebhookDestination(tenant.ID, "*", withSecret(testSecret))
+
+	eventID := idgen.Event()
+	s.publish(tenant.ID, "user.created", map[string]any{
+		"test": "duplicate_manual_retry",
+	}, withEventID(eventID))
+
+	// Wait for initial delivery
+	s.waitForNewAttempts(tenant.ID, 1)
+
+	// First manual retry
+	status := s.retryEvent(eventID, dest.ID)
+	s.Equal(http.StatusAccepted, status)
+	s.waitForNewAttempts(tenant.ID, 2)
+
+	// Second manual retry for same event+destination
+	status = s.retryEvent(eventID, dest.ID)
+	s.Equal(http.StatusAccepted, status)
+	s.waitForNewAttempts(tenant.ID, 3)
+
+	// Verify: 3 attempts total, 2 manual
+	var resp struct {
+		Models []map[string]any `json:"models"`
+	}
+	status = s.doJSON(http.MethodGet, s.apiURL("/attempts?tenant_id="+tenant.ID+"&event_id="+eventID+"&dir=asc"), nil, &resp)
+	s.Require().Equal(http.StatusOK, status)
+	s.Require().Len(resp.Models, 3, "should have 3 attempts: initial + 2 manual retries")
+
+	manualCount := 0
+	for _, atm := range resp.Models {
+		if manual, ok := atm["manual"].(bool); ok && manual {
+			manualCount++
+		}
+	}
+	s.Equal(2, manualCount, "should have exactly 2 manual retry attempts")
+}
+
 func (s *basicSuite) TestRetry_ManualRetryOnDisabledDestinationRejected() {
 	tenant := s.createTenant()
 	dest := s.createWebhookDestination(tenant.ID, "*")
diff --git a/contributing/test.md b/contributing/test.md
@@ -134,7 +134,7 @@ Integration and e2e tests require external services like ClickHouse, LocalStack,
 
 ### Why persistent infrastructure?
 
-Lightweight services like Redis start quickly, but heavier dependencies like LocalStack (AWS) or GCP emulators can take 15-30 seconds to initialize. With persistent infrastructure, you pay this cost once and get fast iteration from then on.
+Redis and Dragonfly always use testcontainers (one container per test) since they start quickly. Heavier dependencies like LocalStack (AWS) or GCP emulators can take 15-30 seconds to initialize. With persistent infrastructure, you pay this cost once and get fast iteration from then on.
 
 To run the test infrastructure:
 
diff --git a/internal/deliverymq/messagehandler.go b/internal/deliverymq/messagehandler.go
@@ -147,9 +147,21 @@ func (h *messageHandler) Handle(ctx context.Context, msg *mqs.Message) error {
 		return h.handleError(msg, &PreDeliveryError{err: err})
 	}
 
-	err = h.idempotence.Exec(ctx, idempotencyKeyFromDeliveryTask(task), func(ctx context.Context) error {
+	executed := false
+	idempotencyKey := idempotencyKeyFromDeliveryTask(task)
+	err = h.idempotence.Exec(ctx, idempotencyKey, func(ctx context.Context) error {
+		executed = true
 		return h.doHandle(ctx, task, destination)
 	})
+	if err == nil && !executed {
+		h.logger.Ctx(ctx).Info("delivery task skipped (idempotent)",
+			zap.String("event_id", task.Event.ID),
+			zap.String("tenant_id", task.Event.TenantID),
+			zap.String("destination_id", task.DestinationID),
+			zap.Int("attempt", task.Attempt),
+			zap.Bool("manual", task.Manual),
+			zap.String("idempotency_key", idempotencyKey))
+	}
 	return h.handleError(msg, err)
 }
 
diff --git a/internal/deliverymq/messagehandler_test.go b/internal/deliverymq/messagehandler_test.go
@@ -1118,6 +1118,66 @@ func assertAlertMonitor(t *testing.T, m *mockAlertMonitor, success bool, destina
 	}
 }
 
+func TestManualDelivery_DuplicateRetry(t *testing.T) {
+	// Test scenario:
+	// - First manual retry for event+destination succeeds
+	// - Second manual retry for same event+destination is requested
+	// - Second retry should also execute (not be blocked by idempotency)
+	//
+	// Manual retries are explicit user actions and should always execute,
+	// even if a previous manual retry for the same event+destination already succeeded.
+
+	// Setup test data
+	tenant := models.Tenant{ID: idgen.String()}
+	destination := testutil.DestinationFactory.Any(
+		testutil.DestinationFactory.WithTenantID(tenant.ID),
+	)
+	event := testutil.EventFactory.Any(
+		testutil.EventFactory.WithTenantID(tenant.ID),
+		testutil.EventFactory.WithDestinationID(destination.ID),
+	)
+
+	// Setup mocks
+	destGetter := &mockDestinationGetter{dest: &destination}
+	retryScheduler := newMockRetryScheduler()
+	publisher := newMockPublisher([]error{nil, nil}) // Both succeed
+	logPublisher := newMockLogPublisher(nil)
+	alertMonitor := newMockAlertMonitor()
+
+	// Setup message handler with Redis for idempotency
+	redis := testutil.CreateTestRedisClient(t)
+	handler := deliverymq.NewMessageHandler(
+		testutil.CreateTestLogger(t),
+		logPublisher,
+		destGetter,
+		publisher,
+		testutil.NewMockEventTracer(nil),
+		retryScheduler,
+		&backoff.ConstantBackoff{Interval: 1 * time.Second},
+		10,
+		alertMonitor,
+		idempotence.New(redis, idempotence.WithSuccessfulTTL(24*time.Hour)),
+	)
+
+	// Step 1: First manual retry succeeds
+	task1 := models.NewManualDeliveryTask(event, destination.ID)
+	mockMsg1, msg1 := newDeliveryMockMessage(task1)
+	err := handler.Handle(context.Background(), msg1)
+	require.NoError(t, err)
+	assert.True(t, mockMsg1.acked, "first manual retry should be acked")
+	assert.Equal(t, 1, publisher.current, "first manual retry should publish")
+	require.Len(t, logPublisher.entries, 1, "first manual retry should log delivery")
+
+	// Step 2: Second manual retry for same event+destination should also execute
+	task2 := models.NewManualDeliveryTask(event, destination.ID)
+	mockMsg2, msg2 := newDeliveryMockMessage(task2)
+	err = handler.Handle(context.Background(), msg2)
+	require.NoError(t, err)
+	assert.True(t, mockMsg2.acked, "second manual retry should be acked")
+	assert.Equal(t, 2, publisher.current, "second manual retry should also publish")
+	require.Len(t, logPublisher.entries, 2, "second manual retry should also log delivery")
+}
+
 func TestMessageHandler_RetryID_MultipleDestinations(t *testing.T) {
 	// Test scenario:
 	// - One event is delivered to TWO different destinations
diff --git a/internal/models/tasks.go b/internal/models/tasks.go
@@ -3,6 +3,7 @@ package models
 import (
 	"encoding/json"
 
+	"github.com/hookdeck/outpost/internal/idgen"
 	"github.com/hookdeck/outpost/internal/mqs"
 )
 
@@ -38,6 +39,7 @@ type DeliveryTask struct {
 	DestinationID string             `json:"destination_id"`
 	Attempt       int                `json:"attempt"`
 	Manual        bool               `json:"manual"`
+	Nonce         string             `json:"nonce,omitempty"`
 	Telemetry     *DeliveryTelemetry `json:"telemetry,omitempty"`
 }
 
@@ -56,12 +58,16 @@ func (t *DeliveryTask) ToMessage() (*mqs.Message, error) {
 }
 
 // IdempotencyKey returns the key used for idempotency checks.
-// Uses Event.ID + DestinationID + Manual flag.
-// Manual retries get a different key so they can bypass idempotency of failed automatic deliveries.
+// Manual retries include a nonce so each /retry request gets its own idempotency key,
+// while MQ redeliveries of the same message (same nonce) are still deduplicated.
+// Nonce was added to fix a regression from #653 where removing DeliveryEvent.ID
+// made the manual retry idempotency key static per event+destination.
 func (t *DeliveryTask) IdempotencyKey() string {
 	if t.Manual {
-		return t.Event.ID + ":" + t.DestinationID + ":manual"
+		return t.Event.ID + ":" + t.DestinationID + ":manual:" + t.Nonce
 	}
+	// Non-manual deliveries share a key per event+destination. On failure, the
+	// idempotency key is cleared so the scheduled retry can execute with the same key.
 	return t.Event.ID + ":" + t.DestinationID
 }
 
@@ -81,9 +87,11 @@ func NewDeliveryTask(event Event, destinationID string) DeliveryTask {
 }
 
 // NewManualDeliveryTask creates a new DeliveryTask for a manual retry.
+// Each manual retry gets a unique nonce so separate /retry requests are not deduplicated.
 func NewManualDeliveryTask(event Event, destinationID string) DeliveryTask {
 	task := NewDeliveryTask(event, destinationID)
 	task.Manual = true
+	task.Nonce = idgen.String()
 	return task
 }
 
diff --git a/internal/rsmq/rsmq_test.go b/internal/rsmq/rsmq_test.go
@@ -15,6 +15,7 @@ import (
 // RSMQSuite runs RSMQ tests against different backends.
 type RSMQSuite struct {
 	suite.Suite
+	cfg    *redis.RedisConfig
 	client RedisClient
 	rsmq   *RedisSMQ
 }
@@ -30,10 +31,21 @@ func TestRedisRSMQSuite(t *testing.T) {
 
 func TestDragonflyRSMQSuite(t *testing.T) { suite.Run(t, new(DragonflyRSMQSuite)) }
 
-func (s *RedisRSMQSuite) SetupTest() {
+func (s *RedisRSMQSuite) SetupSuite() {
 	testinfra.Start(s.T())
-	cfg := testinfra.NewRedisConfig(s.T())
-	client, err := redis.New(context.Background(), cfg)
+	s.cfg = testinfra.NewRedisConfig(s.T())
+}
+
+func (s *RedisRSMQSuite) SetupTest() {
+	// Flush the container's DB 0 before each test method for a clean state.
+	flushClient, err := redis.New(context.Background(), s.cfg)
+	if err != nil {
+		s.T().Fatalf("failed to create redis client for flush: %v", err)
+	}
+	flushClient.FlushDB(context.Background())
+	flushClient.Close()
+
+	client, err := redis.New(context.Background(), s.cfg)
 	if err != nil {
 		s.T().Fatalf("failed to create redis client: %v", err)
 	}
@@ -42,10 +54,21 @@ func (s *RedisRSMQSuite) SetupTest() {
 	s.rsmq = NewRedisSMQ(s.client, "test")
 }
 
-func (s *DragonflyRSMQSuite) SetupTest() {
+func (s *DragonflyRSMQSuite) SetupSuite() {
 	testinfra.Start(s.T())
-	cfg := testinfra.NewDragonflyConfig(s.T())
-	client, err := redis.New(context.Background(), cfg)
+	s.cfg = testinfra.NewDragonflyConfig(s.T())
+}
+
+func (s *DragonflyRSMQSuite) SetupTest() {
+	// Flush the container's DB 0 before each test method for a clean state.
+	flushClient, err := redis.New(context.Background(), s.cfg)
+	if err != nil {
+		s.T().Fatalf("failed to create redis client for flush: %v", err)
+	}
+	flushClient.FlushDB(context.Background())
+	flushClient.Close()
+
+	client, err := redis.New(context.Background(), s.cfg)
 	if err != nil {
 		s.T().Fatalf("failed to create redis client: %v", err)
 	}
diff --git a/internal/tenantstore/redistenantstore/redistenantstore_test.go b/internal/tenantstore/redistenantstore/redistenantstore_test.go
diff --git a/internal/util/testinfra/redis.go b/internal/util/testinfra/redis.go
diff --git a/internal/util/testinfra/testinfra.go b/internal/util/testinfra/testinfra.go

Original file line number	Diff line number	Diff line change
`@@ -3,6 +3,7 @@ package models`
`3`	`3`	`import (`
`4`	`4`	`"encoding/json"`
`5`	`5`
	`6`	`+ "github.com/hookdeck/outpost/internal/idgen"`
`6`	`7`	`"github.com/hookdeck/outpost/internal/mqs"`
`7`	`8`	`)`
`8`	`9`
`@@ -38,6 +39,7 @@ type DeliveryTask struct {`
`38`	`39`	DestinationID string `json:"destination_id"`
`39`	`40`	Attempt int `json:"attempt"`
`40`	`41`	Manual bool `json:"manual"`
	`42`	+ Nonce string `json:"nonce,omitempty"`
`41`	`43`	Telemetry *DeliveryTelemetry `json:"telemetry,omitempty"`
`42`	`44`	`}`
`43`	`45`
`@@ -56,12 +58,16 @@ func (t *DeliveryTask) ToMessage() (*mqs.Message, error) {`
`56`	`58`	`}`
`57`	`59`
`58`	`60`	`// IdempotencyKey returns the key used for idempotency checks.`
`59`		`-// Uses Event.ID + DestinationID + Manual flag.`
`60`		`-// Manual retries get a different key so they can bypass idempotency of failed automatic deliveries.`
	`61`	`+// Manual retries include a nonce so each /retry request gets its own idempotency key,`
	`62`	`+// while MQ redeliveries of the same message (same nonce) are still deduplicated.`
	`63`	`+// Nonce was added to fix a regression from #653 where removing DeliveryEvent.ID`
	`64`	`+// made the manual retry idempotency key static per event+destination.`
`61`	`65`	`func (t *DeliveryTask) IdempotencyKey() string {`
`62`	`66`	`if t.Manual {`
`63`		`- return t.Event.ID + ":" + t.DestinationID + ":manual"`
	`67`	`+ return t.Event.ID + ":" + t.DestinationID + ":manual:" + t.Nonce`
`64`	`68`	`}`
	`69`	`+ // Non-manual deliveries share a key per event+destination. On failure, the`
	`70`	`+ // idempotency key is cleared so the scheduled retry can execute with the same key.`
`65`	`71`	`return t.Event.ID + ":" + t.DestinationID`
`66`	`72`	`}`
`67`	`73`
`@@ -81,9 +87,11 @@ func NewDeliveryTask(event Event, destinationID string) DeliveryTask {`
`81`	`87`	`}`
`82`	`88`
`83`	`89`	`// NewManualDeliveryTask creates a new DeliveryTask for a manual retry.`
	`90`	`+// Each manual retry gets a unique nonce so separate /retry requests are not deduplicated.`
`84`	`91`	`func NewManualDeliveryTask(event Event, destinationID string) DeliveryTask {`
`85`	`92`	`task := NewDeliveryTask(event, destinationID)`
`86`	`93`	`task.Manual = true`
	`94`	`+ task.Nonce = idgen.String()`
`87`	`95`	`return task`
`88`	`96`	`}`
`89`	`97`