
Commit 4c75164

fix(damgr): failover altda->ethda should keep finalizing l2 chain (#23)
* test(altda): add a test to make sure the altda node keeps finalizing even after failover to ethda. Currently it does not, as shown by the failing test TestAltDA_FinalizationAfterEthDAFailover.
* fix(damgr): ethda failover finalization stall bug. Weiwei from Polymer found this bug and proposed a solution. This is an alternative solution which seems simpler, but I'm not 100% sure of its soundness.
* fix: damgr_test doesn't compile
* ci: add op-alt-da tests to ci
* chore: add more logs to damgr and altda_data_source
* docs(altda_test): fix typo
* test(kt-devnet): add node-keeps-finalizing check to the kurtosis test TestFailoverToEthDACalldata. This makes the test take 4-7 minutes now, which is still acceptable.
* style: rename log -> logger
* docs(kt-tests): document the EnclaveServiceClients
* style: breakup long log line
1 parent 8fdeeb1 commit 4c75164
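
The substance of the fix is in op-alt-da/dastate.go and op-alt-da/damgr.go below: State.Prune now returns the inclusion block of the last pruned commitment instead of storing it on State, and the DA manager only advances its finalized head from that value when a commitment was actually pruned. A rough consolidated sketch of the resulting updateFinalizedHead, assembled from the hunks below (debug logging omitted, not a verbatim copy):

func (d *DA) updateFinalizedHead(l1Finalized eth.L1BlockRef) {
    d.l1FinalizedHead = l1Finalized
    // Prune the state to the finalized head. Prune now returns the inclusion
    // block of the last commitment it removed (zero value if nothing was pruned).
    lastPrunedCommIncBlock := d.state.Prune(l1Finalized.ID())
    // Only move the finalized head when a commitment was actually pruned.
    // After a failover to ethda there are no altda commitments left to prune,
    // so updateFinalizedFromL1 (which runs when d.state.NoCommitments() is true)
    // becomes the main driver of the finalized head instead of it stalling here.
    if lastPrunedCommIncBlock != (eth.L1BlockRef{}) {
        d.finalizedHead = lastPrunedCommIncBlock
    }
}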

File tree

10 files changed: +228, -63 lines


.github/workflows/test-golang.yml (+1)
@@ -50,6 +50,7 @@ jobs:
       packages:
         - op-batcher
         - op-node
+        - op-alt-da
         - op-e2e/system/altda
         - op-e2e/actions/altda
     steps:

kurtosis-devnet/tests/eigenda/failover_test.go (+45, -18)
@@ -4,6 +4,7 @@ import (
     "context"
     "encoding/json"
     "fmt"
+    "log/slog"
     "math/big"
     "net/http"
     "reflect"
@@ -14,9 +15,12 @@ import (

     "github.com/Layr-Labs/eigenda-proxy/clients/memconfig_client"
     "github.com/ethereum-optimism/optimism/op-e2e/e2eutils/geth"
+    "github.com/ethereum-optimism/optimism/op-service/dial"
+    "github.com/ethereum-optimism/optimism/op-service/sources"
+    "github.com/ethereum-optimism/optimism/op-service/testlog"
     "github.com/ethereum/go-ethereum/common"
     "github.com/ethereum/go-ethereum/ethclient"
-    "github.com/ethereum/go-ethereum/rpc"
+    "github.com/ethereum/go-ethereum/log"
     "github.com/kurtosis-tech/kurtosis/api/golang/core/lib/enclaves"
     "github.com/kurtosis-tech/kurtosis/api/golang/engine/lib/kurtosis_context"
     "github.com/stretchr/testify/require"
@@ -35,20 +39,18 @@ const enclaveName = "eigenda-memstore-devnet"
 //
 // Note: because this test relies on modifying the proxy's memstore config, it should be run in isolation.
 // That is, if we ever implement more kurtosis tests, they would currently need to be run sequentially.
-//
-// TODO: We will also need to test the failover behavior of the node, which currently doesn't finalize after failover (fixed in https://github.com/Layr-Labs/optimism/pull/23)
 func TestFailoverToEthDACalldata(t *testing.T) {
     deadline, ok := t.Deadline()
     if !ok {
-        deadline = time.Now().Add(1 * time.Minute)
+        deadline = time.Now().Add(10 * time.Minute)
     }
     ctxWithDeadline, cancel := context.WithDeadline(context.Background(), deadline)
     defer cancel()

     harness := newHarness(t)
     t.Cleanup(func() {
         // switch proxy back to normal mode, in case test gets cancelled
-        ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+        ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
         defer cancel()
         err := harness.clients.proxyMemconfigClient.Failback(ctx)
         if err != nil {
@@ -75,7 +77,7 @@ func TestFailoverToEthDACalldata(t *testing.T) {
     harness.requireBatcherTxsToBeFromLayer(t, fromBlock, fromBlock+l1BlocksQueriedForBatcherTxs, DALayerEigenDA)

     // 2. Failover and check that the commitments are now EthDACalldata
-    t.Logf("Failover over... changing proxy's config to return 503 errors")
+    t.Logf("Failing over... changing proxy's config to return 503 errors")
     err := harness.clients.proxyMemconfigClient.Failover(ctxWithDeadline)
     require.NoError(t, err)

@@ -87,6 +89,16 @@ func TestFailoverToEthDACalldata(t *testing.T) {

     harness.requireBatcherTxsToBeFromLayer(t, afterFailoverFromBlockNum, afterFailoverToBlockNum, DALayerEthCalldata)

+    // We also check that the op-node is still finalizing blocks after the failover
+    syncStatus, err := harness.clients.opNodeClient.SyncStatus(ctxWithDeadline)
+    require.NoError(t, err)
+    afterFailoverFinalizedL2 := syncStatus.FinalizedL2
+    t.Logf("Current finalized L2 block: %d. Waiting for next block to finalize to make sure finalization is still happening.", afterFailoverFinalizedL2.Number)
+    // On average would expect this to take half an epoch, aka 16 L1 blocks, which at 6 sec/block means 1.5 minutes.
+    // This generally takes longer (3-6 minutes), but I'm not quite sure why.
+    _, err = geth.WaitForBlockToBeFinalized(new(big.Int).SetUint64(afterFailoverFinalizedL2.Number+1), harness.clients.opGethClient, 6*time.Minute)
+    require.NoError(t, err, "op-node should still be finalizing blocks after failover")
+
     // 3. Failback and check that the commitments are EigenDA again
     t.Logf("Failing back... changing proxy's config to start processing PUT requests normally again")
     err = harness.clients.proxyMemconfigClient.Failback(ctxWithDeadline)
@@ -105,13 +117,16 @@ func TestFailoverToEthDACalldata(t *testing.T) {
 // Test Harness, which contains all the state needed to run the tests.
 // harness also defines some higher-level "require" methods that are used in the tests.
 type harness struct {
+    logger              log.Logger
     endpoints           *EnclaveServicePublicEndpoints
     clients             *EnclaveServiceClients
     batchInboxAddr      common.Address
     testStartL1BlockNum uint64
 }

 func newHarness(t *testing.T) *harness {
+    logger := testlog.Logger(t, slog.LevelInfo)
+
     // We leave 20 seconds to build the entire testHarness.
     ctxWithTimeout, cancel := context.WithTimeout(context.Background(), 20*time.Second)
     defer cancel()
@@ -128,24 +143,22 @@ func newHarness(t *testing.T) *harness {
     require.NoError(t, err)
     t.Logf("Endpoints: %+v", endpoints)

-    clients, err := getClientsFromEndpoints(endpoints)
+    clients, err := getClientsFromEndpoints(ctxWithTimeout, logger, endpoints)
     require.NoError(t, err)

-    // Get the batch inbox address
-    var rollupConfigMap struct {
-        BatchInboxAddress string `json:"batch_inbox_address"`
-    }
-    err = clients.opNodeClient.CallContext(ctxWithTimeout, &rollupConfigMap, "optimism_rollupConfig")
+    // Get the batch inbox address from the rollup config
+    rollupConfig, err := clients.opNodeClient.RollupConfig(ctxWithTimeout)
     require.NoError(t, err)

     // Get the current L1 block number
     testStartL1BlockNum, err := clients.gethL1Client.BlockNumber(ctxWithTimeout)
     require.NoError(t, err)

     return &harness{
+        logger:              logger,
         endpoints:           endpoints,
         clients:             clients,
-        batchInboxAddr:      common.HexToAddress(rollupConfigMap.BatchInboxAddress),
+        batchInboxAddr:      rollupConfig.BatchInboxAddress,
         testStartL1BlockNum: testStartL1BlockNum,
     }
 }
@@ -315,6 +328,7 @@ func fetchBatcherTxs(gethL1Endpoint string, batchInbox string, fromBlockNum, toB
 // The public endpoints are the ones that are exposed to the host machine.
 type EnclaveServicePublicEndpoints struct {
     OpNodeEndpoint       string `kurtosis:"op-cl-1-op-node-op-geth-op-kurtosis,http"`
+    OpGethEndpoint       string `kurtosis:"op-el-1-op-geth-op-node-op-kurtosis,rpc"`
     GethL1Endpoint       string `kurtosis:"el-1-geth-teku,rpc"`
     EigendaProxyEndpoint string `kurtosis:"da-server-op-kurtosis,http"`
     // Adding new endpoints is as simple as adding a new field with a kurtosis tag
@@ -374,17 +388,29 @@ func getPublicEndpointsFromKurtosis(enclaveCtx *enclaves.EnclaveContext) (*Encla
 }

 type EnclaveServiceClients struct {
-    opNodeClient         *rpc.Client
-    gethL1Client         *ethclient.Client
+    // opNode and opGeth are the L2 clients for the rollup.
+    opNodeClient *sources.RollupClient
+    // opGeth is the client for the L2 execution layer client.
+    opGethClient *ethclient.Client
+    // gethL1 is the client for the L1 chain execution layer client.
+    gethL1Client *ethclient.Client
+    // proxyMemconfigClient is the client for the eigenda-proxy's memstore config API.
+    // It allows us to toggle the proxy's failover behavior.
     proxyMemconfigClient *ProxyMemconfigClient
 }

-func getClientsFromEndpoints(endpoints *EnclaveServicePublicEndpoints) (*EnclaveServiceClients, error) {
-    opNodeClient, err := rpc.Dial(endpoints.OpNodeEndpoint)
+func getClientsFromEndpoints(ctx context.Context, logger log.Logger, endpoints *EnclaveServicePublicEndpoints) (*EnclaveServiceClients, error) {
+    opNodeClient, err := dial.DialRollupClientWithTimeout(ctx, 10*time.Second, logger, endpoints.OpNodeEndpoint)
+    if err != nil {
+        return nil, fmt.Errorf("dial.DialRollupClientWithTimeout: %w", err)
+    }
+
+    opGethClient, err := dial.DialEthClientWithTimeout(ctx, 10*time.Second, logger, endpoints.OpGethEndpoint)
     if err != nil {
-        return nil, fmt.Errorf("rpc.Dial: %w", err)
+        return nil, fmt.Errorf("dial.DialEthClientWithTimeout: %w", err)
     }

+    // TODO: prob also change to use dial.DialEthClient?
     gethL1Client, err := ethclient.Dial(endpoints.GethL1Endpoint)
     if err != nil {
         return nil, fmt.Errorf("ethclient.Dial: %w", err)
@@ -396,6 +422,7 @@ func getClientsFromEndpoints(endpoints *EnclaveServicePublicEndpoints) (*Enclave

     return &EnclaveServiceClients{
         opNodeClient:         opNodeClient,
+        opGethClient:         opGethClient,
         gethL1Client:         gethL1Client,
         proxyMemconfigClient: proxyMemconfigClient,
     }, nil
op-alt-da/damgr.go (+14, -2)
@@ -117,8 +117,18 @@ func (d *DA) OnFinalizedHeadSignal(f HeadSignalFn) {
 func (d *DA) updateFinalizedHead(l1Finalized eth.L1BlockRef) {
     d.l1FinalizedHead = l1Finalized
     // Prune the state to the finalized head
-    d.state.Prune(l1Finalized.ID())
-    d.finalizedHead = d.state.lastPrunedCommitment
+    lastPrunedCommIncBlock := d.state.Prune(l1Finalized.ID())
+    d.log.Debug("updateFinalizedHead",
+        "currFinalizedHead", d.finalizedHead.Number,
+        "lastPrunedCommIncBlock", lastPrunedCommIncBlock.Number,
+        "l1Finalized", l1Finalized.Number)
+    // If a commitment was pruned, set the finalized head to that commitment's inclusion block
+    // When no commitments are left to be pruned (one example is if we have failed over to ethda)
+    // then updateFinalizedFromL1 becomes the main driver of the finalized head.
+    // Note that updateFinalizedFromL1 is only called when d.state.NoCommitments() is true.
+    if lastPrunedCommIncBlock != (eth.L1BlockRef{}) {
+        d.finalizedHead = lastPrunedCommIncBlock
+    }
 }

 // updateFinalizedFromL1 updates the finalized head based on the challenge window.
@@ -133,6 +143,7 @@ func (d *DA) updateFinalizedFromL1(ctx context.Context, l1 L1Fetcher) error {
     if err != nil {
         return err
     }
+    d.log.Debug("updateFinalizedFromL1", "currFinalizedHead", d.finalizedHead.Number, "newFinalizedHead", ref.Number, "l1FinalizedHead", d.l1FinalizedHead.Number, "challengeWindow", d.cfg.ChallengeWindow)
     d.finalizedHead = ref
     return nil
 }
@@ -413,6 +424,7 @@ func (d *DA) fetchChallengeLogs(ctx context.Context, l1 L1Fetcher, block eth.Blo
     }
     for _, log := range rec.Logs {
         if log.Address == d.cfg.DAChallengeContractAddress && len(log.Topics) > 0 && log.Topics[0] == ChallengeStatusEventABIHash {
+            d.log.Info("found challenge event", "block", block.Number, "log", log.Index)
             logs = append(logs, log)
         }
     }

op-alt-da/damgr_test.go (+14, -14)
@@ -53,12 +53,12 @@ func TestFinalization(t *testing.T) {
     require.NoError(t, state.ExpireCommitments(bID(8)))
     require.Empty(t, state.commitments)

-    state.Prune(bID(bn1))
-    require.Equal(t, eth.L1BlockRef{}, state.lastPrunedCommitment)
-    state.Prune(bID(7))
-    require.Equal(t, eth.L1BlockRef{}, state.lastPrunedCommitment)
-    state.Prune(bID(8))
-    require.Equal(t, l1Ref(bn1), state.lastPrunedCommitment)
+    lastPrunedCommitment := state.Prune(bID(bn1))
+    require.Equal(t, eth.L1BlockRef{}, lastPrunedCommitment)
+    lastPrunedCommitment = state.Prune(bID(7))
+    require.Equal(t, eth.L1BlockRef{}, lastPrunedCommitment)
+    lastPrunedCommitment = state.Prune(bID(8))
+    require.Equal(t, l1Ref(bn1), lastPrunedCommitment)

     // Track a commitment, challenge it, & then resolve it
     c2 := RandomCommitment(rng)
@@ -83,12 +83,12 @@ func TestFinalization(t *testing.T) {
     require.Empty(t, state.challenges)

     // Now finalize everything
-    state.Prune(bID(20))
-    require.Equal(t, l1Ref(bn1), state.lastPrunedCommitment)
-    state.Prune(bID(28))
-    require.Equal(t, l1Ref(bn1), state.lastPrunedCommitment)
-    state.Prune(bID(32))
-    require.Equal(t, l1Ref(bn2), state.lastPrunedCommitment)
+    lastPrunedCommitment = state.Prune(bID(20))
+    require.Equal(t, eth.L1BlockRef{}, lastPrunedCommitment)
+    lastPrunedCommitment = state.Prune(bID(28))
+    require.Equal(t, eth.L1BlockRef{}, lastPrunedCommitment)
+    lastPrunedCommitment = state.Prune(bID(32))
+    require.Equal(t, l1Ref(bn2), lastPrunedCommitment)
 }

 // TestExpireChallenges expires challenges and prunes the state for longer windows
@@ -175,8 +175,8 @@ func TestDAChallengeDetached(t *testing.T) {
     require.ErrorIs(t, err, ErrReorgRequired)

     // pruning finalized block is safe. It should not prune any commitments yet.
-    state.Prune(bID(1))
-    require.Equal(t, eth.L1BlockRef{}, state.lastPrunedCommitment)
+    lastPrunedCommitment := state.Prune(bID(1))
+    require.Equal(t, eth.L1BlockRef{}, lastPrunedCommitment)

     // Perform reorg back to bn2
     state.ClearCommitments()

op-alt-da/damock.go (+6)
@@ -48,6 +48,8 @@ func (c *MockDAClient) DeleteData(key []byte) error {
     return c.store.Delete(key)
 }

+// DAErrFaker is a DA client that can be configured to return errors on GetInput
+// and SetInput calls.
 type DAErrFaker struct {
     Client *MockDAClient

@@ -109,6 +111,10 @@ func (d *AltDADisabled) AdvanceL1Origin(ctx context.Context, l1 L1Fetcher, block
 // - request latencies, to mimic a DA service with slow responses
 //   (eg. eigenDA with 10 min batching interval).
 // - response status codes, to mimic a DA service that is down.
+//
+// We use this FakeDaServer as opposed to the DAErrFaker client in the op-e2e altda system tests
+// because the batcher service only has a constructor to build from CLI flags (no dependency injection),
+// meaning the da client is built from an rpc url config instead of being injected.
 type FakeDAServer struct {
     *DAServer
     putRequestLatency time.Duration
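
The comment added above explains why failover has to be simulated at the server level in the op-e2e altda system tests: the batcher builds its DA client from an RPC URL, so there is nothing to inject. As a generic, hypothetical illustration of that technique (not the actual FakeDAServer code; the route and names below are made up), an HTTP DA server can simply be toggled to answer PUT requests with 503s:

package fakeda_sketch

import (
    "net/http"
    "net/http/httptest"
    "sync/atomic"
)

// newToggleableDAServer returns a test server and a flag; setting the flag makes
// the PUT route return 503, which is what pushes an altda batcher to fail over.
func newToggleableDAServer() (*httptest.Server, *atomic.Bool) {
    failPuts := new(atomic.Bool)
    mux := http.NewServeMux()
    mux.HandleFunc("/put/", func(w http.ResponseWriter, r *http.Request) {
        if failPuts.Load() {
            // Mimic a DA service that is down.
            http.Error(w, "service unavailable", http.StatusServiceUnavailable)
            return
        }
        // A real server would store the payload and return a commitment here.
        w.WriteHeader(http.StatusOK)
    })
    return httptest.NewServer(mux), failPuts
}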

op-alt-da/dastate.go (+16, -13)
@@ -52,15 +52,14 @@ func challengeKey(comm CommitmentData, inclusionBlockNumber uint64) string {
 // In the special case of a L2 reorg, challenges are still tracked but commitments are removed.
 // This will allow the altDA fetcher to find the expired challenge.
 type State struct {
-    commitments          []Commitment          // commitments where the challenge/resolve period has not expired yet
-    expiredCommitments   []Commitment          // commitments where the challenge/resolve period has expired but not finalized
-    challenges           []*Challenge          // challenges ordered by L1 inclusion
-    expiredChallenges    []*Challenge          // challenges ordered by L1 inclusion
-    challengesMap        map[string]*Challenge // challenges by serialized comm + block number for easy lookup
-    lastPrunedCommitment eth.L1BlockRef        // the last commitment to be pruned
-    cfg                  Config
-    log                  log.Logger
-    metrics              Metricer
+    commitments        []Commitment          // commitments where the challenge/resolve period has not expired yet
+    expiredCommitments []Commitment          // commitments where the challenge/resolve period has expired but not finalized
+    challenges         []*Challenge          // challenges ordered by L1 inclusion
+    expiredChallenges  []*Challenge          // challenges ordered by L1 inclusion
+    challengesMap      map[string]*Challenge // challenges by serialized comm + block number for easy lookup
+    cfg                Config
+    log                log.Logger
+    metrics            Metricer
 }

 func NewState(log log.Logger, m Metricer, cfg Config) *State {
@@ -207,15 +206,18 @@ func (s *State) ExpireChallenges(origin eth.BlockID) {
 }

 // Prune removes challenges & commitments which have an expiry block number beyond the given block number.
-func (s *State) Prune(origin eth.BlockID) {
+// It returns the last pruned commitment's inclusion block number, or eth.L1BlockRef{} if no commitments were pruned.
+func (s *State) Prune(origin eth.BlockID) eth.L1BlockRef {
     // Commitments rely on challenges, so we prune commitments first.
-    s.pruneCommitments(origin)
+    lastPrunedCommIncBlock := s.pruneCommitments(origin)
     s.pruneChallenges(origin)
+    return lastPrunedCommIncBlock
 }

 // pruneCommitments removes commitments which have are beyond a given block number.
 // It will remove commitments in order of inclusion until it finds a commitment which is not beyond the given block number.
-func (s *State) pruneCommitments(origin eth.BlockID) {
+func (s *State) pruneCommitments(origin eth.BlockID) eth.L1BlockRef {
+    var lastPrunedCommIncBlock eth.L1BlockRef
     for len(s.expiredCommitments) > 0 {
         c := s.expiredCommitments[0]
         challenge, ok := s.GetChallenge(c.data, c.inclusionBlock.Number)
@@ -236,8 +238,9 @@ func (s *State) pruneCommitments(origin eth.BlockID) {
         s.expiredCommitments = s.expiredCommitments[1:]

         // Record the latest inclusion block to be returned
-        s.lastPrunedCommitment = c.inclusionBlock
+        lastPrunedCommIncBlock = c.inclusionBlock
     }
+    return lastPrunedCommIncBlock
 }

 // pruneChallenges removes challenges which have are beyond a given block number.
