@@ -2,6 +2,7 @@ package main
22
33import (
44 "context"
5+ "errors"
56 "fmt"
67 "io"
78 "os"
@@ -16,22 +17,28 @@ import (
1617 "github.com/urfave/cli/v2"
1718
1819 "k8s.io/apimachinery/pkg/util/sets"
20+ "k8s.io/apimachinery/pkg/util/wait"
1921 "k8s.io/client-go/tools/leaderelection"
2022 "k8s.io/client-go/tools/leaderelection/resourcelock"
2123 "k8s.io/client-go/tools/record"
2224 "k8s.io/klog/v2"
2325 kexec "k8s.io/utils/exec"
2426
2527 "github.com/ovn-org/libovsdb/client"
28+ "github.com/ovn-org/libovsdb/model"
29+ "github.com/ovn-org/libovsdb/ovsdb"
2630
2731 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/clustermanager"
2832 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/config"
2933 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/controllermanager"
3034 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/factory"
3135 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb"
36+ libovsdbops "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/libovsdb/ops"
3237 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/metrics"
38+ "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/nbdb"
3339 ovnnode "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node"
3440 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/node/routemanager"
41+ "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/sbdb"
3542 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/types"
3643 "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util"
3744 utilerrors "github.com/ovn-org/ovn-kubernetes/go-controller/pkg/util/errors"
@@ -149,6 +156,77 @@ func delPidfile(pidfile string) {
149156 }
150157}
151158
159+ // ovnkubeControllerSyncPostBootFilePath is located in storage that does not persist following a reboot. ovn-controller start
160+ // is predicated on the existence of this file.
161+ const ovnkubeControllerSyncPostBootFilePath = "/tmp/ovnkube-controller-sync-post-boot"
162+
163+ func ensureOVNKubeControllerSyncPostBootFile () error {
164+ if err := os .WriteFile (ovnkubeControllerSyncPostBootFilePath , []byte (time .Now ().String ()), 0o644 ); err != nil {
165+ return fmt .Errorf ("failed to write to file at path %q: %v" , ovnkubeControllerSyncPostBootFilePath , err )
166+ }
167+ return nil
168+ }
169+
170+ // doesOVNKubeControllerSyncPostBootFileExist checks if a sentinel file exists. File will not exist post Node restart. OVNKube controller creates the file post sync and after a reboot.
171+ func doesOVNKubeControllerSyncPostBootFileExist () (bool , error ) {
172+ exists := true
173+ _ , err := os .Stat (ovnkubeControllerSyncPostBootFilePath )
174+ if err != nil {
175+ if ! os .IsNotExist (err ) {
176+ return exists , fmt .Errorf ("failed to stat file at path %q: %w" , ovnkubeControllerSyncPostBootFilePath , err )
177+ }
178+ exists = false
179+ }
180+ return exists , nil
181+ }
182+
183+ // waitUntilNorthdSyncOnce ensures northd has sync'd at least once by increments nb_cfg value in NB DB and waiting
184+ // for northd to copy it to SB DB. Poll SB DB until context is cancelled.
185+ // The expectation is that the data you wish to be sync'd to SB DB has already been written to NB DB so when we get the initial
186+ // nb_cfg value, we know that if we increment that by one and see that value or greater in SB DB, then the data has sync'd.
187+ // All other processes interacting with nb_cfg increment it. This function depends on other processes respecting that.
188+ // No guarantee of any changes in SB DB made after this func.
189+ func waitUntilNorthdSyncOnce (ctx context.Context , nbClient , sbClient client.Client ) error {
190+ // 1. Get value of nb_cfg
191+ // 2. Increment value of nb_cfg
192+ // 3. Wait until value appears in SB DB after northd copies it.
193+ nbGlobal := & nbdb.NBGlobal {}
194+ nbGlobal , err := libovsdbops .GetNBGlobal (nbClient , nbGlobal )
195+ if err != nil {
196+ return fmt .Errorf ("failed to find OVN Northbound NB_Global table" +
197+ " entry: %w" , err )
198+ }
199+ // increment nb_cfg value by 1. When northd consumes updates from NB DB, it will copy this value to SB DBs SB_Global table nb_cfg field.
200+ ops , err := nbClient .Where (nbGlobal ).Mutate (nbGlobal , model.Mutation {
201+ Field : & nbGlobal .NbCfg ,
202+ Mutator : ovsdb .MutateOperationAdd ,
203+ Value : 1 ,
204+ })
205+ if err != nil {
206+ return fmt .Errorf ("failed to generate ops to mutate nb_cfg: %w" , err )
207+ }
208+ expectedNbCfgValue := nbGlobal .NbCfg + 1
209+ if _ , err = libovsdbops .TransactAndCheck (nbClient , ops ); err != nil {
210+ return fmt .Errorf ("failed to transact to increment nb_cfg: %w" , err )
211+ }
212+ sbGlobal := & sbdb.SBGlobal {}
213+ // poll until we see the expected value in SB DB every 5 milliseconds until context is cancelled.
214+ err = wait .PollUntilContextCancel (ctx , time .Millisecond * 5 , true , func (_ context.Context ) (done bool , err error ) {
215+ if sbGlobal , err = libovsdbops .GetSBGlobal (sbClient , sbGlobal ); err != nil {
216+ // northd hasn't added an entry yet
217+ if errors .Is (err , client .ErrNotFound ) {
218+ return false , nil
219+ }
220+ return false , fmt .Errorf ("failed to get sb_global table entry from SB DB: %w" , err )
221+ }
222+ return sbGlobal .NbCfg >= expectedNbCfgValue , nil // we only need to ensure it is greater than or equal to the expected value
223+ })
224+ if err != nil {
225+ return fmt .Errorf ("failed while waiting for nb_cfg value greater than or equal %d in sb db sb_global table: %w" , expectedNbCfgValue , err )
226+ }
227+ return nil
228+ }
229+
152230func setupPIDFile (pidfile string ) error {
153231 // need to test if already there
154232 _ , err := os .Stat (pidfile )
@@ -493,6 +571,19 @@ func runOvnKube(ctx context.Context, runMode *ovnkubeRunMode, ovnClientset *util
493571 defer cancel ()
494572 defer wg .Done ()
495573
574+ // If IC mode, ovnkube controller will create a file in directory that will not persistent a Node reboot, therefore
575+ // we can determine if the Node is rebooting or not. Post sync, ensure the file exists. ovn-controller start is
576+ // predicated on the file existence because in IC mode, SB DB may contain stale data from before reboot and, we do
577+ // not want ovn-controller to consume this stale control plane data.
578+ var doesBootFileExist bool
579+ if config .OVNKubernetesFeature .EnableInterconnect {
580+ // no support for multiple Nodes per zone.
581+ doesBootFileExist , err = doesOVNKubeControllerSyncPostBootFileExist ()
582+ if err != nil {
583+ controllerErr = fmt .Errorf ("failed to determine if Node restarted via sentinel file availability: %v" , err )
584+ }
585+ }
586+
496587 libovsdbOvnNBClient , err := libovsdb .NewNBClient (ctx .Done ())
497588 if err != nil {
498589 controllerErr = fmt .Errorf ("failed to initialize libovsdb NB client: %w" , err )
@@ -523,6 +614,19 @@ func runOvnKube(ctx context.Context, runMode *ovnkubeRunMode, ovnClientset *util
523614 return
524615 }
525616
617+ // if IC and a Node restart, block writing sentinel file until northd has sync'd once.
618+ // external resources may predicate on this file availability and guaranteed ovnkube controller has sync'd once
619+ // and changes propagated to SB DB.
620+ if config .OVNKubernetesFeature .EnableInterconnect && ! doesBootFileExist {
621+ if err = waitUntilNorthdSyncOnce (ctx , libovsdbOvnNBClient , libovsdbOvnSBClient ); err != nil {
622+ controllerErr = fmt .Errorf ("failed while waiting for northd to copy NB DB contents to SB DB: %w" , err )
623+ return
624+ }
625+ if err = ensureOVNKubeControllerSyncPostBootFile (); err != nil {
626+ controllerErr = fmt .Errorf ("failed to ensure ovnkube controller start file: %w" , err )
627+ }
628+ }
629+
526630 // record delay until ready
527631 metrics .MetricOVNKubeControllerReadyDuration .Set (time .Since (startTime ).Seconds ())
528632
0 commit comments