@@ -8,10 +8,9 @@ package actors
 import (
 	"context"
 	"fmt"
-	"math/rand/v2"
+	"sync"
 	"time"
 
-	"github.com/hypermodeinc/modus/runtime/db"
 	"github.com/hypermodeinc/modus/runtime/logger"
 	"github.com/hypermodeinc/modus/runtime/messages"
 	"github.com/hypermodeinc/modus/runtime/pluginmanager"
@@ -82,7 +81,13 @@ func startActorSystem(ctx context.Context, actorSystem goakt.ActorSystem) error
 	}
 
 	// important: wait for the actor system to sync with the cluster before proceeding
-	waitForClusterSync(ctx)
+	if clusterEnabled() {
+		select {
+		case <-time.After(peerSyncInterval()):
+		case <-ctx.Done():
+			logger.Warn(context.WithoutCancel(ctx)).Msg("Context cancelled while waiting for cluster sync.")
+		}
+	}
 
 	return nil
 }
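For reference, the wait that replaces waitForClusterSync above is a bounded sleep that aborts early when the context is cancelled. Below is a minimal, self-contained sketch of that pattern only (not part of the patch); the interval value and the plain print statement are placeholders rather than the Modus peerSyncInterval and logger helpers.

package main

import (
	"context"
	"fmt"
	"time"
)

// waitOrCancel blocks for the given interval, or returns early
// when ctx is cancelled, mirroring the select in the hunk above.
func waitOrCancel(ctx context.Context, interval time.Duration) {
	select {
	case <-time.After(interval):
		// interval elapsed; peers have had time to sync
	case <-ctx.Done():
		fmt.Println("context cancelled while waiting:", ctx.Err())
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()
	waitOrCancel(ctx, 200*time.Millisecond) // returns after ~50ms because the context wins
}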
@@ -106,68 +111,9 @@ func loadAgentActors(ctx context.Context, plugin *plugins.Plugin) error {
 		}
 	}
 
-	// do this in a goroutine to avoid blocking the cluster engine startup
-	go func() {
-		if err := restoreAgentActors(ctx, plugin.Name()); err != nil {
-			const msg = "Failed to restore agent actors."
-			sentryutils.CaptureError(ctx, err, msg)
-			logger.Error(ctx, err).Msg(msg)
-		}
-	}()
-
 	return nil
 }
 
-// restoreAgentActors spawn actors for agents with state in the database, that are not already running
-func restoreAgentActors(ctx context.Context, pluginName string) error {
-	span, ctx := sentryutils.NewSpanForCurrentFunc(ctx)
-	defer span.Finish()
-
-	logger.Debug(ctx).Msg("Restoring agent actors from database.")
-
-	// query the database for active agents
-	agents, err := db.QueryActiveAgents(ctx)
-	if err != nil {
-		return fmt.Errorf("failed to query active agents from database: %w", err)
-	}
-
-	// shuffle the agents to help distribute the load across the cluster when multiple nodes are starting simultaneously
-	rand.Shuffle(len(agents), func(i, j int) {
-		agents[i], agents[j] = agents[j], agents[i]
-	})
-
-	// spawn actors for each agent that is not already running
-	for _, agent := range agents {
-		actorName := getActorName(agent.Id)
-		if exists, err := _actorSystem.ActorExists(ctx, actorName); err != nil {
-			const msg = "Failed to check if agent actor exists."
-			sentryutils.CaptureError(ctx, err, msg, sentryutils.WithData("agent_id", agent.Id))
-			logger.Error(ctx, err).Str("agent_id", agent.Id).Msg(msg)
-		} else if !exists {
-			err := spawnActorForAgent(ctx, pluginName, agent.Id, agent.Name, false)
-			if err != nil {
-				const msg = "Failed to spawn actor for agent."
-				sentryutils.CaptureError(ctx, err, msg, sentryutils.WithData("agent_id", agent.Id))
-				logger.Error(ctx, err).Str("agent_id", agent.Id).Msg(msg)
-			}
-		}
-	}
-
-	return nil
-}
-
-// Waits for the peer sync interval to pass, allowing time for the actor system to synchronize its
-// list of actors with the remote nodes in the cluster. Cancels early if the context is done.
-func waitForClusterSync(ctx context.Context) {
-	if clusterEnabled() {
-		select {
-		case <-time.After(peerSyncInterval()):
-		case <-ctx.Done():
-			logger.Warn(context.WithoutCancel(ctx)).Msg("Context cancelled while waiting for cluster sync.")
-		}
-	}
-}
-
 func Shutdown(ctx context.Context) {
 	span, ctx := sentryutils.NewSpanForCurrentFunc(ctx)
 	defer span.Finish()
@@ -209,29 +155,25 @@ func (sh *shutdownHook) Execute(ctx context.Context, actorSystem goakt.ActorSystem) error {
 
 	// Suspend all local running agent actors first, which allows them to gracefully stop and persist their state.
 	// In cluster mode, this will also allow the actor to resume on another node after this node shuts down.
+	// We use goroutines and a wait group to do this concurrently.
+	var wg sync.WaitGroup
 	for _, pid := range actors {
 		if actor, ok := pid.Actor().(*wasmAgentActor); ok && pid.IsRunning() {
 			if actor.status == AgentStatusRunning {
-				ctx := actor.augmentContext(ctx, pid)
-				if err := actor.suspendAgent(ctx); err != nil {
-					const msg = "Failed to suspend agent actor."
-					sentryutils.CaptureError(ctx, err, msg, sentryutils.WithData("agent_id", actor.agentId))
-					logger.Error(ctx, err).Str("agent_id", actor.agentId).Msg(msg)
-				}
-			}
-		}
-	}
-
-	// Then shut down subscription actors. They will have received the suspend message already.
-	for _, pid := range actors {
-		if a, ok := pid.Actor().(*subscriptionActor); ok && pid.IsRunning() {
-			if err := pid.Shutdown(ctx); err != nil {
-				const msg = "Failed to shut down subscription actor."
-				sentryutils.CaptureError(ctx, err, msg, sentryutils.WithData("agent_id", a.agentId))
-				logger.Error(ctx, err).Str("agent_id", a.agentId).Msg(msg)
+				wg.Add(1)
+				go func() {
+					defer wg.Done()
+					ctx := actor.augmentContext(ctx, pid)
+					if err := actor.suspendAgent(ctx); err != nil {
+						const msg = "Failed to suspend agent actor."
+						sentryutils.CaptureError(ctx, err, msg, sentryutils.WithData("agent_id", actor.agentId))
+						logger.Error(ctx, err).Str("agent_id", actor.agentId).Msg(msg)
+					}
+				}()
 			}
 		}
 	}
+	wg.Wait()
 
 	// Then allow the actor system to continue with its shutdown process.
 	return nil
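To make the interleaved removed and added lines above easier to follow: the sequential per-agent suspend becomes a fan-out of goroutines tracked by a sync.WaitGroup, and wg.Wait() still blocks the shutdown hook until every suspend has finished. Below is a self-contained sketch of that pattern only (not part of the patch); suspendAgent and the agent IDs here are hypothetical stand-ins for the real actor calls.

package main

import (
	"context"
	"fmt"
	"sync"
	"time"
)

// suspendAgent is a stand-in for the real per-actor suspend, which
// persists state and may take a noticeable amount of time per agent.
func suspendAgent(ctx context.Context, agentId string) error {
	select {
	case <-time.After(100 * time.Millisecond):
		return nil
	case <-ctx.Done():
		return ctx.Err()
	}
}

func main() {
	ctx := context.Background()
	agentIds := []string{"agent-1", "agent-2", "agent-3"}

	var wg sync.WaitGroup
	for _, id := range agentIds {
		wg.Add(1)
		go func(id string) {
			defer wg.Done()
			if err := suspendAgent(ctx, id); err != nil {
				fmt.Printf("failed to suspend %s: %v\n", id, err)
			}
		}(id)
	}

	// Block until every suspend has completed before continuing shutdown,
	// just as the hook does with wg.Wait() above.
	wg.Wait()
	fmt.Println("all agents suspended")
}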