CUB-200: fix WS initial sync ordering — start readLoop before initialSync
Some checks failed
Dev Build & Deploy / test-and-build (pull_request) Failing after 0s
Dev Build & Deploy / docker-build-push (pull_request) Has been skipped

The root cause of the initial sync timeout was that connectAndRun called
initialSync (which uses Send/RPC) before starting readLoop, but Send's
response delivery depends on readLoop→routeFrame→handleResponse. Without
the readLoop running, agents.list and sessions.list would always time out.

Fix: start readLoop in a goroutine before calling initialSync so that
RPC responses are properly routed back to pending Send() calls. After
initialSync completes, event handlers are registered and MarkWSReady
is called. The connectAndRun function then blocks on the readLoop
goroutine's completion.

Also added TestConnectAndRun_InitialSyncOrdering which verifies that
agents are persisted from initial sync (would hang/timeout under the
old ordering).
This commit is contained in:
2026-05-20 21:42:31 +00:00
parent 1b82e1d3a6
commit d370d5ec23
2 changed files with 125 additions and 11 deletions

View File

@@ -229,7 +229,29 @@ func (c *WSClient) connectAndRun(ctx context.Context) error {
c.connId = helloOK.ConnID
c.connMu.Unlock()
// Notify REST client that WS is live so it stands down
// Step 2b: Start the read loop in a goroutine so that Send() in
// initialSync can receive responses. The read loop goroutine will
// continue running after initialSync completes, routing live events
// and any future RPC responses.
readLoopErrCh := make(chan error, 1)
go func() {
readLoopErrCh <- c.readLoop(ctx, conn)
}()
// Step 2c: Initial sync — fetch agents + sessions from gateway.
// This now works because the read loop is active and will route
// response frames back to Send() via handleResponse.
if err := c.initialSync(ctx); err != nil {
c.logger.Warn("initial sync failed, will continue with read loop", "error", err)
}
// Step 2d: Register live event handlers (read loop is already
// active, so events will be dispatched immediately)
c.registerEventHandlers()
// Notify REST client that WS is live so it stands down.
// This must happen AFTER initialSync so that the REST poller
// doesn't start polling while we're still syncing.
if c.restClient != nil {
c.restClient.MarkWSReady()
c.logger.Info("ws client notified REST fallback to stand down")
@@ -238,16 +260,9 @@ func (c *WSClient) connectAndRun(ctx context.Context) error {
// Reset wsReadyOnce so MarkWSReady can fire again after a reconnect
c.wsReadyOnce = sync.Once{}
// Step 2b: Initial sync — fetch agents + sessions from gateway
if err := c.initialSync(ctx); err != nil {
c.logger.Warn("initial sync failed, will continue with read loop", "error", err)
}
// Step 2c: Register live event handlers
c.registerEventHandlers()
// Step 3: Read loop
return c.readLoop(ctx, conn)
// Step 3: Wait for the read loop goroutine to finish (blocks
// until the connection drops or context is cancelled).
return <-readLoopErrCh
}
// readChallenge reads the first frame from the gateway, which must be a