The root cause of the initial sync timeout was that connectAndRun called initialSync (which uses Send/RPC) before starting readLoop, but Send's response delivery depends on readLoop→routeFrame→handleResponse. Without the readLoop running, agents.list and sessions.list would always time out. Fix: start readLoop in a goroutine before calling initialSync so that RPC responses are properly routed back to pending Send() calls. After initialSync completes, event handlers are registered and MarkWSReady is called. The connectAndRun function then blocks on the readLoop goroutine's completion. Also added TestConnectAndRun_InitialSyncOrdering which verifies that agents are persisted from initial sync (would hang/timeout under the old ordering).
475 lines
14 KiB
Go
475 lines
14 KiB
Go
// Package gateway provides WebSocket client integration with the OpenClaw
|
|
// gateway using WS protocol v3. The WSClient handles connection, handshake,
|
|
// frame routing, request/response correlation, and automatic reconnection
|
|
// with exponential backoff.
|
|
package gateway
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"fmt"
|
|
"log/slog"
|
|
"sync"
|
|
"time"
|
|
|
|
"code.cubecraftcreations.com/CubeCraft-Creations/Control-Center/go-backend/internal/handler"
|
|
"code.cubecraftcreations.com/CubeCraft-Creations/Control-Center/go-backend/internal/repository"
|
|
|
|
"github.com/gorilla/websocket"
|
|
"github.com/google/uuid"
|
|
)
|
|
|
|
// WSConfig holds WebSocket client configuration, typically loaded from
|
|
// environment variables. AuthToken must be set to a valid OpenClaw gateway
|
|
// operator token.
|
|
type WSConfig struct {
|
|
URL string // e.g. "ws://host.docker.internal:18789/"
|
|
AuthToken string // from OPENCLAW_GATEWAY_TOKEN
|
|
}
|
|
|
|
// DefaultWSConfig returns sensible defaults for local development.
|
|
func DefaultWSConfig() WSConfig {
|
|
return WSConfig{
|
|
URL: "ws://localhost:18789/",
|
|
AuthToken: "",
|
|
}
|
|
}
|
|
|
|
// eventHandler is a callback invoked when a named event arrives from the
|
|
// gateway.
|
|
type eventHandler func(json.RawMessage)
|
|
|
|
// WSClient connects to the OpenClaw gateway over WebSocket, completes the
|
|
// v3 handshake, routes incoming frames, and automatically reconnects on
|
|
// disconnect with exponential backoff.
|
|
type WSClient struct {
|
|
config WSConfig
|
|
conn *websocket.Conn
|
|
connMu sync.Mutex // protects conn for writes
|
|
pending map[string]chan<- json.RawMessage
|
|
mu sync.Mutex // protects pending and handlers
|
|
agents repository.AgentRepo
|
|
broker *handler.Broker
|
|
logger *slog.Logger
|
|
|
|
handlers map[string][]eventHandler
|
|
connId string // set after successful hello-ok
|
|
restClient *Client // optional REST client to notify on WS ready
|
|
wsReadyOnce sync.Once // ensures MarkWSReady close is one-shot
|
|
}
|
|
|
|
// NewWSClient returns a WSClient wired to the given repository and broker.
|
|
func NewWSClient(cfg WSConfig, agents repository.AgentRepo, broker *handler.Broker, logger *slog.Logger) *WSClient {
|
|
if logger == nil {
|
|
logger = slog.Default()
|
|
}
|
|
return &WSClient{
|
|
config: cfg,
|
|
pending: make(map[string]chan<- json.RawMessage),
|
|
agents: agents,
|
|
broker: broker,
|
|
logger: logger,
|
|
handlers: make(map[string][]eventHandler),
|
|
}
|
|
}
|
|
|
|
// SetRESTClient wires the REST fallback client so the WS client can notify
|
|
// it when the WS connection is ready. Call this before Start.
|
|
func (c *WSClient) SetRESTClient(rest *Client) {
|
|
c.restClient = rest
|
|
}
|
|
|
|
// OnEvent registers a handler for the given event name. Handlers are called
|
|
// when an incoming frame with type "event" and matching event name is
|
|
// received. This is safe to call before Start.
|
|
func (c *WSClient) OnEvent(event string, handler func(json.RawMessage)) {
|
|
c.mu.Lock()
|
|
defer c.mu.Unlock()
|
|
c.handlers[event] = append(c.handlers[event], handler)
|
|
}
|
|
|
|
// ── Frame types ──────────────────────────────────────────────────────────
|
|
|
|
// wsFrame represents a generic WebSocket frame in the OpenClaw v3 protocol.
|
|
type wsFrame struct {
|
|
Type string `json:"type"` // "req", "res", "event"
|
|
ID string `json:"id,omitempty"` // request/response correlation
|
|
Method string `json:"method,omitempty"` // method name (req frames)
|
|
Event string `json:"event,omitempty"` // event name (event frames)
|
|
Params json.RawMessage `json:"params,omitempty"`
|
|
Result json.RawMessage `json:"result,omitempty"`
|
|
Error *wsError `json:"error,omitempty"`
|
|
}
|
|
|
|
// wsError represents an error in a response frame.
|
|
type wsError struct {
|
|
Code int `json:"code"`
|
|
Message string `json:"message"`
|
|
}
|
|
|
|
// connectRequest builds the initial connect handshake payload.
|
|
type connectRequest struct {
|
|
MinProtocol int `json:"minProtocol"`
|
|
MaxProtocol int `json:"maxProtocol"`
|
|
Client connectClientInfo `json:"client"`
|
|
Role string `json:"role"`
|
|
Scopes []string `json:"scopes"`
|
|
Auth connectAuth `json:"auth"`
|
|
}
|
|
|
|
type connectClientInfo struct {
|
|
ID string `json:"id"`
|
|
Version string `json:"version"`
|
|
Platform string `json:"platform"`
|
|
Mode string `json:"mode"`
|
|
}
|
|
|
|
type connectAuth struct {
|
|
Token string `json:"token"`
|
|
}
|
|
|
|
// helloOKResponse represents the expected response to a successful connect.
|
|
type helloOKResponse struct {
|
|
ConnID string `json:"connId"`
|
|
Features struct {
|
|
Methods []string `json:"methods"`
|
|
Events []string `json:"events"`
|
|
} `json:"features"`
|
|
}
|
|
|
|
// ── Start loop ───────────────────────────────────────────────────────────
|
|
|
|
// Start connects to the gateway, completes the handshake, and begins the
|
|
// read loop. On disconnect it reconnects with exponential backoff. On
|
|
// ctx cancellation it performs a clean shutdown.
|
|
func (c *WSClient) Start(ctx context.Context) {
|
|
initialBackoff := 1 * time.Second
|
|
maxBackoff := 30 * time.Second
|
|
backoff := initialBackoff
|
|
|
|
for {
|
|
err := c.connectAndRun(ctx)
|
|
if err != nil {
|
|
if ctx.Err() != nil {
|
|
c.logger.Info("ws client stopped (context cancelled)")
|
|
return
|
|
}
|
|
c.logger.Warn("ws client disconnected, reconnecting",
|
|
"error", err,
|
|
"backoff", backoff)
|
|
} else {
|
|
// Reset backoff on successful connect+run completion
|
|
backoff = initialBackoff
|
|
}
|
|
|
|
select {
|
|
case <-ctx.Done():
|
|
c.logger.Info("ws client stopped during backoff (context cancelled)")
|
|
return
|
|
case <-time.After(backoff):
|
|
// Exponential backoff: 1s, 2s, 4s, 8s, 16s, max 30s
|
|
backoff = backoff * 2
|
|
if backoff > maxBackoff {
|
|
backoff = maxBackoff
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// connectAndRun dials the gateway, completes the handshake, and runs the
|
|
// read loop until an error occurs or ctx is cancelled.
|
|
func (c *WSClient) connectAndRun(ctx context.Context) error {
|
|
c.logger.Info("ws client connecting", "url", c.config.URL)
|
|
|
|
dialer := websocket.Dialer{
|
|
HandshakeTimeout: 10 * time.Second,
|
|
}
|
|
|
|
conn, _, err := dialer.DialContext(ctx, c.config.URL, nil)
|
|
if err != nil {
|
|
return fmt.Errorf("dial failed: %w", err)
|
|
}
|
|
|
|
c.connMu.Lock()
|
|
c.conn = conn
|
|
c.connMu.Unlock()
|
|
|
|
// When context is cancelled, close the conn to unblock ReadJSON in readLoop.
|
|
go func() {
|
|
<-ctx.Done()
|
|
c.connMu.Lock()
|
|
if c.conn != nil {
|
|
c.conn.Close()
|
|
}
|
|
c.connMu.Unlock()
|
|
}()
|
|
|
|
defer func() {
|
|
conn.Close()
|
|
}()
|
|
|
|
// Step 1: Read the connect.challenge frame
|
|
if err := c.readChallenge(conn); err != nil {
|
|
return fmt.Errorf("handshake challenge: %w", err)
|
|
}
|
|
|
|
// Step 2: Send connect request
|
|
helloOK, err := c.sendConnect(conn)
|
|
if err != nil {
|
|
return fmt.Errorf("handshake connect: %w", err)
|
|
}
|
|
|
|
c.logger.Info("ws client handshake complete",
|
|
"connId", helloOK.ConnID,
|
|
"methods", helloOK.Features.Methods,
|
|
"events", helloOK.Features.Events)
|
|
|
|
// Store connId for reference
|
|
c.connMu.Lock()
|
|
c.connId = helloOK.ConnID
|
|
c.connMu.Unlock()
|
|
|
|
// Step 2b: Start the read loop in a goroutine so that Send() in
|
|
// initialSync can receive responses. The read loop goroutine will
|
|
// continue running after initialSync completes, routing live events
|
|
// and any future RPC responses.
|
|
readLoopErrCh := make(chan error, 1)
|
|
go func() {
|
|
readLoopErrCh <- c.readLoop(ctx, conn)
|
|
}()
|
|
|
|
// Step 2c: Initial sync — fetch agents + sessions from gateway.
|
|
// This now works because the read loop is active and will route
|
|
// response frames back to Send() via handleResponse.
|
|
if err := c.initialSync(ctx); err != nil {
|
|
c.logger.Warn("initial sync failed, will continue with read loop", "error", err)
|
|
}
|
|
|
|
// Step 2d: Register live event handlers (read loop is already
|
|
// active, so events will be dispatched immediately)
|
|
c.registerEventHandlers()
|
|
|
|
// Notify REST client that WS is live so it stands down.
|
|
// This must happen AFTER initialSync so that the REST poller
|
|
// doesn't start polling while we're still syncing.
|
|
if c.restClient != nil {
|
|
c.restClient.MarkWSReady()
|
|
c.logger.Info("ws client notified REST fallback to stand down")
|
|
}
|
|
|
|
// Reset wsReadyOnce so MarkWSReady can fire again after a reconnect
|
|
c.wsReadyOnce = sync.Once{}
|
|
|
|
// Step 3: Wait for the read loop goroutine to finish (blocks
|
|
// until the connection drops or context is cancelled).
|
|
return <-readLoopErrCh
|
|
}
|
|
|
|
// readChallenge reads the first frame from the gateway, which must be a
|
|
// connect.challenge event.
|
|
func (c *WSClient) readChallenge(conn *websocket.Conn) error {
|
|
var frame wsFrame
|
|
if err := conn.ReadJSON(&frame); err != nil {
|
|
return fmt.Errorf("read challenge: %w", err)
|
|
}
|
|
|
|
if frame.Type != "event" || frame.Event != "connect.challenge" {
|
|
return fmt.Errorf("expected connect.challenge, got type=%s event=%s", frame.Type, frame.Event)
|
|
}
|
|
|
|
c.logger.Debug("received connect.challenge", "params", string(frame.Params))
|
|
return nil
|
|
}
|
|
|
|
// sendConnect sends the connect request and waits for the hello-ok response.
|
|
func (c *WSClient) sendConnect(conn *websocket.Conn) (*helloOKResponse, error) {
|
|
reqID := uuid.New().String()
|
|
params := connectRequest{
|
|
MinProtocol: 3,
|
|
MaxProtocol: 3,
|
|
Client: connectClientInfo{
|
|
ID: "control-center",
|
|
Version: "1.0",
|
|
Platform: "server",
|
|
Mode: "operator",
|
|
},
|
|
Role: "operator",
|
|
Scopes: []string{"operator.read"},
|
|
Auth: connectAuth{
|
|
Token: c.config.AuthToken,
|
|
},
|
|
}
|
|
|
|
paramsJSON, err := json.Marshal(params)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("marshal connect params: %w", err)
|
|
}
|
|
|
|
reqFrame := wsFrame{
|
|
Type: "req",
|
|
ID: reqID,
|
|
Method: "connect",
|
|
Params: paramsJSON,
|
|
}
|
|
|
|
if err := conn.WriteJSON(reqFrame); err != nil {
|
|
return nil, fmt.Errorf("write connect request: %w", err)
|
|
}
|
|
|
|
// Read response
|
|
var resFrame wsFrame
|
|
if err := conn.ReadJSON(&resFrame); err != nil {
|
|
return nil, fmt.Errorf("read connect response: %w", err)
|
|
}
|
|
|
|
if resFrame.Error != nil {
|
|
return nil, fmt.Errorf("connect rejected: code=%d msg=%s", resFrame.Error.Code, resFrame.Error.Message)
|
|
}
|
|
|
|
if resFrame.ID != reqID {
|
|
return nil, fmt.Errorf("response id mismatch: expected %s, got %s", reqID, resFrame.ID)
|
|
}
|
|
|
|
// Check for hello-ok method in the result
|
|
// The gateway responds with method "hello-ok" on success
|
|
var helloOK helloOKResponse
|
|
if err := json.Unmarshal(resFrame.Result, &helloOK); err != nil {
|
|
return nil, fmt.Errorf("parse hello-ok: %w", err)
|
|
}
|
|
|
|
return &helloOK, nil
|
|
}
|
|
|
|
// readLoop continuously reads frames from the connection and routes them.
|
|
// It returns on read error or when the connection is closed by the ctx-done
|
|
// goroutine started in connectAndRun.
|
|
func (c *WSClient) readLoop(ctx context.Context, conn *websocket.Conn) error {
|
|
for {
|
|
var frame wsFrame
|
|
if err := conn.ReadJSON(&frame); err != nil {
|
|
if ctx.Err() != nil {
|
|
return ctx.Err()
|
|
}
|
|
// Check if it's a close error
|
|
if websocket.IsCloseError(err, websocket.CloseNormalClosure, websocket.CloseGoingAway) {
|
|
c.logger.Info("ws connection closed by server")
|
|
return nil
|
|
}
|
|
if websocket.IsUnexpectedCloseError(err) {
|
|
c.logger.Warn("ws connection unexpectedly closed", "error", err)
|
|
return err
|
|
}
|
|
return fmt.Errorf("read frame: %w", err)
|
|
}
|
|
|
|
c.routeFrame(frame)
|
|
}
|
|
}
|
|
|
|
// routeFrame dispatches a received frame to the appropriate handler.
|
|
func (c *WSClient) routeFrame(frame wsFrame) {
|
|
switch frame.Type {
|
|
case "res":
|
|
c.handleResponse(frame)
|
|
case "event":
|
|
c.handleEvent(frame)
|
|
default:
|
|
c.logger.Warn("unknown frame type", "type", frame.Type, "id", frame.ID)
|
|
}
|
|
}
|
|
|
|
// handleResponse correlates a response frame to a pending request channel.
|
|
func (c *WSClient) handleResponse(frame wsFrame) {
|
|
c.mu.Lock()
|
|
ch, ok := c.pending[frame.ID]
|
|
if ok {
|
|
delete(c.pending, frame.ID)
|
|
}
|
|
c.mu.Unlock()
|
|
|
|
if !ok {
|
|
c.logger.Warn("received response for unknown request", "id", frame.ID)
|
|
return
|
|
}
|
|
|
|
if frame.Error != nil {
|
|
// Send nil to signal error; caller checks via Send return
|
|
ch <- nil
|
|
return
|
|
}
|
|
|
|
ch <- frame.Result
|
|
}
|
|
|
|
// handleEvent dispatches an event frame to registered handlers.
|
|
func (c *WSClient) handleEvent(frame wsFrame) {
|
|
c.mu.Lock()
|
|
handlers := c.handlers[frame.Event]
|
|
c.mu.Unlock()
|
|
|
|
if len(handlers) == 0 {
|
|
c.logger.Debug("unhandled event", "event", frame.Event)
|
|
return
|
|
}
|
|
|
|
for _, h := range handlers {
|
|
h(frame.Params)
|
|
}
|
|
}
|
|
|
|
// ── Send ─────────────────────────────────────────────────────────────────
|
|
|
|
// Send sends a JSON request to the gateway and returns the response payload.
|
|
// It is safe for concurrent use. Returns an error if the client is not
|
|
// connected.
|
|
func (c *WSClient) Send(method string, params any) (json.RawMessage, error) {
|
|
reqID := uuid.New().String()
|
|
|
|
paramsJSON, err := json.Marshal(params)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("marshal params: %w", err)
|
|
}
|
|
|
|
// Register pending response channel
|
|
respCh := make(chan json.RawMessage, 1)
|
|
c.mu.Lock()
|
|
c.pending[reqID] = respCh
|
|
c.mu.Unlock()
|
|
|
|
defer func() {
|
|
c.mu.Lock()
|
|
delete(c.pending, reqID)
|
|
c.mu.Unlock()
|
|
}()
|
|
|
|
// Build and send frame
|
|
frame := wsFrame{
|
|
Type: "req",
|
|
ID: reqID,
|
|
Method: method,
|
|
Params: paramsJSON,
|
|
}
|
|
|
|
c.connMu.Lock()
|
|
if c.conn == nil {
|
|
c.connMu.Unlock()
|
|
return nil, fmt.Errorf("gateway: not connected")
|
|
}
|
|
err = c.conn.WriteJSON(frame)
|
|
c.connMu.Unlock()
|
|
|
|
if err != nil {
|
|
return nil, fmt.Errorf("write request: %w", err)
|
|
}
|
|
|
|
// Wait for response with timeout
|
|
select {
|
|
case resp := <-respCh:
|
|
if resp == nil {
|
|
return nil, fmt.Errorf("gateway returned error for request %s", reqID)
|
|
}
|
|
return resp, nil
|
|
case <-time.After(30 * time.Second):
|
|
return nil, fmt.Errorf("request %s timed out", reqID)
|
|
}
|
|
} |