Refactor FuturesWebsocket: implement batch subscription handling, enhance connection management, and improve logging

This commit is contained in:
2025-09-09 00:08:57 +00:00
parent 70f3714d2f
commit 6ebc541de0
13 changed files with 700 additions and 784 deletions

View File

@@ -1,7 +1,9 @@
package manager
import (
"errors"
"fmt"
"log/slog"
"time"
"github.com/google/uuid"
@@ -10,6 +12,8 @@ import (
"gitlab.michelsen.id/phillmichelsen/tessera/services/data_service/internal/router"
)
// lg returns the current default slog logger with the attribute cmp=manager
// attached. The default logger is looked up on every call rather than cached,
// so a logger installed later via slog.SetDefault is used by subsequent calls.
func lg() *slog.Logger {
	base := slog.Default()
	return base.With("cmp", "manager")
}
// Manager is a single-goroutine actor that owns all state.
type Manager struct {
// Command channel
@@ -24,8 +28,8 @@ type Manager struct {
router *router.Router
}
// New creates a manager and starts its run loop.
func New(r *router.Router) *Manager {
// NewManager creates a manager and starts its run loop.
func NewManager(r *router.Router) *Manager {
m := &Manager{
cmdCh: make(chan any, 256),
providers: make(map[string]provider.Provider),
@@ -35,6 +39,9 @@ func New(r *router.Router) *Manager {
}
go r.Run()
go m.run()
lg().Info("manager started")
return m
}
@@ -42,6 +49,7 @@ func New(r *router.Router) *Manager {
// AddProvider adds and starts a new provider.
func (m *Manager) AddProvider(name string, p provider.Provider) error {
lg().Debug("add provider request", slog.String("name", name))
resp := make(chan error, 1)
m.cmdCh <- addProviderCmd{name: name, p: p, resp: resp}
return <-resp
@@ -49,6 +57,7 @@ func (m *Manager) AddProvider(name string, p provider.Provider) error {
// RemoveProvider stops and removes a provider, cleaning up all sessions.
func (m *Manager) RemoveProvider(name string) error {
lg().Debug("remove provider request", slog.String("name", name))
resp := make(chan error, 1)
m.cmdCh <- removeProviderCmd{name: name, resp: resp}
return <-resp
@@ -56,6 +65,7 @@ func (m *Manager) RemoveProvider(name string) error {
// NewSession creates a new session with the given idle timeout.
func (m *Manager) NewSession(idleAfter time.Duration) (uuid.UUID, error) {
lg().Debug("new session request", slog.Duration("idle_after", idleAfter))
resp := make(chan struct {
id uuid.UUID
err error
@@ -67,6 +77,7 @@ func (m *Manager) NewSession(idleAfter time.Duration) (uuid.UUID, error) {
// AttachClient attaches a client to a session, creates and returns client channels for the session.
func (m *Manager) AttachClient(id uuid.UUID, inBuf, outBuf int) (chan<- domain.Message, <-chan domain.Message, error) {
lg().Debug("attach client request", slog.String("session", id.String()), slog.Int("in_buf", inBuf), slog.Int("out_buf", outBuf))
resp := make(chan struct {
cin chan<- domain.Message
cout <-chan domain.Message
@@ -79,6 +90,7 @@ func (m *Manager) AttachClient(id uuid.UUID, inBuf, outBuf int) (chan<- domain.M
// DetachClient detaches the client from the session, closes client channels and arms timeout.
func (m *Manager) DetachClient(id uuid.UUID) error {
lg().Debug("detach client request", slog.String("session", id.String()))
resp := make(chan error, 1)
m.cmdCh <- detachCmd{sid: id, resp: resp}
return <-resp
@@ -86,6 +98,7 @@ func (m *Manager) DetachClient(id uuid.UUID) error {
// ConfigureSession sets the next set of identifiers for the session, starting and stopping streams as needed.
func (m *Manager) ConfigureSession(id uuid.UUID, next []domain.Identifier) error {
lg().Debug("configure session request", slog.String("session", id.String()), slog.Int("idents", len(next)))
resp := make(chan error, 1)
m.cmdCh <- configureCmd{sid: id, next: next, resp: resp}
return <-resp
@@ -93,6 +106,7 @@ func (m *Manager) ConfigureSession(id uuid.UUID, next []domain.Identifier) error
// CloseSession closes and removes the session, cleaning up all bindings.
func (m *Manager) CloseSession(id uuid.UUID) error {
lg().Debug("close session request", slog.String("session", id.String()))
resp := make(chan error, 1)
m.cmdCh <- closeSessionCmd{sid: id, resp: resp}
return <-resp
@@ -125,10 +139,12 @@ func (m *Manager) run() {
func (m *Manager) handleAddProvider(cmd addProviderCmd) {
if _, ok := m.providers[cmd.name]; ok {
lg().Warn("provider already exists", slog.String("name", cmd.name))
cmd.resp <- fmt.Errorf("provider exists: %s", cmd.name)
return
}
if err := cmd.p.Start(); err != nil {
lg().Warn("failed to start provider", slog.String("name", cmd.name), slog.String("err", err.Error()))
cmd.resp <- fmt.Errorf("start provider %s: %w", cmd.name, err)
return
}
@@ -139,6 +155,7 @@ func (m *Manager) handleAddProvider(cmd addProviderCmd) {
func (m *Manager) handleRemoveProvider(cmd removeProviderCmd) {
p, ok := m.providers[cmd.name]
if !ok {
lg().Warn("provider not found", slog.String("name", cmd.name))
cmd.resp <- fmt.Errorf("provider not found: %s", cmd.name)
return
}
@@ -149,6 +166,7 @@ func (m *Manager) handleRemoveProvider(cmd removeProviderCmd) {
provName, subj, ok := ident.ProviderSubject()
if !ok || provName != cmd.name {
// Not expected to happen in practice; log a warning and skip this identifier.
lg().Warn("identifier with mismatched provider found in session during provider removal", slog.String("session", s.id.String()), slog.String("ident", ident.Key()), slog.String("expected_provider", cmd.name), slog.String("found_provider", provName))
continue
}
if s.attached && s.clientOut != nil {
@@ -158,19 +176,19 @@ func (m *Manager) handleRemoveProvider(cmd removeProviderCmd) {
// decrementStreamRefCount returns true if this was the last ref, in which case we want to stop the stream.
if ident.IsRaw() && m.decrementStreamRefCount(ident) && subj != "" {
_ = p.StopStream(subj) // best-effort as we will remove the provider anyway
_ = p.StopStreams([]string{subj}) // best-effort as we will remove the provider anyway
}
}
}
// first iteration above is sound, but as a precaution we also clean up any dangling streamRef entries here
// Defensive sweep: log and clear any dangling streamRef entries for this provider.
for id := range m.streamRef {
provName, _, ok := id.ProviderSubject()
if !ok || provName != cmd.name {
continue
}
fmt.Printf("manager: warning — dangling streamRef for %s after removing provider %s\n", id.Key(), cmd.name)
delete(m.streamRef, id)
lg().Warn("dangling streamRef entry found during provider removal", slog.String("ident", id.Key()), slog.String("provider", cmd.name))
}
p.Stop()
@@ -196,6 +214,8 @@ func (m *Manager) handleNewSession(cmd newSessionCmd) {
id uuid.UUID
err error
}{id: s.id, err: nil}
lg().Info("new session created", slog.String("session", s.id.String()), slog.Duration("idle_after", cmd.idleAfter))
}
func (m *Manager) handleAttach(cmd attachCmd) {
@@ -232,6 +252,8 @@ func (m *Manager) handleAttach(cmd attachCmd) {
cout <-chan domain.Message
err error
}{cin, cout, err}
lg().Info("client attached to session", slog.String("session", s.id.String()))
}
func (m *Manager) handleDetach(cmd detachCmd) {
@@ -252,126 +274,163 @@ func (m *Manager) handleDetach(cmd detachCmd) {
_ = m.detachSession(cmd.sid, s)
cmd.resp <- nil
lg().Info("client detached from session", slog.String("session", s.id.String()))
}
func (m *Manager) handleConfigure(c configureCmd) {
s, ok := m.sessions[c.sid]
func (m *Manager) handleConfigure(cmd configureCmd) {
s, ok := m.sessions[cmd.sid]
if !ok {
c.resp <- ErrSessionNotFound
cmd.resp <- ErrSessionNotFound
return
}
if s.closed {
c.resp <- ErrSessionClosed
cmd.resp <- ErrSessionClosed
return
}
old := copySet(s.bound)
toAdd, toDel := identifierSetDifferences(old, c.next)
toAdd, toDel := identifierSetDifferences(old, cmd.next)
// 1) Handle removals first.
var aggErrs error
// 1) Build batches: provider → starts(starters) and stops(subjects)
type starter struct {
id domain.Identifier
subj string
}
startsByProv := make(map[provider.Provider][]starter)
stopsByProv := make(map[provider.Provider][]string)
// Removals
for _, ident := range toDel {
if s.attached && s.clientOut != nil {
m.router.DeregisterRoute(ident, s.clientOut)
}
delete(s.bound, ident)
if ident.IsRaw() {
if m.decrementStreamRefCount(ident) {
if p, subj, err := m.resolveProvider(ident); err == nil {
_ = p.StopStream(subj) // fire-and-forget
}
}
}
}
// 2) Handle additions. Collect starts to await.
type startItem struct {
id domain.Identifier
ch <-chan error
}
var starts []startItem
var initErrs []error
for _, ident := range toAdd {
// Bind intent now.
s.bound[ident] = struct{}{}
if !ident.IsRaw() {
if s.attached && s.clientOut != nil {
m.router.RegisterRoute(ident, s.clientOut)
}
continue
}
p, subj, err := m.resolveProvider(ident)
if err != nil {
delete(s.bound, ident)
initErrs = append(initErrs, err)
aggErrs = errors.Join(aggErrs, fmt.Errorf("stop %s: %w", ident.Key(), err))
continue
}
if subj == "" {
continue
}
if m.decrementStreamRefCount(ident) { // only when last ref
stopsByProv[p] = append(stopsByProv[p], subj)
}
}
// Additions
for _, ident := range toAdd {
if !ident.IsRaw() {
if s.attached && s.clientOut != nil {
m.router.RegisterRoute(ident, s.clientOut)
}
s.bound[ident] = struct{}{}
continue
}
p, subj, err := m.resolveProvider(ident)
if err != nil {
aggErrs = errors.Join(aggErrs, err)
continue
}
if !p.IsValidSubject(subj, false) {
delete(s.bound, ident)
initErrs = append(initErrs, fmt.Errorf("invalid subject %q for provider", subj))
aggErrs = errors.Join(aggErrs, fmt.Errorf("invalid subject %q", subj))
continue
}
first := m.incrementStreamRefCount(ident)
if first || !p.IsStreamActive(subj) {
ch := p.StartStream(subj, m.router.IncomingChannel())
starts = append(starts, startItem{id: ident, ch: ch})
} else if s.attached && s.clientOut != nil {
// Already active, just register for this session.
m.router.RegisterRoute(ident, s.clientOut)
}
}
// 3) Wait for starts initiated by this call, each with its own timeout.
if len(starts) == 0 {
c.resp <- join(initErrs)
return
}
type result struct {
id domain.Identifier
err error
}
done := make(chan result, len(starts))
for _, si := range starts {
// Per-start waiter.
go func(id domain.Identifier, ch <-chan error) {
select {
case err := <-ch:
done <- result{id: id, err: err}
case <-time.After(statusWaitTotal):
done <- result{id: id, err: fmt.Errorf("timeout")}
if m.incrementStreamRefCount(ident) { // first ref → start later
startsByProv[p] = append(startsByProv[p], starter{id: ident, subj: subj})
} else {
// already active → bind+route now
if s.attached && s.clientOut != nil {
m.router.RegisterRoute(ident, s.clientOut)
}
}(si.id, si.ch)
s.bound[ident] = struct{}{}
}
}
// Collect results and apply.
for i := 0; i < len(starts); i++ {
r := <-done
if r.err != nil {
// Roll back this session's bind and drop ref.
delete(s.bound, r.id)
_ = m.decrementStreamRefCount(r.id)
initErrs = append(initErrs, fmt.Errorf("start %v: %w", r.id, r.err))
continue
// 2) Fire provider calls
type batchRes struct {
prov provider.Provider
err error
op string // "start"/"stop"
}
done := make(chan batchRes, len(startsByProv)+len(stopsByProv))
// Start batches
for p, items := range startsByProv {
subjs := make([]string, 0, len(items))
for _, it := range items {
subjs = append(subjs, it.subj)
}
// Success: register for any attached sessions that are bound.
for _, sess := range m.sessions {
if !sess.attached || sess.clientOut == nil {
ack := p.StartStreams(subjs)
go func(p provider.Provider, ack <-chan error) {
var err error
select {
case err = <-ack:
case <-time.After(statusWaitTotal):
err = fmt.Errorf("timeout")
}
done <- batchRes{prov: p, err: err, op: "start"}
}(p, ack)
}
// Stop batches
for p, subjs := range stopsByProv {
ack := p.StopStreams(subjs)
go func(p provider.Provider, ack <-chan error) {
var err error
select {
case err = <-ack:
case <-time.After(statusWaitTotal):
err = fmt.Errorf("timeout")
}
done <- batchRes{prov: p, err: err, op: "stop"}
}(p, ack)
}
// 3) Collect results
for i := 0; i < len(startsByProv)+len(stopsByProv); i++ {
r := <-done
switch r.op {
case "start":
items := startsByProv[r.prov]
if r.err != nil {
// Roll back refcounts for each ident in this provider batch
for _, it := range items {
_ = m.decrementStreamRefCount(it.id)
aggErrs = errors.Join(aggErrs, fmt.Errorf("start %s: %w", it.id.Key(), r.err))
}
continue
}
if _, bound := sess.bound[r.id]; bound {
m.router.RegisterRoute(r.id, sess.clientOut)
// Success → bind and route
for _, it := range items {
if s.attached && s.clientOut != nil {
m.router.RegisterRoute(it.id, s.clientOut)
}
s.bound[it.id] = struct{}{}
}
case "stop":
if r.err != nil {
for _, subj := range stopsByProv[r.prov] {
aggErrs = errors.Join(aggErrs, fmt.Errorf("stop %s/%s: %w", "raw", subj, r.err))
}
}
}
}
c.resp <- join(initErrs)
cmd.resp <- aggErrs
lg().Info("session configured", slog.String("session", s.id.String()), slog.Int("bound", len(s.bound)), slog.Int("to_add", len(toAdd)), slog.Int("to_del", len(toDel)))
}
func (m *Manager) handleCloseSession(c closeSessionCmd) {
@@ -382,4 +441,6 @@ func (m *Manager) handleCloseSession(c closeSessionCmd) {
}
m.closeSession(c.sid, s)
c.resp <- nil
lg().Info("session closed", slog.String("session", s.id.String()))
}