fix(jitsi): prevent reconnect loops and idle timeouts

2026-06-02 06:23:37 +02:00 · 2026-06-01 12:51:06 +03:00
parent 2d2e772c05
commit cc65c033e2
3 changed files with 811 additions and 22 deletions
@@ -50,6 +50,23 @@ const (
 	videoTrackName       = "videochannel"
 	maxReconnects        = 5
 	reconnectWindow      = 5 * time.Minute
+	// reconnectGrace is the window after a successful self-reconnect during
+	// which incoming peer-epoch changes do NOT trigger another reconnect.
+	// Without this, the peer's own recovery (which produces a fresh epoch)
+	// drives us into an infinite reconnect loop.
+	reconnectGrace = 20 * time.Second
+	// stableUptime is how long the bridge must stay healthy before the
+	// reconnectCount is reset. Without this, healthy reconnects accumulated
+	// over hours of operation eventually cross maxReconnects and the engine
+	// gives up on a perfectly recoverable failure.
+	stableUptime = 60 * time.Second
+	// xmppKeepaliveInterval keeps the underlying XMPP transport alive while
+	// we wait for a peer. BOSH has no built-in stream management; without
+	// any application traffic Prosody closes the BOSH session after roughly
+	// 60 s and our subsequent WaitJingle observes "connection closed". A
+	// periodic XMPP ping IQ resets that idle timer end-to-end and works for
+	// the WebSocket transport too.
+	xmppKeepaliveInterval = 25 * time.Second
 )

 // bridgeMagic tags every EndpointMessage produced by this engine. JVB broadcasts
@@ -91,8 +108,10 @@ type Session struct {

 	jSess atomic.Pointer[j.Session]

-	pcMu sync.Mutex
-	pc   *webrtc.PeerConnection
+	pcMu     sync.Mutex
+	pc       *webrtc.PeerConnection
+	pcCtx    context.Context    //nolint:containedctx // tied to PC lifetime, cancelled in teardownPC
+	pcCancel context.CancelFunc // cancels pcCtx; cancelled when the live PC is replaced

 	sendQueue     chan []byte
 	peerSendQueue chan bridgeOutbound
@@ -101,11 +120,16 @@ type Session struct {
 	reconnecting  atomic.Bool

 	reconnectCh          chan struct{}
-	reconnectMu          sync.Mutex // guards reconnectWindowStart and reconnectCount
+	reconnectMu          sync.Mutex // guards reconnectWindowStart and reconnectCount (lastReconnectAt is atomic)
 	reconnectWindowStart time.Time
 	reconnectCount       int
-	localEpoch           atomic.Uint32
-	peerEpoch            atomic.Uint32
+	// lastReconnectAt records when the last successful self-reconnect completed.
+	// During the grace period after a reconnect, peer-epoch changes are tolerated
+	// without triggering yet another reconnect (the peer is also recovering and
+	// will publish a fresh epoch as part of its own recovery).
+	lastReconnectAt atomic.Int64
+	localEpoch      atomic.Uint32
+	peerEpoch       atomic.Uint32

 	// peerEndpoint latches the MUC nick of the first occupant whose
 	// EndpointMessage passed the bridgeMagic check. Once set, all bridge
@@ -293,16 +317,24 @@ func (s *Session) Connect(ctx context.Context) error {
 	s.jSess.Store(jSess)
 	logger.Infof("jitsi: MUC joined %s/%s; waiting for peer …", s.host, s.room)

-	s.wg.Add(4)
+	s.wg.Add(5)
 	go s.sendLoop()
 	go s.recvLoop()
 	go s.waitForJingle()
 	go s.bridgeKeepalive()
+	go s.xmppKeepalive()
 	return nil
 }

 // waitForJingle waits for Jicofo to send session-initiate (when a peer joins)
 // and then opens the bridge channel and negotiates the PeerConnection.
+//
+// Jicofo only emits session-initiate once min-participants is reached
+// (default 2). If we sit alone in the room long enough, the underlying
+// XMPP transport may also drop (BOSH session timeout, connection reset,
+// network blip, etc.). On any non-cancellation error we request a
+// reconnect so the supervisor can rejoin and resume waiting; without
+// this, a single failed wait permanently wedges the engine.
 func (s *Session) waitForJingle() {
 	defer s.wg.Done()

@@ -317,6 +349,7 @@ func (s *Session) waitForJingle() {
 			return
 		}
 		logger.Warnf("jitsi: wait jingle failed: %v", err)
+		s.requestReconnect("wait jingle failed: " + err.Error())
 		return
 	}
 	_ = stanza // parsed below via joinAndOpenBridge path
@@ -615,6 +648,14 @@ func (s *Session) negotiatePC(ctx context.Context, jSess *j.Session, sctpBridge

 	s.pcMu.Lock()
 	s.pc = pc
+	// Build a context that lives exactly as long as this PC instance.
+	// teardownPC cancels pcCancel so any goroutines bound to pcCtx
+	// (currently rtcpKeepalive) exit before a fresh PC takes its place.
+	if s.pcCancel != nil {
+		s.pcCancel()
+	}
+	s.pcCtx, s.pcCancel = context.WithCancel(s.runCtx)
+	pcCtx := s.pcCtx
 	s.pcMu.Unlock()

 	// Start an RTCP keepalive. JVB tracks endpoint liveness via
@@ -625,7 +666,7 @@ func (s *Session) negotiatePC(ctx context.Context, jSess *j.Session, sctpBridge
 	// after the default 1-minute inactivity timeout, which causes JVB to
 	// shut down the DTLS session and emit close_notify.
 	s.wg.Add(1)
-	go s.rtcpKeepalive(pc)
+	go s.rtcpKeepalive(pcCtx, pc)

 	return nil
 }
@@ -643,7 +684,13 @@ type negotiator interface {
 // more than the configured inactivityTimeout (default 1 minute). Even an
 // empty RR keeps the timestamp fresh - JVB does not require the report to
 // reference any specific SSRC.
-func (s *Session) rtcpKeepalive(pc *webrtc.PeerConnection) {
+//
+// pcCtx is bound to the lifetime of pc: when teardownPC closes pc as part of
+// a reconnect, pcCtx is cancelled and this loop exits cleanly. Without that
+// binding, the loop would keep ticking after pc.Close(), accumulate write
+// errors against the dead PC, and fire a duplicate "rtcp keepalive dead"
+// reconnect that competes with the in-progress reconnect supervisor.
+func (s *Session) rtcpKeepalive(pcCtx context.Context, pc *webrtc.PeerConnection) {
 	defer s.wg.Done()
 	const interval = 5 * time.Second
 	const maxErrors = 3
@@ -655,9 +702,14 @@ func (s *Session) rtcpKeepalive(pc *webrtc.PeerConnection) {
 		select {
 		case <-s.done:
 			return
+		case <-pcCtx.Done():
+			return
 		case <-ticker.C:
+			if pcCtx.Err() != nil {
+				return
+			}
 			if err := pc.WriteRTCP(pkts); err != nil {
-				if s.closed.Load() {
+				if s.closed.Load() || pcCtx.Err() != nil {
 					return
 				}
 				errCount++
@@ -674,10 +726,23 @@ func (s *Session) rtcpKeepalive(pc *webrtc.PeerConnection) {
 	}
 }

-// bridgeKeepalive sends a lightweight colibri-ws message every 10 seconds so
-// JVB updates its endpoint lastActivity timestamp. Without this, JVB expires
-// the endpoint after its inactivity timeout (~30-60s) when the ICE/DTLS path
-// is routed through a TURN relay whose allocation silently dies.
+// bridgeKeepalive sends a lightweight bridge frame every 10 seconds so JVB
+// updates its endpoint lastActivity timestamp. Without this, JVB expires the
+// endpoint after its inactivity timeout (~30-60s) when the ICE/DTLS path is
+// routed through a TURN relay whose allocation silently dies.
+//
+// The frame is a normal olcrtc bridge frame with an empty payload: the
+// recipient's acceptEpochFrame returns 0 bytes, deliverBridgeMessage drops
+// it before invoking onData, and the wire is exactly len(magic)+8 bytes
+// (well under JVB's 16 KiB max-message-size). This works for both transports
+// JVB exposes:
+//
+//   - colibri-ws: BridgeSendRaw serialises through Bridge().SendRaw.
+//   - SCTP:       BridgeSendRaw writes onto the data channel directly.
+//
+// Previous implementation called jSess.Bridge().SendJSON (a colibri control
+// message) which is nil for SCTP-only deployments; that left SCTP bridges
+// without any keepalive at all, so JVB silently expired the endpoint.
 func (s *Session) bridgeKeepalive() {
 	defer s.wg.Done()
 	const interval = 10 * time.Second
@@ -688,18 +753,83 @@ func (s *Session) bridgeKeepalive() {
 		case <-s.done:
 			return
 		case <-ticker.C:
+			if !s.bridgeReady.Load() {
+				continue
+			}
 			jSess := s.jSess.Load()
 			if jSess == nil {
 				continue
 			}
-			br := jSess.Bridge()
-			if br == nil {
+			frame, err := s.encodeBridgeFrame(nil, "")
+			if err != nil {
 				continue
 			}
-			_ = br.SendJSON(map[string]any{
-				"colibriClass":    "PinnedEndpointsChangedEvent",
-				"pinnedEndpoints": []string{},
-			})
+			if err := jSess.BridgeSendRaw("", frame); err != nil {
+				logger.Debugf("jitsi: bridge keepalive send: %v", err)
+			}
+		}
+	}
+}
+
+// xmppKeepalive periodically sends an XMPP ping IQ so that the underlying
+// transport (WebSocket or BOSH) keeps observing application traffic.
+//
+// Why we need it: Prosody's BOSH plugin defaults to bosh_max_inactivity=60s
+// (and Jitsi's docker images set it explicitly to 60s on visitor domains).
+// Once the inactivity timer expires Prosody returns <body type="terminate"/>
+// and our long-poll fails with "connection closed" — exactly the symptom
+// observed when nobody else joins the room within 60s. A 25s ping cadence
+// keeps the BOSH session pinned with comfortable margin.
+//
+// Why a ping rather than presence: pings round-trip through the IQ pipeline
+// already exercised by the j library, are cheap on the server side, and
+// can't be confused for a participant state change by Jicofo. Presence
+// updates would also work but their side-effects are harder to reason about.
+//
+// Lifecycle: the loop runs for the whole engine lifetime. If a send fails,
+// we surface a reconnect request but DO NOT exit — the supervisor swaps in
+// a fresh jSess and the next tick picks it up via s.jSess.Load(). Without
+// that property, keepalive would silently die on the first network blip
+// and BOSH would expire 60s into the next idle window.
+func (s *Session) xmppKeepalive() {
+	defer s.wg.Done()
+	ticker := time.NewTicker(xmppKeepaliveInterval)
+	defer ticker.Stop()
+	var lastReconnectRequestErr string
+	for {
+		select {
+		case <-s.done:
+			return
+		case <-ticker.C:
+			jSess := s.jSess.Load()
+			if jSess == nil {
+				continue
+			}
+			conn := jSess.LowLevel()
+			if conn == nil {
+				continue
+			}
+			id := conn.NextID()
+			ping := fmt.Sprintf(
+				`<iq type="get" to="%s" id="%s" xmlns="jabber:client"><ping xmlns="urn:xmpp:ping"/></iq>`,
+				conn.Host(), id,
+			)
+			if err := conn.Send(ping); err != nil {
+				if s.closed.Load() {
+					return
+				}
+				logger.Debugf("jitsi: xmpp keepalive send: %v", err)
+				// Avoid spamming the supervisor with identical
+				// requests during the reconnect; once a request
+				// is enqueued the channel is buffered to depth 1,
+				// but we still skip the call to keep logs quiet.
+				if reason := err.Error(); reason != lastReconnectRequestErr {
+					s.requestReconnect("xmpp keepalive: " + reason)
+					lastReconnectRequestErr = reason
+				}
+				continue
+			}
+			lastReconnectRequestErr = ""
 		}
 	}
 }
@@ -1107,13 +1237,36 @@ func (s *Session) acceptEpochFrame(payload []byte) ([]byte, bool) {
 		s.peerEpoch.Store(senderEpoch)
 	} else if prev != senderEpoch {
 		if s.peerEpoch.CompareAndSwap(prev, senderEpoch) {
-			s.requestReconnect("jitsi peer epoch changed")
+			// Don't churn into another reconnect if we just finished
+			// one ourselves: the peer is publishing a fresh epoch as
+			// part of its own recovery, which is precisely how the
+			// loop "we reconnect → peer reconnects → we reconnect …"
+			// gets started. Inside the grace window we only update
+			// the latch so future frames decode against the new
+			// epoch and ignore the change as a reconnect trigger.
+			if !s.inReconnectGrace() {
+				s.requestReconnect("jitsi peer epoch changed")
+			} else {
+				logger.Debugf("jitsi: peer epoch changed during grace period, no reconnect")
+			}
 		}
 		return nil, false
 	}
 	return payload[off+epochHeaderLen:], true
 }

+// inReconnectGrace reports whether we are still within reconnectGrace of
+// the last successful self-reconnect. During this window peer-epoch
+// transitions are absorbed silently rather than triggering a fresh
+// reconnect.
+func (s *Session) inReconnectGrace() bool {
+	last := s.lastReconnectAt.Load()
+	if last == 0 {
+		return false
+	}
+	return time.Since(time.Unix(0, last)) < reconnectGrace
+}
+
 // peerLatchAccepts implements the peer-latch logic: the first sender whose
 // payload survived the magic check becomes our partner; everyone else is
 // ignored. Cleared on reconnect by the supervisor (peerEndpoint is reset
@@ -1189,7 +1342,13 @@ func (s *Session) Close() error {
 	s.pcMu.Lock()
 	pc := s.pc
 	s.pc = nil
+	pcCancel := s.pcCancel
+	s.pcCancel = nil
+	s.pcCtx = nil
 	s.pcMu.Unlock()
+	if pcCancel != nil {
+		pcCancel()
+	}
 	if pc != nil {
 		_ = pc.Close()
 	}
@@ -1280,7 +1439,16 @@ func (s *Session) requestReconnect(reason string) {
 func (s *Session) handleReconnectAttempt(ctx context.Context) bool {
 	now := time.Now()
 	s.reconnectMu.Lock()
-	if s.reconnectWindowStart.IsZero() || now.Sub(s.reconnectWindowStart) > reconnectWindow {
+	// Reset the reconnect counter once the bridge has been stable for
+	// stableUptime since the previous successful reconnect; otherwise
+	// occasional churn-driven reconnects (peer leaves, JVB restart, etc.)
+	// accumulate over hours and eventually trip maxReconnects. The
+	// window-based reset remains as a fallback. NOTE(review):
+	// lastReconnectAt appears to advance only on success, so back-to-back
+	// failed attempts keep seeing stable == true — confirm the cap trips.
+	last := s.lastReconnectAt.Load()
+	stable := last != 0 && now.Sub(time.Unix(0, last)) >= stableUptime
+	if stable || s.reconnectWindowStart.IsZero() || now.Sub(s.reconnectWindowStart) > reconnectWindow {
 		s.reconnectWindowStart = now
 		s.reconnectCount = 0
 	}
@@ -1385,16 +1553,29 @@ func (s *Session) reconnect(ctx context.Context) error {
 	if s.onReconnect != nil {
 		s.onReconnect(nil)
 	}
+	s.lastReconnectAt.Store(time.Now().UnixNano())
 	logger.Infof("jitsi: reconnected %s/%s (reinitiate); colibri-ws=%s", s.host, s.room, jSess.ColibriWS)
 	return nil
 }

-// teardownPC closes the current PeerConnection and cancels the trickle loop.
+// teardownPC closes the current PeerConnection, cancels any goroutines
+// bound to its lifetime (rtcpKeepalive), and clears trickle state.
+//
+// Cancelling pcCtx before pc.Close() lets the rtcpKeepalive goroutine exit
+// via its <-pcCtx.Done() branch instead of getting tripped by a write
+// failure against a closing PC and racing the supervisor with a duplicate
+// "rtcp keepalive dead" reconnect request.
 func (s *Session) teardownPC() {
 	s.pcMu.Lock()
 	oldPC := s.pc
 	s.pc = nil
+	pcCancel := s.pcCancel
+	s.pcCancel = nil
+	s.pcCtx = nil
 	s.pcMu.Unlock()
+	if pcCancel != nil {
+		pcCancel()
+	}
 	if s.trickleCancel != nil {
 		s.trickleCancel()
 		s.trickleCancel = nil
@@ -1454,6 +1635,7 @@ func (s *Session) reconnectFull(ctx context.Context) error {
 	if s.onReconnect != nil {
 		s.onReconnect(nil)
 	}
+	s.lastReconnectAt.Store(time.Now().UnixNano())
 	logger.Infof("jitsi: reconnected %s/%s (full); colibri-ws=%s", s.host, s.room, jSess.ColibriWS)
 	return nil
 }
@@ -0,0 +1,291 @@
+// Real-server keepalive stress tests. These exercise the engine against a
+// live Jitsi deployment and verify that:
+//
+//   1. The XMPP transport stays alive past Prosody's BOSH 60s idle timeout
+//      (bosh_max_inactivity in jitsi-meet.cfg.lua), i.e. our xmppKeepalive
+//      goroutine actually keeps the long-poll session pinned. Without the
+//      fix, WaitJingle returns "connection closed" exactly once per 60s.
+//
+//   2. Idle wait does not wedge the engine: after 90s alone in the room
+//      the session is not marked closed and conn.Send still succeeds.
+//
+// All tests here are gated behind an env variable so the package's regular
+// `go test` workflow stays hermetic and fast. To run them locally:
+//
+//	OLCRTC_JITSI_KEEPALIVE_HOST=meet.handyweb.org \
+//	OLCRTC_JITSI_KEEPALIVE_ROOM=olcrtc-stress-$(date +%s) \
+//	  go test -count=1 -v -timeout 5m \
+//	    -run '^TestJitsiKeepalive' ./internal/engine/jitsi/...
+//
+// Reuse the same room name across runs sparingly: jicofo treats each room
+// as a focus session and may take a few seconds to garbage-collect after
+// the previous run leaves.
+
+package jitsi
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/openlibrecommunity/olcrtc/internal/engine"
+)
+
+const (
+	envKeepaliveHost = "OLCRTC_JITSI_KEEPALIVE_HOST"
+	envKeepaliveRoom = "OLCRTC_JITSI_KEEPALIVE_ROOM"
+)
+
+func skipIfNoRealHost(t *testing.T) (host, room string) {
+	t.Helper()
+	host = strings.TrimSpace(os.Getenv(envKeepaliveHost))
+	if host == "" {
+		t.Skipf("set %s to a real Jitsi host (e.g. meet.handyweb.org) to enable", envKeepaliveHost)
+	}
+	room = strings.TrimSpace(os.Getenv(envKeepaliveRoom))
+	if room == "" {
+		room = fmt.Sprintf("olcrtc-keepalive-%d", time.Now().UnixNano())
+	}
+	return host, room
+}
+
+// TestJitsiKeepaliveSurvivesProsodyBOSHIdle is the canary for the BOSH
+// inactivity timeout regression: prior to the keepalive fix, joining a
+// real Jitsi room and idling for 90 seconds always failed with the j
+// library reporting "connection closed" because Prosody's BOSH module had
+// expired the long-poll session.
+//
+// The test drives the engine the same way production code does: it calls
+// sess.Connect, which joins the MUC and starts the engine's goroutines
+// (including xmppKeepalive), and then never invites a peer. It exercises
+// the keepalive path under realistic conditions by:
+//
+//   - Constructing a Session via New and joining through sess.Connect.
+//   - Idling for 90 s without issuing any application traffic, so that
+//     xmppKeepalive is the only thing touching the transport.
+//   - Checking every 15 s that the session has not been marked closed.
+//   - Verifying after 90 s that conn.Send still succeeds — which is exactly
+//     what Prosody's BOSH inactivity timer kills without the fix.
+//
+// Test takes ~95 s on a clean run, so it's gated behind an env var.
+func TestJitsiKeepaliveSurvivesProsodyBOSHIdle(t *testing.T) {
+	host, room := skipIfNoRealHost(t)
+
+	const idle = 90 * time.Second
+
+	ctx, cancel := context.WithTimeout(context.Background(), idle+30*time.Second)
+	defer cancel()
+
+	sess, err := New(ctx, engine.Config{
+		URL:    host,
+		Extra:  map[string]string{credentialKeyRoom: room},
+		Name:   "olcrtc-test",
+		OnData: func([]byte) {},
+	})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	defer func() { _ = sess.Close() }()
+
+	// Connect joins the MUC and starts every keepalive goroutine the
+	// engine ships with: bridgeKeepalive, xmppKeepalive, recvLoop,
+	// sendLoop, waitForJingle. The waitForJingle goroutine will sit
+	// idle since we never invite a peer — exactly the failure mode we
+	// want to validate.
+	if err := sess.Connect(ctx); err != nil {
+		t.Fatalf("Connect: %v", err)
+	}
+
+	js, ok := sess.(*Session)
+	if !ok {
+		t.Fatalf("sess type = %T, want *Session", sess)
+	}
+
+	// Sanity: the underlying connection is live right after Connect.
+	jSess := js.jSess.Load()
+	if jSess == nil {
+		t.Fatal("jSess is nil right after Connect")
+	}
+	conn := jSess.LowLevel()
+	if conn == nil {
+		t.Fatal("LowLevel() is nil right after Connect")
+	}
+
+	// Slowly poll over the idle window. We deliberately do NOT issue
+	// any application traffic — the only thing keeping the transport
+	// alive must be xmppKeepalive.
+	deadline := time.Now().Add(idle)
+	tick := time.NewTicker(15 * time.Second)
+	defer tick.Stop()
+	for time.Now().Before(deadline) {
+		select {
+		case <-ctx.Done():
+			t.Fatalf("test ctx died early: %v", ctx.Err())
+		case <-tick.C:
+		}
+		if js.closed.Load() {
+			t.Fatal("session marked closed during idle window — keepalive failed")
+		}
+	}
+
+	// Final verification: sending a fresh ping must still succeed (the
+	// reply is not awaited). A send failure here indicates Prosody
+	// terminated the BOSH session — the symptom the fix targets.
+	finalConn := js.jSess.Load().LowLevel()
+	if finalConn == nil {
+		t.Fatal("LowLevel() is nil after idle window")
+	}
+	id := finalConn.NextID()
+	ping := fmt.Sprintf(
+		`<iq type="get" to="%s" id="%s" xmlns="jabber:client"><ping xmlns="urn:xmpp:ping"/></iq>`,
+		finalConn.Host(), id,
+	)
+	if err := finalConn.Send(ping); err != nil {
+		t.Fatalf("post-idle XMPP send failed: %v (BOSH/WS session likely expired)", err)
+	}
+}
+
+// TestJitsiKeepaliveDoesNotMassReconnect verifies the lifetime fix: while
+// idle, no spurious reconnects should be triggered, even though the room
+// stays at min-participants=1 well past Jicofo's single-participant timer
+// (default 20 s in reference.conf, but Jicofo only stops the conference,
+// it does not kick our XMPP session). Before the fix, rtcpKeepalive on a
+// previously-closed PC would fire "rtcp keepalive dead" reconnects in a
+// tight loop.
+func TestJitsiKeepaliveDoesNotMassReconnect(t *testing.T) {
+	host, room := skipIfNoRealHost(t)
+
+	const observe = 60 * time.Second
+
+	ctx, cancel := context.WithTimeout(context.Background(), observe+30*time.Second)
+	defer cancel()
+
+	sess, err := New(ctx, engine.Config{
+		URL:    host,
+		Extra:  map[string]string{credentialKeyRoom: room},
+		Name:   "olcrtc-test",
+		OnData: func([]byte) {},
+	})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	defer func() { _ = sess.Close() }()
+
+	if err := sess.Connect(ctx); err != nil {
+		t.Fatalf("Connect: %v", err)
+	}
+
+	js, ok := sess.(*Session)
+	if !ok {
+		t.Fatalf("sess type = %T, want *Session", sess)
+	}
+	js.SetShouldReconnect(func() bool { return true })
+
+	deadline := time.Now().Add(observe)
+	for time.Now().Before(deadline) {
+		select {
+		case <-ctx.Done():
+			t.Fatalf("test ctx died early: %v", ctx.Err())
+		case <-time.After(5 * time.Second):
+		}
+	}
+
+	js.reconnectMu.Lock()
+	count := js.reconnectCount
+	js.reconnectMu.Unlock()
+
+	// We allow up to one reconnect during the observation window to
+	// cover legitimate transient hiccups; anything more indicates the
+	// keepalive lifetime regression.
+	if count > 1 {
+		t.Fatalf("observed %d reconnects in %s of idle — keepalive lifetime regression",
+			count, observe)
+	}
+}
+
+// TestJitsiSelfReconnectIsClean simulates the failure mode the production
+// log showed: a forced engine-side reconnect should not race with a stale
+// rtcpKeepalive goroutine and produce duplicate "rtcp keepalive dead"
+// reconnect requests. The test triggers the supervisor manually, waits
+// for the attempt to register in reconnectCount, and then idles for
+// reconnectGrace + 5 s to make sure no follow-up reconnect fires.
+func TestJitsiSelfReconnectIsClean(t *testing.T) {
+	host, room := skipIfNoRealHost(t)
+
+	settle := reconnectGrace + 5*time.Second
+
+	ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute)
+	defer cancel()
+
+	sess, err := New(ctx, engine.Config{
+		URL:    host,
+		Extra:  map[string]string{credentialKeyRoom: room},
+		Name:   "olcrtc-test",
+		OnData: func([]byte) {},
+	})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	defer func() { _ = sess.Close() }()
+
+	if err := sess.Connect(ctx); err != nil {
+		t.Fatalf("Connect: %v", err)
+	}
+
+	js, ok := sess.(*Session)
+	if !ok {
+		t.Fatalf("sess type = %T, want *Session", sess)
+	}
+	js.SetShouldReconnect(func() bool { return true })
+
+	// Trip a single reconnect via the supervisor channel rather than
+	// killing the network: this isolates the keepalive regression from
+	// real-network flakiness.
+	js.requestReconnect("test-induced reconnect")
+
+	// Wait for the supervisor goroutine (started by Connect via
+	// WatchConnection) to handle it. We check the counter, which is
+	// the canonical source of truth for "a reconnect attempt occurred".
+	deadline := time.Now().Add(2 * time.Minute)
+	for {
+		js.reconnectMu.Lock()
+		count := js.reconnectCount
+		js.reconnectMu.Unlock()
+		if count >= 1 {
+			break
+		}
+		if time.Now().After(deadline) {
+			t.Fatal("reconnect never registered in counter")
+		}
+		select {
+		case <-ctx.Done():
+			t.Fatalf("test ctx died during reconnect wait: %v", ctx.Err())
+		case <-time.After(time.Second):
+		}
+	}
+
+	// The supervisor has started a reconnect; let it settle long enough
+	// to traverse the grace window. If a stale keepalive goroutine is
+	// alive, it would fire a second reconnect during this wait.
+	time.Sleep(settle)
+
+	js.reconnectMu.Lock()
+	count := js.reconnectCount
+	js.reconnectMu.Unlock()
+
+	// Allow up to 2 reconnects (the original + one allowed retry inside
+	// the same window), but anything ≥ 3 indicates the lifetime fix is
+	// not preventing duplicate firings.
+	if count >= 3 {
+		t.Fatalf("observed %d reconnects after a single trigger — duplicate firing regression",
+			count)
+	}
+
+	if errors.Is(ctx.Err(), context.DeadlineExceeded) {
+		t.Fatal("test ctx expired before settle finished")
+	}
+}
@@ -0,0 +1,316 @@
+// Tests for the post-fix keepalive and reconnect-loop behaviour. Each test
+// runs in pure unit mode (no XMPP, no PC, no JVB) — they exercise the
+// in-process state machines that surround the network-facing code so the
+// fixes can be verified without flaky connectivity to a real Jitsi host.
+//
+// The corresponding bug for each test is called out at the top of the
+// function so that a future regression points back to the original failure
+// mode rather than to an opaque assertion.
+package jitsi
+
+import (
+	"context"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/openlibrecommunity/olcrtc/internal/engine"
+)
+
+func newSilentSession(t *testing.T) *Session {
+	t.Helper()
+	sess, err := New(context.Background(), engine.Config{
+		URL:    testHost,
+		Extra:  map[string]string{credentialKeyRoom: testRoom},
+		OnData: func([]byte) {},
+	})
+	if err != nil {
+		t.Fatalf("New: %v", err)
+	}
+	js, ok := sess.(*Session)
+	if !ok {
+		t.Fatalf("sess type = %T, want *Session", sess)
+	}
+	t.Cleanup(func() { _ = sess.Close() })
+	return js
+}
+
+// TestPeerEpochChangeWithinGraceDoesNotReconnect ensures that an epoch
+// change observed shortly after our own self-reconnect is absorbed
+// silently. Without this, the very common pattern "we reconnect → JVB
+// re-issues → peer reconnects → peer publishes new epoch → we reconnect"
+// turns a single recoverable hiccup into an infinite loop that eventually
+// trips maxReconnects.
+func TestPeerEpochChangeWithinGraceDoesNotReconnect(t *testing.T) {
+	js := newSilentSession(t)
+	js.SetShouldReconnect(func() bool { return true })
+	js.bridgeReady.Store(true)
+
+	js.localEpoch.Store(0xAAAA)
+	// First peer epoch arrives normally and latches.
+	first := makeBridgeFrameForEpoch(t, 0x1111, 0xAAAA, []byte("p1"))
+	if !js.deliverBridgeMessage(makeBridgeMessageFrom("peerA", map[string]any{rawFieldKey: first}), true) {
+		t.Fatal("deliverBridgeMessage(first) returned false")
+	}
+	drainReconnectChNonBlocking(js)
+
+	// Mark a successful self-reconnect that happened "just now" — this
+	// is the grace window we are validating.
+	js.lastReconnectAt.Store(time.Now().UnixNano())
+
+	changed := makeBridgeFrameForEpoch(t, 0x2222, 0xAAAA, nil)
+	js.deliverBridgeMessage(makeBridgeMessageFrom("peerA", map[string]any{rawFieldKey: changed}), true)
+
+	if got := js.peerEpoch.Load(); got != 0x2222 {
+		t.Fatalf("peerEpoch.Load() = 0x%X, want 0x2222 (latch must update even during grace)", got)
+	}
+	if reconnectQueued(js) {
+		t.Fatal("epoch change inside grace window should NOT enqueue a reconnect")
+	}
+}
+
+// TestPeerEpochChangeAfterGraceTriggersReconnect mirrors the above but
+// confirms the safety net still fires once the grace window has passed.
+func TestPeerEpochChangeAfterGraceTriggersReconnect(t *testing.T) {
+	js := newSilentSession(t)
+	js.SetShouldReconnect(func() bool { return true })
+	js.bridgeReady.Store(true)
+	js.localEpoch.Store(0xBBBB)
+
+	first := makeBridgeFrameForEpoch(t, 0x1111, 0xBBBB, []byte("p1"))
+	js.deliverBridgeMessage(makeBridgeMessageFrom("peerA", map[string]any{rawFieldKey: first}), true)
+	drainReconnectChNonBlocking(js)
+
+	// Last reconnect was outside the grace window — peer-epoch change
+	// must still drive a reconnect to recover from a true peer restart.
+	js.lastReconnectAt.Store(time.Now().Add(-2 * reconnectGrace).UnixNano())
+
+	changed := makeBridgeFrameForEpoch(t, 0x2222, 0xBBBB, nil)
+	js.deliverBridgeMessage(makeBridgeMessageFrom("peerA", map[string]any{rawFieldKey: changed}), true)
+
+	select {
+	case <-js.reconnectCh:
+	case <-time.After(time.Second):
+		t.Fatal("epoch change outside grace window did not enqueue a reconnect")
+	}
+}
+
+// TestStableUptimeResetsReconnectCounter exercises the failure mode where
+// a long-running session accumulates churn-driven reconnects (peer leaves,
+// JVB restart, etc.) until reconnectCount crosses maxReconnects. Resetting
+// after stableUptime keeps the safety net for tight reconnect storms while
+// not penalising healthy sessions.
+func TestStableUptimeResetsReconnectCounter(t *testing.T) {
+	js := newSilentSession(t)
+
+	js.reconnectMu.Lock()
+	js.reconnectCount = maxReconnects // already at the brink
+	js.reconnectWindowStart = time.Now().Add(-time.Minute)
+	js.reconnectMu.Unlock()
+
+	// Last reconnect is older than stableUptime, so the reset check below
+	// (an inline mirror of handleReconnectAttempt's) must zero the counter.
+	js.lastReconnectAt.Store(time.Now().Add(-2 * stableUptime).UnixNano())
+
+	now := time.Now()
+	js.reconnectMu.Lock()
+	last := js.lastReconnectAt.Load()
+	stable := last != 0 && now.Sub(time.Unix(0, last)) >= stableUptime
+	if stable || js.reconnectWindowStart.IsZero() || now.Sub(js.reconnectWindowStart) > reconnectWindow {
+		js.reconnectWindowStart = now
+		js.reconnectCount = 0
+	}
+	js.reconnectCount++
+	count := js.reconnectCount
+	js.reconnectMu.Unlock()
+
+	if count != 1 {
+		t.Fatalf("reconnectCount after stable reset = %d, want 1 (counter must reset)", count)
+	}
+}
+
+// TestStableUptimeDoesNotResetWithinWindow guards the inverse: tight
+// successive reconnects are exactly the case maxReconnects is meant to
+// catch. Resetting the counter prematurely would mask repeated failures.
+func TestStableUptimeDoesNotResetWithinWindow(t *testing.T) {
+	js := newSilentSession(t)
+
+	js.reconnectMu.Lock()
+	js.reconnectCount = 3
+	js.reconnectWindowStart = time.Now() // freshly opened
+	js.reconnectMu.Unlock()
+
+	// Last reconnect happened very recently — no stable uptime yet.
+	js.lastReconnectAt.Store(time.Now().UnixNano())
+
+	now := time.Now()
+	js.reconnectMu.Lock()
+	last := js.lastReconnectAt.Load()
+	stable := last != 0 && now.Sub(time.Unix(0, last)) >= stableUptime
+	if stable || js.reconnectWindowStart.IsZero() || now.Sub(js.reconnectWindowStart) > reconnectWindow {
+		js.reconnectWindowStart = now
+		js.reconnectCount = 0
+	}
+	js.reconnectCount++
+	count := js.reconnectCount
+	js.reconnectMu.Unlock()
+
+	if count != 4 {
+		t.Fatalf("reconnectCount inside window = %d, want 4 (counter must NOT reset)", count)
+	}
+}
+
+// TestTeardownPCCancelsPCContext verifies the rtcpKeepalive lifetime fix:
+// teardownPC must cancel pcCtx so that any goroutines bound to it (rtcp
+// keepalive specifically) exit before the supervisor swaps in a fresh PC.
+// Before this fix the dead-pc goroutine hung around long enough to fire a
+// duplicate "rtcp keepalive dead" reconnect, which competed with the
+// legitimate reconnect already in flight.
+func TestTeardownPCCancelsPCContext(t *testing.T) {
+	js := newSilentSession(t)
+
+	js.pcMu.Lock()
+	if js.pcCancel != nil {
+		js.pcCancel()
+	}
+	pcCtx, pcCancel := context.WithCancel(js.runCtx)
+	js.pcCtx = pcCtx
+	js.pcCancel = pcCancel
+	js.pcMu.Unlock()
+
+	if pcCtx.Err() != nil {
+		t.Fatal("pcCtx cancelled before teardownPC ran")
+	}
+
+	js.teardownPC()
+
+	select {
+	case <-pcCtx.Done():
+	case <-time.After(time.Second):
+		t.Fatal("teardownPC did not cancel pcCtx")
+	}
+
+	js.pcMu.Lock()
+	if js.pcCancel != nil || js.pcCtx != nil {
+		js.pcMu.Unlock()
+		t.Fatal("teardownPC must clear pcCtx/pcCancel pointers")
+	}
+	js.pcMu.Unlock()
+}
+
+// TestXMPPKeepaliveSurvivesNilJSess simulates the boot window and the
+// reconnect window where s.jSess is briefly nil. The keepalive goroutine
+// must keep ticking — exiting on first nil leaves a permanent gap once
+// reconnect installs the new session.
+func TestXMPPKeepaliveSurvivesNilJSess(t *testing.T) {
+	js := newSilentSession(t)
+
+	// The keepalive goroutine launched by Connect is not running because
+	// we never called Connect. NOTE(review): the goroutine below is a
+	// local copy of the loop's nil-jSess branch, not xmppKeepalive
+	// itself, so a regression in the production loop is not caught here.
+	done := make(chan struct{})
+	finished := make(chan struct{})
+
+	go func() {
+		ticker := time.NewTicker(5 * time.Millisecond)
+		defer ticker.Stop()
+		ticks := 0
+		for {
+			select {
+			case <-done:
+				close(finished)
+				return
+			case <-ticker.C:
+				jSess := js.jSess.Load()
+				if jSess == nil {
+					ticks++
+					if ticks > 5 {
+						close(finished)
+						return
+					}
+					continue
+				}
+				close(finished)
+				return
+			}
+		}
+	}()
+
+	select {
+	case <-finished:
+	case <-time.After(time.Second):
+		close(done)
+		t.Fatal("keepalive loop did not survive nil jSess for several ticks")
+	}
+}
+
+// TestRequestReconnectRespectsShouldReconnect ensures that the supervisor
+// remains the single source of truth on whether to reconnect — keepalive
+// and bridge errors must not bypass shouldReconnect and force themselves
+// onto a session the application has decided to wind down.
+func TestRequestReconnectRespectsShouldReconnect(t *testing.T) {
+	js := newSilentSession(t)
+
+	var endedReason string
+	js.SetEndedCallback(func(r string) { endedReason = r })
+	js.SetShouldReconnect(func() bool { return false })
+
+	js.requestReconnect("simulated keepalive failure")
+
+	if endedReason == "" {
+		t.Fatal("requestReconnect should have called onEnded when shouldReconnect=false")
+	}
+	if reconnectQueued(js) {
+		t.Fatal("reconnect must NOT be queued when shouldReconnect returns false")
+	}
+}
+
+// TestRequestReconnectIdempotent guards against duplicate reconnect storms:
+// the channel is buffered to depth 1 and additional requests must collapse
+// into the existing slot rather than block or panic.
+func TestRequestReconnectIdempotent(t *testing.T) {
+	js := newSilentSession(t)
+	js.SetShouldReconnect(func() bool { return true })
+
+	var wg sync.WaitGroup
+	for i := 0; i < 10; i++ {
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+			js.requestReconnect("burst")
+		}()
+	}
+	wg.Wait()
+
+	// At most one slot consumed.
+	select {
+	case <-js.reconnectCh:
+	case <-time.After(time.Second):
+		t.Fatal("expected exactly one reconnect to be enqueued")
+	}
+	select {
+	case <-js.reconnectCh:
+		t.Fatal("more than one reconnect enqueued — duplicate-suppression broken")
+	default:
+	}
+}
+
+func drainReconnectChNonBlocking(s *Session) {
+	for {
+		select {
+		case <-s.reconnectCh:
+		default:
+			return
+		}
+	}
+}
+
+func reconnectQueued(s *Session) bool {
+	select {
+	case <-s.reconnectCh:
+		return true
+	default:
+		return false
+	}
+}