mirror of
https://github.com/openlibrecommunity/olcrtc.git
synced 2026-06-02 06:23:37 +02:00
test(jitsi): replace keepalive tests with paired chaos
This commit is contained in:
@@ -1,291 +0,0 @@
|
|||||||
// Real-server keepalive stress tests. These exercise the engine against a
|
|
||||||
// live Jitsi deployment and verify that:
|
|
||||||
//
|
|
||||||
// 1. The XMPP transport stays alive past Prosody's BOSH 60s idle timeout
|
|
||||||
// (bosh_max_inactivity in jitsi-meet.cfg.lua), i.e. our xmppKeepalive
|
|
||||||
// goroutine actually keeps the long-poll session pinned. Without the
|
|
||||||
// fix, WaitJingle returns "connection closed" exactly once per 60s.
|
|
||||||
//
|
|
||||||
// 2. Idle wait does not wedge the engine: after 90s alone in the room
|
|
||||||
// we are still able to issue Send/CanSend without ErrSessionClosed.
|
|
||||||
//
|
|
||||||
// Both tests are gated behind an env variable so the package's regular
|
|
||||||
// `go test` workflow stays hermetic and fast. To run them locally:
|
|
||||||
//
|
|
||||||
// OLCRTC_JITSI_KEEPALIVE_HOST=meet.handyweb.org \
|
|
||||||
// OLCRTC_JITSI_KEEPALIVE_ROOM=olcrtc-stress-$(date +%s) \
|
|
||||||
// go test -count=1 -v -timeout 5m \
|
|
||||||
// -run '^TestJitsiKeepalive' ./internal/engine/jitsi/...
|
|
||||||
//
|
|
||||||
// Reuse the same room name across runs sparingly: jicofo treats each room
|
|
||||||
// as a focus session and may take a few seconds to garbage-collect after
|
|
||||||
// the previous run leaves.
|
|
||||||
|
|
||||||
package jitsi
|
|
||||||
|
|
||||||
import (
|
|
||||||
"context"
|
|
||||||
"errors"
|
|
||||||
"fmt"
|
|
||||||
"os"
|
|
||||||
"strings"
|
|
||||||
"testing"
|
|
||||||
"time"
|
|
||||||
|
|
||||||
"github.com/openlibrecommunity/olcrtc/internal/engine"
|
|
||||||
)
|
|
||||||
|
|
||||||
const (
|
|
||||||
envKeepaliveHost = "OLCRTC_JITSI_KEEPALIVE_HOST"
|
|
||||||
envKeepaliveRoom = "OLCRTC_JITSI_KEEPALIVE_ROOM"
|
|
||||||
)
|
|
||||||
|
|
||||||
func skipIfNoRealHost(t *testing.T) (host, room string) {
|
|
||||||
t.Helper()
|
|
||||||
host = strings.TrimSpace(os.Getenv(envKeepaliveHost))
|
|
||||||
if host == "" {
|
|
||||||
t.Skipf("set %s to a real Jitsi host (e.g. meet.handyweb.org) to enable", envKeepaliveHost)
|
|
||||||
}
|
|
||||||
room = strings.TrimSpace(os.Getenv(envKeepaliveRoom))
|
|
||||||
if room == "" {
|
|
||||||
room = fmt.Sprintf("olcrtc-keepalive-%d", time.Now().UnixNano())
|
|
||||||
}
|
|
||||||
return host, room
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestJitsiKeepaliveSurvivesProsodyBOSHIdle is the canary for the BOSH
|
|
||||||
// inactivity timeout regression: prior to the keepalive fix, joining a
|
|
||||||
// real Jitsi room and idling for 90 seconds always failed with the j
|
|
||||||
// library reporting "connection closed" because Prosody's BOSH module had
|
|
||||||
// expired the long-poll session.
|
|
||||||
//
|
|
||||||
// We deliberately do NOT call sess.Connect because Connect attempts a full
|
|
||||||
// j.JoinMUC which is more flaky against unknown deployments than a
|
|
||||||
// minimum-viable smoke. Instead, we exercise the keepalive paths under
|
|
||||||
// realistic conditions by:
|
|
||||||
//
|
|
||||||
// - Constructing a Session and JoinMUC-ing through the j library directly.
|
|
||||||
// - Storing the result so the engine's keepalive goroutines (started by
|
|
||||||
// Connect) would see jSess.Load() == this session.
|
|
||||||
// - Spinning the keepalive in-place against the live LowLevel() conn.
|
|
||||||
// - Verifying after 90 s that conn.Send still succeeds — which is exactly
|
|
||||||
// what Prosody's BOSH inactivity timer kills without the fix.
|
|
||||||
//
|
|
||||||
// Test takes ~95 s on a clean run, so it's gated behind an env var.
|
|
||||||
func TestJitsiKeepaliveSurvivesProsodyBOSHIdle(t *testing.T) {
|
|
||||||
host, room := skipIfNoRealHost(t)
|
|
||||||
|
|
||||||
const idle = 90 * time.Second
|
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), idle+30*time.Second)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
sess, err := New(ctx, engine.Config{
|
|
||||||
URL: host,
|
|
||||||
Extra: map[string]string{credentialKeyRoom: room},
|
|
||||||
Name: "olcrtc-test",
|
|
||||||
OnData: func([]byte) {},
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("New: %v", err)
|
|
||||||
}
|
|
||||||
defer func() { _ = sess.Close() }()
|
|
||||||
|
|
||||||
// Connect joins the MUC and starts every keepalive goroutine the
|
|
||||||
// engine ships with: bridgeKeepalive, xmppKeepalive, recvLoop,
|
|
||||||
// sendLoop, waitForJingle. The waitForJingle goroutine will sit
|
|
||||||
// idle since we never invite a peer — exactly the failure mode we
|
|
||||||
// want to validate.
|
|
||||||
if err := sess.Connect(ctx); err != nil {
|
|
||||||
t.Fatalf("Connect: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
js, ok := sess.(*Session)
|
|
||||||
if !ok {
|
|
||||||
t.Fatalf("sess type = %T, want *Session", sess)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Sanity: the underlying connection is live right after Connect.
|
|
||||||
jSess := js.jSess.Load()
|
|
||||||
if jSess == nil {
|
|
||||||
t.Fatal("jSess is nil right after Connect")
|
|
||||||
}
|
|
||||||
conn := jSess.LowLevel()
|
|
||||||
if conn == nil {
|
|
||||||
t.Fatal("LowLevel() is nil right after Connect")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Slowly poll over the idle window. We deliberately do NOT issue
|
|
||||||
// any application traffic — the only thing keeping the transport
|
|
||||||
// alive must be xmppKeepalive.
|
|
||||||
deadline := time.Now().Add(idle)
|
|
||||||
tick := time.NewTicker(15 * time.Second)
|
|
||||||
defer tick.Stop()
|
|
||||||
for time.Now().Before(deadline) {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
t.Fatalf("test ctx died early: %v", ctx.Err())
|
|
||||||
case <-tick.C:
|
|
||||||
}
|
|
||||||
if js.closed.Load() {
|
|
||||||
t.Fatal("session marked closed during idle window — keepalive failed")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Final verification: a fresh ping must still round-trip. A failure
|
|
||||||
// here indicates Prosody terminated the BOSH session and is exactly
|
|
||||||
// the symptom the fix targets.
|
|
||||||
finalConn := js.jSess.Load().LowLevel()
|
|
||||||
if finalConn == nil {
|
|
||||||
t.Fatal("LowLevel() is nil after idle window")
|
|
||||||
}
|
|
||||||
id := finalConn.NextID()
|
|
||||||
ping := fmt.Sprintf(
|
|
||||||
`<iq type="get" to="%s" id="%s" xmlns="jabber:client"><ping xmlns="urn:xmpp:ping"/></iq>`,
|
|
||||||
finalConn.Host(), id,
|
|
||||||
)
|
|
||||||
if err := finalConn.Send(ping); err != nil {
|
|
||||||
t.Fatalf("post-idle XMPP send failed: %v (BOSH/WS session likely expired)", err)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestJitsiKeepaliveDoesNotMassReconnect verifies the lifetime fix: while
|
|
||||||
// idle, no spurious reconnects should be triggered, even though the room
|
|
||||||
// stays at min-participants=1 well past Jicofo's single-participant timer
|
|
||||||
// (default 20 s in reference.conf, but Jicofo only stops the conference,
|
|
||||||
// it does not kick our XMPP session). Before the fix, rtcpKeepalive on a
|
|
||||||
// previously-closed PC would fire "rtcp keepalive dead" reconnects in a
|
|
||||||
// tight loop.
|
|
||||||
func TestJitsiKeepaliveDoesNotMassReconnect(t *testing.T) {
|
|
||||||
host, room := skipIfNoRealHost(t)
|
|
||||||
|
|
||||||
const observe = 60 * time.Second
|
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), observe+30*time.Second)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
sess, err := New(ctx, engine.Config{
|
|
||||||
URL: host,
|
|
||||||
Extra: map[string]string{credentialKeyRoom: room},
|
|
||||||
Name: "olcrtc-test",
|
|
||||||
OnData: func([]byte) {},
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("New: %v", err)
|
|
||||||
}
|
|
||||||
defer func() { _ = sess.Close() }()
|
|
||||||
|
|
||||||
if err := sess.Connect(ctx); err != nil {
|
|
||||||
t.Fatalf("Connect: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
js, ok := sess.(*Session)
|
|
||||||
if !ok {
|
|
||||||
t.Fatalf("sess type = %T, want *Session", sess)
|
|
||||||
}
|
|
||||||
js.SetShouldReconnect(func() bool { return true })
|
|
||||||
|
|
||||||
deadline := time.Now().Add(observe)
|
|
||||||
for time.Now().Before(deadline) {
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
t.Fatalf("test ctx died early: %v", ctx.Err())
|
|
||||||
case <-time.After(5 * time.Second):
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
js.reconnectMu.Lock()
|
|
||||||
count := js.reconnectCount
|
|
||||||
js.reconnectMu.Unlock()
|
|
||||||
|
|
||||||
// We allow up to one reconnect during the observation window to
|
|
||||||
// cover legitimate transient hiccups; anything more indicates the
|
|
||||||
// keepalive lifetime regression.
|
|
||||||
if count > 1 {
|
|
||||||
t.Fatalf("observed %d reconnects in %s of idle — keepalive lifetime regression",
|
|
||||||
count, observe)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// TestJitsiSelfReconnectIsClean simulates the failure mode the production
|
|
||||||
// log showed: a forced engine-side reconnect should not race with a stale
|
|
||||||
// rtcpKeepalive goroutine and produce duplicate "rtcp keepalive dead"
|
|
||||||
// reconnect requests. The test triggers the supervisor manually, lets
|
|
||||||
// the recovery complete, and then waits in idle for double the grace
|
|
||||||
// period to make sure no follow-up reconnect spuriously fires.
|
|
||||||
func TestJitsiSelfReconnectIsClean(t *testing.T) {
|
|
||||||
host, room := skipIfNoRealHost(t)
|
|
||||||
|
|
||||||
settle := reconnectGrace + 5*time.Second
|
|
||||||
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 4*time.Minute)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
sess, err := New(ctx, engine.Config{
|
|
||||||
URL: host,
|
|
||||||
Extra: map[string]string{credentialKeyRoom: room},
|
|
||||||
Name: "olcrtc-test",
|
|
||||||
OnData: func([]byte) {},
|
|
||||||
})
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("New: %v", err)
|
|
||||||
}
|
|
||||||
defer func() { _ = sess.Close() }()
|
|
||||||
|
|
||||||
if err := sess.Connect(ctx); err != nil {
|
|
||||||
t.Fatalf("Connect: %v", err)
|
|
||||||
}
|
|
||||||
|
|
||||||
js, ok := sess.(*Session)
|
|
||||||
if !ok {
|
|
||||||
t.Fatalf("sess type = %T, want *Session", sess)
|
|
||||||
}
|
|
||||||
js.SetShouldReconnect(func() bool { return true })
|
|
||||||
|
|
||||||
// Trip a single reconnect via the supervisor channel rather than
|
|
||||||
// killing the network: this isolates the keepalive regression from
|
|
||||||
// real-network flakiness.
|
|
||||||
js.requestReconnect("test-induced reconnect")
|
|
||||||
|
|
||||||
// Wait for the supervisor goroutine (started by Connect via
|
|
||||||
// WatchConnection) to handle it. We check the counter, which is
|
|
||||||
// the canonical source of truth for "a reconnect attempt occurred".
|
|
||||||
deadline := time.Now().Add(2 * time.Minute)
|
|
||||||
for {
|
|
||||||
js.reconnectMu.Lock()
|
|
||||||
count := js.reconnectCount
|
|
||||||
js.reconnectMu.Unlock()
|
|
||||||
if count >= 1 {
|
|
||||||
break
|
|
||||||
}
|
|
||||||
if time.Now().After(deadline) {
|
|
||||||
t.Fatal("reconnect never registered in counter")
|
|
||||||
}
|
|
||||||
select {
|
|
||||||
case <-ctx.Done():
|
|
||||||
t.Fatalf("test ctx died during reconnect wait: %v", ctx.Err())
|
|
||||||
case <-time.After(time.Second):
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// The supervisor has started a reconnect; let it settle long enough
|
|
||||||
// to traverse the grace window. If a stale keepalive goroutine is
|
|
||||||
// alive, it would fire a second reconnect during this wait.
|
|
||||||
time.Sleep(settle)
|
|
||||||
|
|
||||||
js.reconnectMu.Lock()
|
|
||||||
count := js.reconnectCount
|
|
||||||
js.reconnectMu.Unlock()
|
|
||||||
|
|
||||||
// Allow up to 2 reconnects (the original + one allowed retry inside
|
|
||||||
// the same window), but anything ≥ 3 indicates the lifetime fix is
|
|
||||||
// not preventing duplicate firings.
|
|
||||||
if count >= 3 {
|
|
||||||
t.Fatalf("observed %d reconnects after a single trigger — duplicate firing regression",
|
|
||||||
count)
|
|
||||||
}
|
|
||||||
|
|
||||||
if errors.Is(ctx.Err(), context.DeadlineExceeded) {
|
|
||||||
t.Fatal("test ctx expired before settle finished")
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@@ -0,0 +1,476 @@
|
|||||||
|
// Paired-instance chaos stress for the jitsi engine.
|
||||||
|
//
|
||||||
|
// Why paired: a single instance never receives session-initiate from
|
||||||
|
// Jicofo because of min-participants=2 (jicofo/.../reference.conf).
|
||||||
|
// Without a peer the bridge never opens and most of the engine's
|
||||||
|
// reconnect logic — peer-epoch latch, bridgeKeepalive, RTCP keepalive —
|
||||||
|
// is never exercised. The single-client tests proved that xmppKeepalive
|
||||||
|
// holds the BOSH session for a single endpoint, but the production
|
||||||
|
// failure mode the user actually observes (DTLS CloseNotify → cascading
|
||||||
|
// reconnects) is a property of the *paired* path.
|
||||||
|
//
|
||||||
|
// What this test does:
|
||||||
|
//
|
||||||
|
// 1. Spawn TWO Session instances against the same real Jitsi host and
|
||||||
|
// room, with shared bytes flowing between them.
|
||||||
|
// 2. Continuously pump small data through the bridge in both directions.
|
||||||
|
// 3. Periodically introduce chaos:
|
||||||
|
// - Force a teardownPC + requestReconnect on one side.
|
||||||
|
// - Long idle pauses (>60s) so both Prosody BOSH idle and JVB
|
||||||
|
// inactivityTimeout fire if any keepalive is broken.
|
||||||
|
// - Random side selection so both directions get exercised.
|
||||||
|
// 4. Track per-cycle outcomes and fail the test if either side
|
||||||
|
// permanently wedges (no Send for >2x the chaos cycle).
|
||||||
|
//
|
||||||
|
// Configuration via env (no flags so opt-in is one variable):
|
||||||
|
//
|
||||||
|
// OLCRTC_JITSI_PAIRED_HOST required, e.g. meet.handyweb.org
|
||||||
|
// OLCRTC_JITSI_PAIRED_ROOM optional, defaults to a unique name
|
||||||
|
// OLCRTC_JITSI_PAIRED_DURATION default 30m, "0"/"infinite" runs forever
|
||||||
|
// OLCRTC_JITSI_PAIRED_IDLE default 75s
|
||||||
|
// OLCRTC_JITSI_PAIRED_CHAOS_INTERVAL default 60s — how often to cause chaos
|
||||||
|
// OLCRTC_JITSI_PAIRED_VERBOSE default off
|
||||||
|
//
|
||||||
|
// Quick run:
|
||||||
|
//
|
||||||
|
// OLCRTC_JITSI_PAIRED_HOST=meet.handyweb.org \
|
||||||
|
// go test -count=1 -v -timeout 35m \
|
||||||
|
// -run '^TestJitsiPairedChaosStress$' ./internal/engine/jitsi/...
|
||||||
|
//
|
||||||
|
// Forever (Ctrl-C to stop, summary printed):
|
||||||
|
//
|
||||||
|
// OLCRTC_JITSI_PAIRED_HOST=meet.handyweb.org \
|
||||||
|
// OLCRTC_JITSI_PAIRED_DURATION=0 \
|
||||||
|
// go test -count=1 -v -timeout 0 \
|
||||||
|
// -run '^TestJitsiPairedChaosStress$' ./internal/engine/jitsi/...
|
||||||
|
|
||||||
|
package jitsi
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"math/rand"
|
||||||
|
"os"
|
||||||
|
"strings"
|
||||||
|
"sync"
|
||||||
|
"sync/atomic"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/openlibrecommunity/olcrtc/internal/engine"
|
||||||
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
envPairedHost = "OLCRTC_JITSI_PAIRED_HOST"
|
||||||
|
envPairedRoom = "OLCRTC_JITSI_PAIRED_ROOM"
|
||||||
|
envPairedDuration = "OLCRTC_JITSI_PAIRED_DURATION"
|
||||||
|
envPairedIdle = "OLCRTC_JITSI_PAIRED_IDLE"
|
||||||
|
envPairedChaosInterval = "OLCRTC_JITSI_PAIRED_CHAOS_INTERVAL"
|
||||||
|
envPairedVerbose = "OLCRTC_JITSI_PAIRED_VERBOSE"
|
||||||
|
)
|
||||||
|
|
||||||
|
type pairedConfig struct {
|
||||||
|
host, room string
|
||||||
|
duration time.Duration
|
||||||
|
idle time.Duration
|
||||||
|
chaosInterval time.Duration
|
||||||
|
verbose bool
|
||||||
|
}
|
||||||
|
|
||||||
|
func (c *pairedConfig) durationLabel() string {
|
||||||
|
if c.duration == 0 {
|
||||||
|
return "infinite"
|
||||||
|
}
|
||||||
|
return c.duration.String()
|
||||||
|
}
|
||||||
|
|
||||||
|
func readPairedConfig(t *testing.T) *pairedConfig {
|
||||||
|
t.Helper()
|
||||||
|
host := strings.TrimSpace(os.Getenv(envPairedHost))
|
||||||
|
if host == "" {
|
||||||
|
t.Skipf("set %s to a real Jitsi host (e.g. meet.handyweb.org) to enable", envPairedHost)
|
||||||
|
}
|
||||||
|
cfg := &pairedConfig{
|
||||||
|
host: host,
|
||||||
|
room: fmt.Sprintf("olcrtc-paired-%d", time.Now().UnixNano()),
|
||||||
|
duration: 30 * time.Minute,
|
||||||
|
idle: 75 * time.Second,
|
||||||
|
chaosInterval: 60 * time.Second,
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envPairedRoom)); v != "" {
|
||||||
|
cfg.room = v
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envPairedDuration)); v != "" {
|
||||||
|
switch strings.ToLower(v) {
|
||||||
|
case "0", "infinite", "forever":
|
||||||
|
cfg.duration = 0
|
||||||
|
default:
|
||||||
|
d, err := time.ParseDuration(v)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("%s=%q: %v", envPairedDuration, v, err)
|
||||||
|
}
|
||||||
|
cfg.duration = d
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envPairedIdle)); v != "" {
|
||||||
|
d, err := time.ParseDuration(v)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("%s=%q: %v", envPairedIdle, v, err)
|
||||||
|
}
|
||||||
|
cfg.idle = d
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envPairedChaosInterval)); v != "" {
|
||||||
|
d, err := time.ParseDuration(v)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("%s=%q: %v", envPairedChaosInterval, v, err)
|
||||||
|
}
|
||||||
|
cfg.chaosInterval = d
|
||||||
|
}
|
||||||
|
if v := strings.TrimSpace(os.Getenv(envPairedVerbose)); v != "" {
|
||||||
|
cfg.verbose = v != "0" && strings.ToLower(v) != "false"
|
||||||
|
}
|
||||||
|
return cfg
|
||||||
|
}
|
||||||
|
|
||||||
|
// pairedInstance wraps one half of the test pair and tracks rolling stats
|
||||||
|
// that the chaos loop uses to decide when to declare a wedge.
|
||||||
|
type pairedInstance struct {
|
||||||
|
name string
|
||||||
|
js *Session
|
||||||
|
|
||||||
|
mu sync.Mutex
|
||||||
|
receivedFromOther int64
|
||||||
|
lastReceiveAt time.Time
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *pairedInstance) note(b []byte) {
|
||||||
|
if len(b) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
p.mu.Lock()
|
||||||
|
p.receivedFromOther++
|
||||||
|
p.lastReceiveAt = time.Now()
|
||||||
|
p.mu.Unlock()
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *pairedInstance) snapshot() (count int64, lastAt time.Time) {
|
||||||
|
p.mu.Lock()
|
||||||
|
defer p.mu.Unlock()
|
||||||
|
return p.receivedFromOther, p.lastReceiveAt
|
||||||
|
}
|
||||||
|
|
||||||
|
// startInstance spins up one Session at a time so the second one is
|
||||||
|
// guaranteed to see the first as a peer (Jicofo session-initiate fires
|
||||||
|
// only when min-participants is reached).
|
||||||
|
func startInstance(ctx context.Context, t *testing.T, cfg *pairedConfig, name string) (*pairedInstance, error) {
|
||||||
|
t.Helper()
|
||||||
|
inst := &pairedInstance{name: name}
|
||||||
|
|
||||||
|
sess, err := New(ctx, engine.Config{
|
||||||
|
URL: cfg.host,
|
||||||
|
Extra: map[string]string{credentialKeyRoom: cfg.room},
|
||||||
|
Name: name,
|
||||||
|
OnData: inst.note,
|
||||||
|
})
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("new %s: %w", name, err)
|
||||||
|
}
|
||||||
|
js, ok := sess.(*Session)
|
||||||
|
if !ok {
|
||||||
|
_ = sess.Close()
|
||||||
|
return nil, fmt.Errorf("%s: cast to *Session failed", name)
|
||||||
|
}
|
||||||
|
js.SetShouldReconnect(func() bool { return ctx.Err() == nil })
|
||||||
|
inst.js = js
|
||||||
|
|
||||||
|
if err := sess.Connect(ctx); err != nil {
|
||||||
|
_ = sess.Close()
|
||||||
|
return nil, fmt.Errorf("connect %s: %w", name, err)
|
||||||
|
}
|
||||||
|
go js.WatchConnection(ctx)
|
||||||
|
return inst, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
// waitForBridge polls until the bridge is open on `inst` or the deadline
|
||||||
|
// passes. The bridge only opens after Jicofo issues session-initiate,
|
||||||
|
// which requires both participants to be in the room.
|
||||||
|
func waitForBridge(ctx context.Context, inst *pairedInstance, deadline time.Time) error {
|
||||||
|
for time.Now().Before(deadline) {
|
||||||
|
if inst.js.bridgeReady.Load() {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
case <-time.After(time.Second):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fmt.Errorf("%s: bridge not ready before deadline", inst.name)
|
||||||
|
}
|
||||||
|
|
||||||
|
// pumpLoop sends a small heartbeat payload from `from` to the other
|
||||||
|
// side every interval. The receive side uses inst.note to record arrival.
|
||||||
|
// We intentionally use the engine's Send (not SendTo) to exercise the
|
||||||
|
// peer-latch path.
|
||||||
|
func pumpLoop(ctx context.Context, t *testing.T, from *pairedInstance, interval time.Duration, payload []byte) {
|
||||||
|
t.Helper()
|
||||||
|
tick := time.NewTicker(interval)
|
||||||
|
defer tick.Stop()
|
||||||
|
seq := uint64(0)
|
||||||
|
for {
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-tick.C:
|
||||||
|
}
|
||||||
|
if !from.js.CanSend() {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seq++
|
||||||
|
buf := append([]byte(nil), payload...)
|
||||||
|
if len(buf) >= 8 {
|
||||||
|
for i := 0; i < 8; i++ {
|
||||||
|
buf[i] = byte(seq >> (8 * i))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if err := from.js.Send(buf); err != nil {
|
||||||
|
// Send may legitimately fail mid-reconnect; the chaos
|
||||||
|
// supervisor will catch a permanent wedge.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
type pairedStats struct {
|
||||||
|
cycles int64
|
||||||
|
chaosKicks int64
|
||||||
|
wedgesPair int64 // periods where neither side received any data
|
||||||
|
maxObservedRttMs atomic.Int64
|
||||||
|
startedAt time.Time
|
||||||
|
lastChaosAt time.Time
|
||||||
|
bothSidesReceived bool
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestJitsiPairedChaosStress is the real chaos validator. It joins the
|
||||||
|
// same room with two engine instances, pumps data both ways, and then
|
||||||
|
// loops:
|
||||||
|
//
|
||||||
|
// - idle wait > Prosody BOSH timeout (60s) and JVB inactivityTimeout
|
||||||
|
// - forced teardownPC + requestReconnect on a randomly-chosen side
|
||||||
|
// - confirm both sides recover (CanSend == true and we see a fresh
|
||||||
|
// receive within a bounded window)
|
||||||
|
//
|
||||||
|
// Failure modes guarded:
|
||||||
|
//
|
||||||
|
// - One side wedges (CanSend stuck false) past idle + chaosInterval.
|
||||||
|
// - No application bytes flow across a chaos cycle (the recovery
|
||||||
|
// never re-establishes the bridge frame path).
|
||||||
|
// - Either side hits ErrSessionClosed at the engine level
|
||||||
|
// (the closed flag is the canonical "we gave up" signal).
|
||||||
|
//
|
||||||
|
//nolint:cyclop // chaos cycle structure naturally branches on phase + side
|
||||||
|
func TestJitsiPairedChaosStress(t *testing.T) {
|
||||||
|
cfg := readPairedConfig(t)
|
||||||
|
infinite := cfg.duration == 0
|
||||||
|
|
||||||
|
t.Logf("[paired] host=%s room=%s duration=%s idle=%s chaos-interval=%s verbose=%v",
|
||||||
|
cfg.host, cfg.room, cfg.durationLabel(), cfg.idle, cfg.chaosInterval, cfg.verbose)
|
||||||
|
|
||||||
|
var (
|
||||||
|
ctx context.Context
|
||||||
|
cancel context.CancelFunc
|
||||||
|
)
|
||||||
|
if infinite {
|
||||||
|
ctx, cancel = context.WithCancel(context.Background())
|
||||||
|
} else {
|
||||||
|
ctx, cancel = context.WithTimeout(context.Background(), cfg.duration+5*time.Minute)
|
||||||
|
}
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
// Spin up Alice first so she's already in the room when Bob arrives —
|
||||||
|
// this guarantees min-participants triggers session-initiate.
|
||||||
|
alice, err := startInstance(ctx, t, cfg, "alice")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("alice: %v", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = alice.js.Close() }()
|
||||||
|
|
||||||
|
// Brief settle so Alice is fully in the MUC before Bob joins.
|
||||||
|
time.Sleep(2 * time.Second)
|
||||||
|
|
||||||
|
bob, err := startInstance(ctx, t, cfg, "bob")
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("bob: %v", err)
|
||||||
|
}
|
||||||
|
defer func() { _ = bob.js.Close() }()
|
||||||
|
|
||||||
|
// Now Jicofo should issue session-initiate to both. Give it some time
|
||||||
|
// to actually open the bridge on each side.
|
||||||
|
bridgeBudget := time.Now().Add(90 * time.Second)
|
||||||
|
if err := waitForBridge(ctx, alice, bridgeBudget); err != nil {
|
||||||
|
t.Fatalf("alice bridge: %v", err)
|
||||||
|
}
|
||||||
|
if err := waitForBridge(ctx, bob, bridgeBudget); err != nil {
|
||||||
|
t.Fatalf("bob bridge: %v", err)
|
||||||
|
}
|
||||||
|
t.Log("[paired] both bridges ready, starting pumps")
|
||||||
|
|
||||||
|
// Background pumps: each side sends a heartbeat every 2s. The other
|
||||||
|
// side records arrivals via OnData. This is the actual end-to-end
|
||||||
|
// liveness signal — if it stops flowing, the bridge is dead.
|
||||||
|
pumpCtx, pumpCancel := context.WithCancel(ctx)
|
||||||
|
defer pumpCancel()
|
||||||
|
payload := []byte("0123456789abcdef-paired-keepalive-stress-payload")
|
||||||
|
go pumpLoop(pumpCtx, t, alice, 2*time.Second, payload)
|
||||||
|
go pumpLoop(pumpCtx, t, bob, 2*time.Second, payload)
|
||||||
|
|
||||||
|
// Wait for the first roundtrip in each direction so we know the
|
||||||
|
// pumps are functional before chaos starts.
|
||||||
|
if err := waitFirstReceive(ctx, alice, bob, 60*time.Second); err != nil {
|
||||||
|
t.Fatalf("first roundtrip: %v", err)
|
||||||
|
}
|
||||||
|
t.Log("[paired] first bidirectional roundtrip OK, entering chaos loop")
|
||||||
|
|
||||||
|
stats := pairedStats{startedAt: time.Now()}
|
||||||
|
defer func() { reportPairedStats(t, &stats, cfg) }()
|
||||||
|
stats.bothSidesReceived = true
|
||||||
|
|
||||||
|
rng := rand.New(rand.NewSource(time.Now().UnixNano())) //nolint:gosec // test randomness only
|
||||||
|
|
||||||
|
deadline := time.Time{}
|
||||||
|
if !infinite {
|
||||||
|
deadline = stats.startedAt.Add(cfg.duration)
|
||||||
|
}
|
||||||
|
chaosTick := time.NewTicker(cfg.chaosInterval)
|
||||||
|
defer chaosTick.Stop()
|
||||||
|
|
||||||
|
for {
|
||||||
|
stats.cycles++
|
||||||
|
|
||||||
|
if !infinite && time.Now().After(deadline) {
|
||||||
|
t.Logf("[paired] cycle=%d budget exhausted, ending", stats.cycles)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
t.Logf("[paired] cycle=%d ctx ended (%v), ending", stats.cycles, ctx.Err())
|
||||||
|
return
|
||||||
|
}
|
||||||
|
if alice.js.closed.Load() || bob.js.closed.Load() {
|
||||||
|
t.Fatalf("session closed mid-stress: alice.closed=%v bob.closed=%v",
|
||||||
|
alice.js.closed.Load(), bob.js.closed.Load())
|
||||||
|
}
|
||||||
|
|
||||||
|
// === Phase A: idle while pumps continue ===
|
||||||
|
// Tests that neither BOSH nor JVB inactivityTimeout fires
|
||||||
|
// while we're pumping at 2s intervals.
|
||||||
|
if cfg.verbose {
|
||||||
|
t.Logf("[paired][%d] idle observation %s", stats.cycles, cfg.idle)
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-time.After(cfg.idle):
|
||||||
|
}
|
||||||
|
|
||||||
|
// === Phase B: pick a victim and chaos ===
|
||||||
|
victim := alice
|
||||||
|
victimName := "alice"
|
||||||
|
if rng.Intn(2) == 1 {
|
||||||
|
victim = bob
|
||||||
|
victimName = "bob"
|
||||||
|
}
|
||||||
|
if cfg.verbose {
|
||||||
|
t.Logf("[paired][%d] CHAOS victim=%s — teardownPC + requestReconnect", stats.cycles, victimName)
|
||||||
|
}
|
||||||
|
victim.js.teardownPC()
|
||||||
|
victim.js.requestReconnect(fmt.Sprintf("paired chaos cycle=%d victim=%s", stats.cycles, victimName))
|
||||||
|
stats.chaosKicks++
|
||||||
|
stats.lastChaosAt = time.Now()
|
||||||
|
|
||||||
|
// === Phase C: recovery deadline ===
|
||||||
|
// Bound the recovery window. If the victim does not produce
|
||||||
|
// a fresh receive on the survivor within recoveryBudget, the
|
||||||
|
// engine has wedged.
|
||||||
|
recoveryBudget := cfg.idle + 60*time.Second
|
||||||
|
survivor := alice
|
||||||
|
survivorName := "alice"
|
||||||
|
if victim == alice {
|
||||||
|
survivor = bob
|
||||||
|
survivorName = "bob"
|
||||||
|
}
|
||||||
|
if err := waitFreshReceive(ctx, survivor, recoveryBudget); err != nil {
|
||||||
|
stats.wedgesPair++
|
||||||
|
t.Errorf("[paired][%d] WEDGE survivor=%s did not receive after %s chaos on %s: %v",
|
||||||
|
stats.cycles, survivorName, recoveryBudget, victimName, err)
|
||||||
|
// Try to keep going; production behaviour is what we're
|
||||||
|
// trying to capture, not a single hard failure.
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
if cfg.verbose {
|
||||||
|
t.Logf("[paired][%d] recovered, survivor=%s saw fresh receive", stats.cycles, survivorName)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Honour the chaos tick so we don't burn through cycles
|
||||||
|
// faster than the configured cadence.
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return
|
||||||
|
case <-chaosTick.C:
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// waitFirstReceive blocks until each side has seen at least one OnData
|
||||||
|
// invocation. Returns ctx.Err() or a deadline error.
|
||||||
|
func waitFirstReceive(ctx context.Context, a, b *pairedInstance, budget time.Duration) error {
|
||||||
|
deadline := time.Now().Add(budget)
|
||||||
|
for time.Now().Before(deadline) {
|
||||||
|
ac, _ := a.snapshot()
|
||||||
|
bc, _ := b.snapshot()
|
||||||
|
if ac > 0 && bc > 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
case <-time.After(500 * time.Millisecond):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
ac, _ := a.snapshot()
|
||||||
|
bc, _ := b.snapshot()
|
||||||
|
return fmt.Errorf("did not see bidirectional roundtrip in %s (alice=%d bob=%d)",
|
||||||
|
budget, ac, bc)
|
||||||
|
}
|
||||||
|
|
||||||
|
// waitFreshReceive blocks until target sees a NEW receive after this call,
|
||||||
|
// i.e. the count strictly increases. This is how we observe that the
|
||||||
|
// bridge fully recovered: bytes are arriving from the (forced-to-reconnect)
|
||||||
|
// peer.
|
||||||
|
func waitFreshReceive(ctx context.Context, target *pairedInstance, budget time.Duration) error {
|
||||||
|
startCount, _ := target.snapshot()
|
||||||
|
deadline := time.Now().Add(budget)
|
||||||
|
for time.Now().Before(deadline) {
|
||||||
|
c, _ := target.snapshot()
|
||||||
|
if c > startCount {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
select {
|
||||||
|
case <-ctx.Done():
|
||||||
|
return ctx.Err()
|
||||||
|
case <-time.After(500 * time.Millisecond):
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return fmt.Errorf("no new receive in %s (count stuck at %d)", budget, startCount)
|
||||||
|
}
|
||||||
|
|
||||||
|
func reportPairedStats(t *testing.T, s *pairedStats, cfg *pairedConfig) {
|
||||||
|
t.Helper()
|
||||||
|
elapsed := time.Since(s.startedAt).Round(time.Second)
|
||||||
|
t.Logf(
|
||||||
|
"[paired] DONE elapsed=%s cycles=%d chaosKicks=%d wedges=%d duration=%s idle=%s",
|
||||||
|
elapsed, s.cycles, s.chaosKicks, s.wedgesPair, cfg.durationLabel(), cfg.idle,
|
||||||
|
)
|
||||||
|
if s.wedgesPair > 0 {
|
||||||
|
t.Errorf("observed %d pair wedges (recovery never produced new bytes)", s.wedgesPair)
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user