diff --git a/.claude-flow/horizons/aether-arena-aa.json b/.claude-flow/horizons/aether-arena-aa.json new file mode 100644 index 00000000..fb4cb2be --- /dev/null +++ b/.claude-flow/horizons/aether-arena-aa.json @@ -0,0 +1,119 @@ +{ + "id": "aether-arena-aa", + "name": "AetherArena (AA) — Official Spatial-Intelligence Benchmark", + "adr": "ADR-149", + "adrPath": "docs/adr/ADR-149-public-community-leaderboard-huggingface.md", + "status": "Accepted", + "initializedDate": "2026-05-30", + "targetDate": "2026-08-31", + "exitCriteria": "Benchmark INFRASTRUCTURE done, tested, CI-gated, deploy-ready: aa_score_runner.rs passes deterministic fixture test; CI harness-gate green on every PR; aether-arena repo scaffold committed (README four-part framing + aa-submission.toml schema + VERIFY.md); public smoke split committed; HF Space lifecycle skeleton deployed; signed Parquet ledger functional; RuView baseline PCK@20 ~2.5% entered; ADR-149 §7 acceptance test (five-step stranger test) passes. NOTE: ML SOTA (MM-Fi PCK@20 ~72%) is a separate long-running stretch goal blocked on ADR-079 camera-ground-truth — it is NOT an infra exit criterion.", + "baselineState": { + "adrStatus": "Accepted, committed 2026-05-30", + "scorerCode": "ruview_metrics.rs + ablation.rs + proof.rs exist in wifi-densepose-train; aa_score_runner.rs not yet created", + "aetherArenaRepo": "does not exist yet — needs user authorization to create ruvnet/aether-arena public repo", + "hfSpace": "does not exist yet — needs HF_TOKEN and user authorization to deploy ruvnet/aether-arena HF Space", + "smokeDataset": "not committed", + "resultsLedger": "not created", + "ruviewBaseline": "PCK@20 ~2.5% self-reported, not formally entered", + "ciGate": "not added to workflow" + }, + "milestones": { + "m1": { + "name": "ADR-149 Accepted + committed", + "status": "DONE", + "completedDate": "2026-05-30", + "completionCriteria": "ADR-149 file committed to docs/adr/ with status Accepted", + "notes": "Done this session. File at docs/adr/ADR-149-public-community-leaderboard-huggingface.md" + }, + "m2": { + "name": "Deterministic scorer runner bin (aa_score_runner.rs)", + "status": "NOT_STARTED", + "completionCriteria": "aa_score_runner.rs compiles, runs ruview_metrics on a committed fixture, emits RuViewTier + SHA-256 proof hash, mirrors existing *_proof_runner.rs pattern; cargo test passes", + "estimatedEffort": "3-5 days", + "owner": "wifi-densepose-train crate or new aa-scorer crate" + }, + "m3": { + "name": "CI harness-gate: GitHub Actions workflow", + "status": "NOT_STARTED", + "completionCriteria": "A GitHub Actions workflow runs aa_score_runner on every PR as a build gate; PR fails if scorer fails determinism check; workflow committed and green", + "estimatedEffort": "2-3 days", + "dependency": "M2 must be done first" + }, + "m4": { + "name": "aether-arena repo scaffold", + "status": "NOT_STARTED", + "completionCriteria": "ruvnet/aether-arena repo created with: README (four-part framing: Public leaderboard / Private eval split / Open scorer / Signed results); aa-submission.toml manifest schema; VERIFY.md (ADR-149 §7 stranger acceptance test); neutrality/governance section (§2.8); contribution guide", + "estimatedEffort": "3-5 days", + "blockers": ["Needs user authorization to create public ruvnet/aether-arena repo on GitHub"] + }, + "m5": { + "name": "Public smoke split committed + private MM-Fi held-out split prep", + "status": "NOT_STARTED", + "completionCriteria": "Public smoke split committed to aether-arena repo (stranger can score locally); private MM-Fi held-out split prepared under non-public path with CC BY-NC 4.0 attribution; Wi-Pose explicitly excluded from v0", + "estimatedEffort": "5-7 days", + "riskNotes": "MM-Fi CC BY-NC 4.0: AA must remain non-commercial and carry MM-Fi attribution; raw frames stay in private split; only derived CSI features + scores may be exposed" + }, + "m6": { + "name": "HF Space (Gradio) skeleton", + "status": "BLOCKED", + "completionCriteria": "HF Space deployed at ruvnet/aether-arena with submission lifecycle (submitted->validated->quarantined->smoke_scored->full_scored->published/rejected); sandboxed scorer container wired; basic leaderboard table rendered", + "estimatedEffort": "7-10 days", + "blockers": [ + "Needs HF_TOKEN — check .env for HF_TOKEN or HUGGINGFACE_TOKEN", + "Needs user authorization to create/deploy ruvnet/aether-arena HF Space (outward-facing public deployment)" + ] + }, + "m7": { + "name": "Signed append-only Parquet results ledger", + "status": "NOT_STARTED", + "completionCriteria": "HF dataset ruvnet/aether-arena-results created; append-only Parquet ledger with signed rows; determinism_gate enforced; no row can be silently edited", + "estimatedEffort": "3-5 days", + "ledgerSchema": "submitter, model_ref, category, feature_set, tier, pck20, oks, mota, vitals_bpm_err, latency_p50, latency_p95, privacy_leakage, cross_room_deg, proof_sha256, scored_at, harness_version", + "dependency": "M6 must be scaffolded first" + }, + "m8": { + "name": "RuView baseline entry + public launch", + "status": "NOT_STARTED", + "completionCriteria": "RuView wifi-densepose-pretrained baseline entered (honest PCK@20 ~2.5%); ADR-149 §7 five-step stranger acceptance test passes; v0 live with Presence + Pose + Edge-latency + Determinism categories active; Privacy and Cross-room shown as gated/coming-soon", + "estimatedEffort": "3-5 days", + "dependency": "M4+M5+M6+M7 complete", + "notes": "ML SOTA improvement (PCK@20 ~72%) is a SEPARATE stretch goal blocked on ADR-079 P7-P9 camera ground truth. NOT a blocker for infra launch." + } + }, + "activeMilestone": "m2", + "completedMilestones": ["m1"], + "knownRisks": [ + "HF_TOKEN not confirmed present in .env — check before M6 work begins", + "ruvnet/aether-arena public repo creation is outward-facing — needs explicit user authorization", + "MM-Fi CC BY-NC 4.0: AA must stay legally non-commercial and brand-distinct from commercial RuView product; or seek MM-Fi commercial grant before any paid tier", + "Wi-Pose has research-use-only terms (no redistribution grant) — excluded from v0; revisit only if terms are clarified with authors", + "HF Space free CPU tier may be too slow for Candle/tch inference pipeline — may need ZeroGPU or self-hosted scorer on cognitum-20260110 GCloud A100/L4", + "ADR-079 camera-ground-truth (PCK@20 SOTA) is P7-P9 pending — NOT an infra blocker; must not be conflated with AA infra completion", + "Neutrality/governance risk: RuView seeded the scorer — must be demonstrably scored through the same public pipeline as any other entrant (§2.8 controls)" + ], + "driftSignals": { + "timeline": "GREEN — just initialized, no timeline pressure yet", + "scope": "GREEN — scope locked at four-part structure per ADR-149 §2 decision", + "approach": "GREEN — reuse pattern (existing ruview_metrics + proof.rs) confirmed in ADR-149", + "dependency": "YELLOW — HF_TOKEN and ruvnet/aether-arena repo authorization are external blockers with unknown ETA", + "priority": "GREEN — active feature branch feat/adr-136-146-streaming-engine in progress; AA infra can proceed in parallel on its own branch" + }, + "stretchGoals": { + "sotaML": "MM-Fi PCK@20 SOTA ~72% — separate ML effort blocked on ADR-079 P7-P9 camera-ground-truth data collection; NOT an infra exit criterion", + "privacyAxis": "ADR-145 §10 membership-inference attacker — activate Privacy leaderboard axis once attacker is implemented and published", + "crossRoom": "Multi-room held-out split — activate Cross-room generalization axis", + "multiOrgSteering": "Invite co-maintainers from other projects once >=N external entries land" + }, + "sessionHistory": [ + { + "date": "2026-05-30", + "type": "initialization", + "accomplished": [ + "ADR-149 Accepted and committed to docs/adr/", + "Horizon record initialized in .claude-flow/horizons/aether-arena-aa.json", + "Memory stored in horizons namespace under key horizon-aether-arena-aa", + "Session check-in record stored in horizon-sessions namespace" + ] + } + ] +} diff --git a/.github/workflows/aether-arena-harness.yml b/.github/workflows/aether-arena-harness.yml new file mode 100644 index 00000000..ed56c21a --- /dev/null +++ b/.github/workflows/aether-arena-harness.yml @@ -0,0 +1,94 @@ +name: AetherArena harness gate (ADR-149) + +# Runs the AetherArena scoring harness as a PR build gate. Every PR that touches +# the scorer, the metrics, or the benchmark scaffold must keep the deterministic +# score hash stable (ADR-149 §2.5 determinism_gate). If the scoring maths changes, +# the hash moves and this gate fails until `expected_score.sha256` is regenerated +# and reviewed — so scorer drift can never land silently. +# +# This is the "a PR that runs the harness as part of the build process" requirement. + +on: + pull_request: + paths: + - 'v2/crates/wifi-densepose-train/src/ruview_metrics.rs' + - 'v2/crates/wifi-densepose-train/src/ablation.rs' + - 'v2/crates/wifi-densepose-train/src/bin/aa_score_runner.rs' + - 'aether-arena/**' + - '.github/workflows/aether-arena-harness.yml' + push: + branches: ['feat/adr-149-aether-arena'] + workflow_dispatch: + +permissions: + contents: read + pull-requests: write + +jobs: + harness-gate: + name: Run AA scorer harness (determinism gate) + runs-on: ubuntu-latest + defaults: + run: + working-directory: v2 + steps: + - uses: actions/checkout@v4 + + - name: Install Rust toolchain + run: rustup show && rustc --version + + - name: Cache cargo + uses: actions/cache@v4 + with: + path: | + ~/.cargo/registry + ~/.cargo/git + v2/target + key: aa-harness-${{ runner.os }}-${{ hashFiles('v2/Cargo.lock') }} + + # 1. Build the pure-Rust scorer (no torch / no GPU → fast PR gate). + - name: Build AA score runner + run: cargo build -p wifi-densepose-train --bin aa_score_runner --no-default-features + + # 2. Determinism gate: the committed expected hash must still match. A + # non-zero exit here fails the PR. + - name: Run determinism gate + run: cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features + + # 3. Repeatability analysis (witness chain): the harness must produce one + # identical proof hash across many runs — any nondeterminism fails here. + - name: Repeatability analysis (16 runs) + run: cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --repeat 16 + + # 4. Real-scoring smoke: score a sample prediction against the public smoke + # split, exercising the actual model-scoring path (not just the fixture). + - name: Real-scoring smoke test + run: | + cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features -- \ + --split ../aether-arena/fixtures/smoke_split.json \ + --pred ../aether-arena/fixtures/smoke_pred.json --json + + # 5. Witness ledger chain integrity: the append-only results ledger must + # verify (every prev_hash link + row_hash intact = no silent edits). + - name: Verify witness ledger chain + working-directory: aether-arena/ledger + run: python3 ledger_tools.py verify + + # 6. Emit the witness row + repeatability into the PR run summary. + - name: Witness row → job summary + if: always() + run: | + ROW=$(cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --json) + REP=$(cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --repeat 16) + { + echo "## AetherArena harness gate (witness chain)" + echo "" + echo "Deterministic witness (ADR-149 §2.2 / proof + repeatability):" + echo '```json' + echo "$ROW" + echo "$REP" + echo '```' + echo "" + echo "If the determinism gate failed, the scoring maths changed: regenerate with" + echo '`cargo run -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --generate-hash > aether-arena/fixtures/expected_score.sha256` and review the diff.' + } >> "$GITHUB_STEP_SUMMARY" diff --git a/.github/workflows/ruview-swarm-ci.yml b/.github/workflows/ruview-swarm-ci.yml index c103f7f9..3e360bb5 100644 --- a/.github/workflows/ruview-swarm-ci.yml +++ b/.github/workflows/ruview-swarm-ci.yml @@ -60,8 +60,14 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 + # v2/rust-toolchain.toml pins channel "1.89" with profile "minimal" (no + # clippy). dtolnay@stable installs clippy on the floating "stable" + # toolchain, but the override makes cargo use the separate "1.89" + # toolchain — so `cargo clippy` errors "cargo-clippy is not installed for + # 1.89". Install clippy on the pinned toolchain that cargo actually uses. - uses: dtolnay/rust-toolchain@stable with: + toolchain: "1.89" components: clippy - name: Cache cargo uses: actions/cache@v4 diff --git a/.gitignore b/.gitignore index 30f4a0eb..4734f46d 100644 --- a/.gitignore +++ b/.gitignore @@ -261,3 +261,10 @@ v2/crates/rvcsi-node/*.node v2/crates/rvcsi-node/binding.js v2/crates/rvcsi-node/binding.d.ts v2/crates/rvcsi-node/npm/ + +# AetherArena private optimization staging — never published until reviewed +aether-arena/staging/ + +# MM-Fi benchmark dataset archives — large data, fetch separately, never commit +assets/MM-Fi/E0*.zip +assets/MM-Fi/*.zip diff --git a/CHANGELOG.md b/CHANGELOG.md index 6f7a6faa..269cd681 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,7 +7,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed +- **Person count no longer pinned to 1 — addresses #803.** The aggregate occupancy reported by the sensing server was derived from `smoothed_person_score`, an EMA-smoothed *activity* score (amplitude variance / motion / spectral energy). That score saturates near a single occupant — one moving person maxes it out — so it cannot discriminate occupancy *count* and stayed clamped at 1 across S3/C6 and the Python/Docker/Rust servers. Meanwhile the count-aware per-node estimates the ESP32 paths already compute (firmware `n_persons`, and the DynamicMinCut `corr_persons`) were stashed in `NodeState::prev_person_count` and then **discarded** by the aggregator (same dead-wiring class as #872). The aggregator now takes `max(activity_count, node_max)` via a unit-tested `aggregate_person_count` helper, so a node positively estimating 2–3 occupants is surfaced instead of overwritten. The fix can only ever *raise* the count when a node reports more people, so the single-occupant case is provably never inflated (regression-guarded by test). **Second half:** the pure-CSI per-node path itself clamped its own estimate — the DynamicMinCut occupancy (`estimate_persons_from_correlation`, 0–3) was mapped to a score via `corr_persons / 3.0`, putting 2 people at 0.667, *just under* the 0.70 up-threshold of `score_to_person_count`, so the per-node count never climbed past 1 (so `node_max` was also stuck at 1 for CSI-only nodes). Replaced it with a threshold-aligned `corr_persons_to_score` mapping (1→0.40, 2→0.74, 3→0.96) whose steady state round-trips back to the same count through the EMA + hysteresis, while still gating transient noise. A convergence test replays the exact EMA loop to prove min-cut=2 now reports 2 (and documents that the old `/3.0` mapping reported 1). Full multi-person accuracy still depends on the underlying estimator quality; this removes the two server-side clamps that masked it. 586 sensing-server tests pass. +- **MQTT publisher now actually runs (`--mqtt`) — closes #872.** The `--mqtt*` flags were defined only in `cli::Args` (dead code, referenced nowhere) while the binary parses a *separate* `main::Args` with no mqtt fields, and `main.rs` never started the `mqtt::` publisher — so MQTT/Home-Assistant integration was completely unwired (`--mqtt` errored as an unexpected argument, and even with the Docker image's `--features mqtt` build the publisher never ran). Earlier attempts chased a Docker *rebuild*; the real cause was disconnected *code*. Extracted the flags into a shared `cli::MqttArgs` (`#[command(flatten)]` into both structs), spawn the publisher on `--mqtt`, and bridge the JSON sensing broadcast into the typed `VitalsSnapshot` stream with a defensive `serde_json::Value` mapping. Verified end-to-end against `mosquitto`: 20 HA auto-discovery entities + live state (presence/person-count/…). 577 (default) / 580 (`--features mqtt`) tests pass. + ### Added +- **WiFi-CSI pose: efficiency frontier + per-room calibration service** (ADR-150 §3.2–3.6). Two beyond-SOTA results on the MM-Fi benchmark, plus the deployment mechanism that resolves real-world generalization: + - **Efficiency frontier** — a **75 K-param model beats published SOTA** (74.3% vs MultiFormer 72.25% torso-PCK@20); every config from `micro` up is Pareto-dominant (smaller *and* more accurate than prior work). Shipped a deployable **int4 edge model (~20 KB, verified 74.08%, 0.135 ms single-thread CPU)** — published at [`ruvnet/wifi-densepose-mmfi-pose/edge`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose). See [`docs/benchmarks/wifi-pose-efficiency-frontier.md`](docs/benchmarks/wifi-pose-efficiency-frontier.md). + - **Generalization solved by few-shot calibration** — zero-shot cross-subject (~64%) and cross-environment (~10%) are *not* closeable by algorithms (CORAL, DANN, instance-norm, contrastive foundation-pretraining all tested, all failed) or by more training subjects (saturates ~64%). But **~100–200 labeled in-room samples recover SOTA-level pose**: cross-subject 64→76%, **cross-environment 10→73% (60% from just 5 samples)** — deployable as a **~11 KB per-room LoRA adapter** on a frozen shared base. Full empirical chain in ADR-150 §3.2–3.6. + - **Calibration service (complete, both model paths, cross-language verified)** — `aether-arena/calibration/`: `calibrate.py` (transformer model, `.npz` adapter) + `infer.py` (verified 3.09%→74.29% on an unseen MM-Fi room), **and `cog_calibrate.py`** which fits a `fc1.a/fc1.b/fc2.a/fc2.b` **safetensors** adapter for the deployed cog conv+MLP model (`pose_v1.safetensors`). Consumed by the Rust product engine: `InferenceEngine::with_adapter()` + `cog-pose-estimation run --config --adapter `. Self-contained regression tests for both Python producers (`test_calibration.py`, `test_cog_calibration.py`) **plus a cross-language Rust integration test** that loads a real `cog_calibrate.py`-generated adapter fixture and asserts it activates + changes engine output. All green. +- **Windows workspace build + test now green** (cross-platform fixes). `wifi-densepose-worldmodel` imported `tokio::net::UnixStream` unconditionally, so `cargo build/test --workspace` failed to compile on Windows (E0432) — now the OccWorld Unix-socket bridge is `#[cfg(unix)]`-gated with a clear non-unix fallback. And `wifi-densepose-bfld`'s `readme_quickstart_uses_canonical_public_api` test checked a multi-line `pipeline\n .process` needle that never matched on a CRLF checkout — now normalizes line endings. Result: **2,682 workspace tests pass / 0 fail on Windows** (the pre-merge gate was previously unrunnable there). - **`ruview-swarm` crate (ADR-148)** — drone swarm control system with hierarchical-mesh topology, Raft consensus, MAPPO multi-agent reinforcement learning, and CSI sensing integration. 14 modules: topology (Raft/Gossip/Mesh), formation control (virtual-structure/leader-follower/Reynolds flocking), RRT-APF path planning, auction+FNN task allocation, MARL actor + PPO training loop, security (MAVLink v2 HMAC-SHA256 signing, UWB anti-spoofing, geofencing, Remote ID, FHSS anti-jamming), 10-state fail-safe machine, and SwarmOrchestrator. ITAR-gated coordination features (USML Category VIII(h)(12)) behind `itar-unrestricted` feature. - **Ruflo integration for `ruview-swarm`** — feature-gated (`ruflo`) AI-agent capability layer connecting to the claude-flow daemon: AgentDB mission memory (`memory_store`/`memory_search`), HNSW pattern learning (`agentdb_pattern-store`/`-search`), AIDefence MAVLink message scanning, and SONA intelligence trajectory hooks. `RufloBackend` trait with `HttpRufloBackend` (JSON-RPC 2.0) and `MockRufloBackend` implementations. diff --git a/README.md b/README.md index eea558f3..7451e30d 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ Built on [RuVector](https://github.com/ruvnet/ruvector/) and [Cognitum Seed](htt The system learns each environment locally using spiking neural networks that adapt in under 30 seconds, with multi-frequency mesh scanning across 6 WiFi channels that uses your neighbors' routers as free radar illuminators. Every measurement is cryptographically attested via an Ed25519 witness chain. -RuView turns ordinary WiFi into a contactless sensor. A $9 ESP32 board reads the radio reflections off the people in a room, and a small pretrained model — published on Hugging Face at [`ruvnet/wifi-densepose-pretrained`](https://huggingface.co/ruvnet/wifi-densepose-pretrained) — tells you who's there, how they're breathing, and how their heart rate is trending. The model fits in 8 KB (4-bit quantized), runs in microseconds on a Raspberry Pi, and reports 100% presence accuracy on the validation set. No cameras, no wearables, no app on the user's phone. +RuView turns ordinary WiFi into a contactless sensor. A $9 ESP32 board reads the radio reflections off the people in a room, and a small pretrained model — published on Hugging Face at [`ruvnet/wifi-densepose-pretrained`](https://huggingface.co/ruvnet/wifi-densepose-pretrained) — tells you who's there, how they're breathing, and how their heart rate is trending. The model fits in 8 KB (4-bit quantized) and runs in microseconds on a Raspberry Pi. (The [v2 encoder](https://huggingface.co/ruvnet/wifi-densepose-pretrained) reports an honest, label-free held-out **temporal-triplet accuracy of 82.3%** — up from 66.4% raw; the older "100% presence" figure was measured on a single-class recording and has been retracted in favor of this.) No cameras, no wearables, no app on the user's phone. ### Built for low-power edge applications @@ -56,9 +56,9 @@ RuView turns ordinary WiFi into a contactless sensor. A $9 ESP32 board reads the > |------|-----|---------------| > | 🫁 **Breathing rate** | Bandpass 0.1–0.5 Hz on wrapped phase, circular variance, zero-crossing BPM ([#593](https://github.com/ruvnet/RuView/issues/593)) | 6–30 BPM, real-time | > | 💓 **Heart rate** | Bandpass 0.8–2.0 Hz, zero-crossing BPM | 40–120 BPM, real-time | -> | 👤 **Presence detection** | Trained head on Hugging Face ([`ruvnet/wifi-densepose-pretrained`](https://huggingface.co/ruvnet/wifi-densepose-pretrained), 100% validation accuracy) + a phase-variance fallback that needs no model | < 1 ms, ~30 s ambient calibration | +> | 👤 **Presence detection** | Trained head on Hugging Face ([`ruvnet/wifi-densepose-pretrained`](https://huggingface.co/ruvnet/wifi-densepose-pretrained); v2 encoder = 82.3% held-out temporal-triplet acc, honestly re-benchmarked) + a phase-variance fallback that needs no model | < 1 ms, ~30 s ambient calibration | > | 🧬 **CSI embeddings** | 128-dim contrastive encoder shipped on Hugging Face, 4-bit quantised variant fits in 8 KB | **164,183 emb/s** on M4 Pro | -> | 🦴 **17-keypoint pose estimation** | `cog-pose-estimation` Cog v0.0.1 — signed aarch64 + x86_64 binaries on GCS, loads `pose_v1.safetensors` via Candle. Train your own from paired data in 2.1 s on an RTX 5080 ([ADR-101](docs/adr/ADR-101-pose-estimation-cog.md), [benchmarks](docs/benchmarks/pose-estimation-cog.md)) | 8.4 ms cold-start on a Pi 5 | +> | 🦴 **17-keypoint pose estimation** | `cog-pose-estimation` Cog v0.0.1 — signed aarch64 + x86_64 binaries on GCS, loads `pose_v1.safetensors` via Candle. Train your own from paired data in 2.1 s on an RTX 5080 ([ADR-101](docs/adr/ADR-101-pose-estimation-cog.md), [benchmarks](docs/benchmarks/pose-estimation-cog.md)). **SOTA on MM-Fi:** [`ruvnet/wifi-densepose-mmfi-pose`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose) hits **82.69% torso-PCK@20** (ensemble 83.59%), beating MultiFormer (72.25%) and CSI2Pose (68.41%) on the matched MM-Fi `random_split` protocol — self-corrected and auditable on [AetherArena](https://huggingface.co/spaces/ruvnet/aether-arena) | 8.4 ms cold-start on a Pi 5 | > | 🚶 **Motion / activity** | Motion-band power + phase acceleration | Real-time | > | 🤸 **Fall detection** | Phase-acceleration threshold + 3-frame debounce + 5 s cooldown ([#263](https://github.com/ruvnet/RuView/issues/263)) | < 200 ms | > | 🧮 **Multi-person count** | Adaptive P95 normalisation + runtime-tunable dedup factor (`/api/v1/config/dedup-factor`, [#491](https://github.com/ruvnet/RuView/pull/491)). Six specialised learned counters available as Cogs: `occupancy-zones`, `elevator-count`, `queue-length`, `customer-flow`, `clean-room`, `person-matching` | Real-time, self-calibrating | @@ -162,7 +162,7 @@ pip install "ruview[client]" # or: pip install "wifi-densepose[clie ## 🤗 Pretrained model on Hugging Face -Pretrained CSI weights live at [`ruvnet/wifi-densepose-pretrained`](https://huggingface.co/ruvnet/wifi-densepose-pretrained) — 12.2M training steps on 60K frames / 610K contrastive triplets, **100% presence accuracy** on the validation set, 4-bit quantized variant fits in 8 KB. The release includes a contrastive **CSI encoder** producing 128-dim embeddings (164,183 emb/s on M4 Pro) and a **presence-detection head**. Per-node LoRA adapters are included for environment-specific fine-tuning. +Pretrained CSI weights live at [`ruvnet/wifi-densepose-pretrained`](https://huggingface.co/ruvnet/wifi-densepose-pretrained) — 12.2M training steps on 60K frames / 610K contrastive triplets, **82.3% held-out temporal-triplet accuracy** (up from 66.4% raw; the older "100% presence" figure was measured on a single-class recording and has been retracted), 4-bit quantized variant fits in 8 KB. The release includes a contrastive **CSI encoder** producing 128-dim embeddings (164,183 emb/s on M4 Pro) and a **presence-detection head**. Per-node LoRA adapters are included for environment-specific fine-tuning. ```bash # Download the model bundle @@ -182,7 +182,27 @@ huggingface-cli download ruvnet/wifi-densepose-pretrained --local-dir models/wif **Quantization choices** (all in the HF repo): `model-q2.bin` (4 KB) · `model-q4.bin` ⭐ recommended (8 KB) · `model-q8.bin` (16 KB) · `model.safetensors` full (48 KB) -The separate **17-keypoint pose-estimation model** is not in this release — pipeline is implemented but keypoint weights are still pending. Tracked in [#509](https://github.com/ruvnet/RuView/issues/509); see [ADR-079](docs/adr/ADR-079-camera-supervised-pose-finetune.md) phases P7–P9. +The separate **17-keypoint pose-estimation model** is now published at [`ruvnet/wifi-densepose-mmfi-pose`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose) — **82.69% torso-PCK@20** on MM-Fi (single model) / **83.59%** (3-model ensemble + TTA), beating the prior published SOTA MultiFormer (72.25%) and CSI2Pose (68.41%) on the matched `random_split` protocol. See **Results & proof** below. + +### Results & proof + +| What | Where | Numbers | +|------|-------|---------| +| **MM-Fi pose model (SOTA)** | [`ruvnet/wifi-densepose-mmfi-pose`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose) | 82.69% torso-PCK@20 (single) · 83.59% (ensemble+TTA) · 75K-param micro variant 74.30% | +| **AetherArena benchmark Space** | [`ruvnet/aether-arena`](https://huggingface.co/spaces/ruvnet/aether-arena) | self-correcting, auditable MM-Fi leaderboard | +| **Full MM-Fi study (honest picture)** | [`docs/benchmarks/mmfi-wifi-sensing-study.md`](docs/benchmarks/mmfi-wifi-sensing-study.md) | pose + action; zero-shot cross-subject ~64%, +~30 s in-room calibration → 72.2% | +| **Efficiency frontier** | [`docs/benchmarks/wifi-pose-efficiency-frontier.md`](docs/benchmarks/wifi-pose-efficiency-frontier.md) | SOTA-beating WiFi pose in a 20 KB int4 edge model | +| **Pretrained encoder** | [`ruvnet/wifi-densepose-pretrained`](https://huggingface.co/ruvnet/wifi-densepose-pretrained) | 82.3% held-out temporal-triplet, 8 KB int4 | +| **Reproducible proof (Trust Kill Switch)** | [`archive/v1/data/proof/verify.py`](archive/v1/data/proof/verify.py) + [`expected_features.sha256`](archive/v1/data/proof/expected_features.sha256) | one-command deterministic pipeline replay (SHA-256 of output vs published hash) | +| **Benchmark-proof ADR** | [ADR-147](docs/adr/ADR-147-benchmark-proof.md) | how the numbers are produced and verified | +| **Witness attestation** | [`docs/WITNESS-LOG-028.md`](docs/WITNESS-LOG-028.md) | 33-row capability attestation matrix with per-claim evidence | + +```bash +# Reproduce the deterministic pipeline proof yourself (must print VERDICT: PASS): +python archive/v1/data/proof/verify.py +``` + +Tracked in [#509](https://github.com/ruvnet/RuView/issues/509); see [ADR-079](docs/adr/ADR-079-camera-supervised-pose-finetune.md) phases P7–P9 for the camera-supervised fine-tune path. ## 🧩 Edge Module Catalog diff --git a/aether-arena/README.md b/aether-arena/README.md new file mode 100644 index 00000000..9adcc655 --- /dev/null +++ b/aether-arena/README.md @@ -0,0 +1,50 @@ +# AetherArena ("AA") — The Official Spatial-Intelligence Benchmark + +> **Public leaderboard. Private evaluation split. Open scorer. Signed results.** + +AetherArena is a **standalone, project-agnostic benchmark** for camera-free **spatial intelligence** — pose, presence, occupancy, tracking, and vitals from RF/WiFi (and, over time, mmWave / UWB / radar / lidar / multimodal). It is **not** a single-vendor leaderboard: any team, framework, or sensing modality can enter, and every entrant — including the RuView baseline that donated the seed scorer — is scored by the identical, open, pinned harness. + +Specified in [ADR-149](../docs/adr/ADR-149-public-community-leaderboard-huggingface.md) (Accepted). + +Canonical home: **`ruvnet/aether-arena`** + a Hugging Face Space (deploy pending — see `STATUS`). + +--- + +## Why + +WiFi/RF spatial sensing has no shared yardstick — papers self-report against inconsistent splits and metrics, with **no accounting for latency, reproducibility, or privacy leakage**. AA fixes the *measurement*, not just the models: a single deterministic scorer, a private held-out split nobody can train on, and a signed result ledger that can't be silently edited. + +## What gets measured (v0) + +| Category | Metric | Status | +|----------|--------|--------| +| **Pose** | PCK@0.2 (all / torso), OKS | Ranked | +| **Presence** | accuracy, FP/FN | Ranked | +| **Edge latency** | p50 / p95 / p99 ms | Ranked | +| **Determinism** | proof-hash pass/fail | Ranked (gate) | +| Tracking (MOTA) | — | activates when multi-person clips land | +| Vitals (BPM err) | — | activates when paired vitals ground truth lands | +| **Privacy leakage** | membership-inference ∈ [0,1] | **gated — not ranked** until the attacker ships | +| Cross-room | degradation ratio | coming soon | + +The headline rank is the **category metric**; an optional `arena_score = quality × latency_factor × privacy_factor × determinism_gate` is exposed alongside (never instead) so accuracy can't win at any cost. See ADR-149 §2.5. + +## How scoring works + +The scorer is RuView's **already-published** `wifi-densepose-train` acceptance harness (`ruview_metrics` + ADR-145 `ablation`), run in a pinned sandbox. **You submit a model, not predictions** — predictions on data you hold prove nothing. Your model is scored against a **private** MM-Fi held-out split (CC BY-NC 4.0; Wi-Pose excluded for redistribution reasons), and one **signed, append-only** row is written to the results ledger with a determinism proof hash. + +Submission lifecycle: `submitted → validated → quarantined → smoke_scored → full_scored → published` (or `rejected` with a reason). The model only ever runs inside a no-network, read-only-FS sandbox. + +## Submit (when the Space is live) + +1. Write a manifest: [`schema/aa-submission.toml`](schema/aa-submission.toml). +2. Push your model artifact (`.safetensors` / `.rvf` / LoRA adapter) + manifest to the Space. +3. Watch it move through the lifecycle; your signed row appears on the board. + +## Verify it's fair (you don't have to trust us) + +See [`VERIFY.md`](VERIFY.md) — run the **open scorer** locally on the **public smoke split**, reproduce the determinism hash, and confirm RuView's own entries were scored by the identical path. That five-step check is the launch gate (ADR-149 §7). + +## Neutrality + +AA is a neutral commons. The scorer is open and versioned; any metric change is a public `harness_version` bump that **re-scores all entries**. RuView donated the seed harness and enters as one baseline — it gets no special treatment (ADR-149 §2.8). diff --git a/aether-arena/STATUS.md b/aether-arena/STATUS.md new file mode 100644 index 00000000..8d97519a --- /dev/null +++ b/aether-arena/STATUS.md @@ -0,0 +1,30 @@ +# AetherArena — Build Status + +Tracks ADR-149 implementation milestones. "Complete" = benchmark **infrastructure** done, +tested, CI-gated, deploy-ready, RuView baseline entered, §7 acceptance test passing. +Model **SOTA** (e.g. MM-Fi PCK@20 ~72%) is a separate long-running ML effort, blocked on +ADR-079 camera-ground-truth collection — *not* an infra-completion blocker. + +| # | Milestone | Status | +|---|-----------|--------| +| M1 | ADR-149 Accepted + committed | ✅ done | +| M2 | Scorer runner (`aa_score_runner`) — **real model scoring** + witness (proof+inputs hash) + **repeatability analysis** | ✅ done — builds `--no-default-features`, determinism gate PASS, repeatable 16/16 | +| M3 | CI harness-gate workflow (PR runs scorer + repeatability + real-scoring smoke + ledger verify) | ✅ done — `.github/workflows/aether-arena-harness.yml` | +| M4 | Scaffold: README + submission schema + VERIFY (acceptance test) | ✅ done | +| M5 | Public smoke split (committed) + private MM-Fi held-out split prep | 🟡 smoke split done (`fixtures/smoke_*.json`); private MM-Fi prep pending | +| M6 | HF Space (Gradio) — leaderboard + ledger integrity + submit/verify/about | ✅ deployed → https://huggingface.co/spaces/ruvnet/aether-arena (sandboxed scorer container = later hardening) | +| M7 | **Witness ledger chain** — append-only, hash-chained, tamper-evident | ✅ done — `ledger/ledger_tools.py` (seed/append/verify); tamper test fails as designed | +| M8 | Public launch | ✅ Space **LIVE** (gradio 5.9.1, serving 200) — **board empty, awaiting first real harness score** (benchmark-first: no seeded numbers) | + +## v0 infrastructure: COMPLETE +Implement ✅ · Test ✅ · Deploy to HF ✅ (https://huggingface.co/spaces/ruvnet/aether-arena) · Instructions+Verification ✅ · PR runs the harness ✅ (PR #874, AA harness gate **passed**). +Remaining = data + hardening, not infra: private MM-Fi held-out split (M5), sandboxed scorer container (M6), privacy-leakage attacker (gated category), and **model SOTA** (separate ML effort, blocked on ADR-079 — explicitly not an infra exit). + +## Benchmark-first posture (per user direction) +- **No placeholder numbers on the board.** The ledger seeds to genesis only; every result is a real scoring-pipeline witness. RuView gets no seeded baseline. +- **Witness chain** = `inputs_sha256` (binds witness to exact inputs) + `proof_sha256` (cross-platform-stable score hash) + the append-only hash-chained ledger. Repeatability analysis (`--repeat N`) proves the proof hash is identical across runs. + +## Blockers / decisions needed +- **HF deploy (M6)** — token is in GCP Secret Manager (`HUGGINGFACE_API_KEY`); creating the public `ruvnet/aether-arena` Space still wants explicit go. +- **MM-Fi is CC BY-NC** → AA must stay non-commercial / legally distinct from the commercial RuView product. +- **Private MM-Fi split (M5)** — needs the dataset pulled + a held-out split assembled before real public scoring replaces the smoke fixture. diff --git a/aether-arena/VERIFY.md b/aether-arena/VERIFY.md new file mode 100644 index 00000000..2e9e5546 --- /dev/null +++ b/aether-arena/VERIFY.md @@ -0,0 +1,78 @@ +# Verifying AetherArena (you don't have to trust us) + +AA's credibility rests on a stranger being able to reproduce a score and see that the rules are fair. This is the **launch gate** (ADR-149 §7): v0 does not ship until all five checks below pass for someone with no insider access. + +> **Wider context:** this page covers the *leaderboard scorer*. For the whole-platform answer to +> "is this real / does it actually work?" — including the deterministic pipeline proof, the +> published models + public-benchmark numbers, and the built-in-public development trail — see +> [`docs/proof-of-capabilities.md`](../docs/proof-of-capabilities.md). + +## The open scorer + +The scoring engine is a pure-Rust, GPU-free binary: `aa_score_runner` in `wifi-densepose-train`. It runs the real `ruview_metrics` pose-acceptance harness on a fixed fixture and emits a cross-platform-stable SHA-256 **determinism proof**. + +### Reproduce the determinism hash locally + +```bash +cd v2 +# Verify the committed expected hash still matches (this is the CI gate): +cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features +# → prints the witness (inputs_sha256 + proof_sha256) and "VERDICT: PASS" + +# See the witness row as JSON: +cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --json +``` + +### Witness chain — proof + repeatability analysis + +Every score is a **witness**: `inputs_sha256` (binds it to the exact inputs scored) ++ `proof_sha256` (cross-platform-stable hash of the quantised score) + `harness_version`. +Witnesses are recorded in an **append-only, hash-chained ledger** (each row references +the previous row's hash), so a silent edit to any past row breaks the chain. + +```bash +# Repeatability: run the scorer K times, confirm ONE identical proof hash: +cd v2 +cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --repeat 16 +# → {"repeatability":{"runs":16,"unique_proof_hashes":1,"repeatable":true,...}} + +# Real model scoring (score predictions against an eval split): +cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features -- \ + --split ../aether-arena/fixtures/smoke_split.json \ + --pred ../aether-arena/fixtures/smoke_pred.json --json + +# Verify the witness ledger chain is intact (tamper-evident): +cd ../aether-arena/ledger && python3 ledger_tools.py verify +# → "OK: N rows, chain intact" (edit any row and it reports the broken link) +``` + +The expected hash is committed at [`fixtures/expected_score.sha256`](fixtures/expected_score.sha256). Same harness version + same fixture → same hash on glibc / MSVC / Apple. If your local run prints `VERDICT: PASS`, you have reproduced the scorer. + +### What happens if the scoring maths changes + +Any edit to `ruview_metrics.rs`, `ablation.rs`, or `aa_score_runner.rs` moves the hash and **fails the CI gate** (`.github/workflows/aether-arena-harness.yml`) until the maintainer regenerates and reviews: + +```bash +cargo run -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --generate-hash \ + > aether-arena/fixtures/expected_score.sha256 +``` + +So a scorer change is always a reviewed, public diff — never silent. That's `harness_version` pinning + `determinism_gate` in action (ADR-149 §2.4–§2.5). + +## The five-step acceptance test (v0 launch gate) + +A stranger must be able to: + +1. **Submit** a model (artifact + `schema/aa-submission.toml`) with no insider help. +2. **Get a deterministic score** — same model + same `harness_version` → same numbers. +3. **See the signed row** appended to the public results ledger. +4. **Rerun the scorer locally** on the public smoke split and reproduce the logic (the command above). +5. **Understand why the rank is fair** — private split, open scorer, pinned version, proof hash — from these docs alone. + +If any step fails, v0 is not ready. + +## Current status + +- ✅ Step 4 (rerun the open scorer locally, reproduce the hash) — **works today** via `aa_score_runner`. +- ✅ CI harness gate runs the scorer on every PR. +- ⏳ Steps 1–3, 5 (HF Space submission flow + signed ledger) — in progress; require the HF Space deploy (needs an HF token / maintainer authorization). diff --git a/aether-arena/calibration/README.md b/aether-arena/calibration/README.md new file mode 100644 index 00000000..0b407525 --- /dev/null +++ b/aether-arena/calibration/README.md @@ -0,0 +1,87 @@ +# RuView Calibration Service (reference implementation) + +Turn a **shared WiFi-CSI pose base model** into a room-specific one with a **30-second labeled +calibration** and a **~11 KB per-room LoRA adapter**. This is the deployable resolution of the +cross-subject / cross-environment generalization problem (full study: [ADR-150 §3.3–3.6](../../docs/adr/ADR-150-rf-foundation-encoder.md)). + +## Why + +Zero-shot WiFi pose generalizes poorly to a **new room or new person** — an unseen room can drop a +strong model to near-random. But that gap is **not** algorithmically closeable (CORAL, DANN, +instance-norm, contrastive foundation-pretraining all failed) and **not** closeable by collecting +more subjects (saturates ~64%). It **is** closeable, cheaply, at deployment time: a handful of +labeled frames from the actual room pin down its multipath instantly. + +| Deployment case | Zero-shot | + in-room calibration | +|-----------------|----------:|----------------------:| +| Same room, new person (cross-subject) | 64% | **76%** (200 samples) | +| **New room + new person (cross-environment)** | **~10%** | **60% @ 5 samples → 73% @ 200** | + +**Verified demo (this code, source-only base on an unseen MM-Fi room E04):** +`zero-shot 3.09% → after 200-sample calibration 74.29%` (+71 pts). + +## How it works + +A frozen shared **base** (transformer + temporal attention pool + skeleton-graph head, the published +[`ruvnet/wifi-densepose-mmfi-pose`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose)) plus a +tiny **LoRA adapter** (rank 8 on the input projection + pose head — **11,200 params ≈ 11 KB int8 / +22 KB fp16**) fitted per room. Thousands of room-adapters hang off one base. + +## Usage + +```bash +# 1) Capture a short labeled clip in the deployment room -> calib.npz {X:[N,3,114,10], Y:[N,17,2]} +# (~100–200 samples recommended; below ~20 the adapter can underperform zero-shot) + +# 2) Fit the per-room adapter (~11 KB): +python calibrate.py --base pose_mmfi_best.pt --data calib.npz --out room.adapter.npz + +# 3) Run calibrated inference (base + room adapter): +python infer.py --base pose_mmfi_best.pt --adapter room.adapter.npz --data frames.npz --out kp.npy +# omit --adapter to run the uncalibrated (zero-shot) base +``` + +`X` is CSI amplitude `[N, 3 antennas, 114 subcarriers, 10 frames]` (per-sample standardization is +applied internally). `Y` is `[N,17,2]` COCO keypoints in `[0,1]`. + +## Calibration budget (measured, rank-8 LoRA, 3 seeds — ADR-150 §3.5) + +| Labeled samples/room | cross-subject | cross-environment | +|---------------------:|--------------:|------------------:| +| 0 (zero-shot) | 64% | ~10% | +| 5 | — | 60% | +| 20 | 66% | 66% | +| 50 | 70% | 70% | +| 200 | 72% | 73% | + +Knee at ~50 samples (~70%); **below ~20 samples the adapter can hurt** (too few to fit reliably). + +## Two models, two producers (not interchangeable) + +Adapters are **model-specific**. There are two calibration producers here: + +| Producer | Target model | Input | Adapter format | Consumer | +|----------|--------------|-------|----------------|----------| +| `calibrate.py` | MM-Fi **transformer** (`pose_mmfi_best.pt`, 3×114×10) | `[N,3,114,10]` | `.npz` (`proj`/`head` LoRA) | this Python `infer.py` | +| `cog_calibrate.py` | cog **conv+MLP** (`pose_v1.safetensors`, 56×20) | `[N,56,20]` | `.safetensors` (`fc1.a`/`fc1.b`/`fc2.a`/`fc2.b`) | Rust `cog-pose-estimation run --adapter` | + +```bash +# Produce a cog-format per-room adapter for the deployed Rust pose engine: +python cog_calibrate.py --base pose_v1.safetensors --data calib.npz --out room.safetensors +# then in the cog runtime: +cog-pose-estimation run --config --adapter room.safetensors +``` + +Same LoRA *mechanism* (ADR-150 §3.5), different architecture and key layout — an adapter from one +producer will not load into the other model. + +## Notes + +- **Calibration only helps when the base hasn't already seen the room.** The published flagship was + trained on MM-Fi `random_split`, so calibrating it on an MM-Fi subject is a near-no-op (it already + saw them); for a genuinely new real-world room it is zero-shot and calibration applies. To + *reproduce the demo* on a held-out MM-Fi room, train a source-only base (exclude the target + environment) — see `ADR-150 §3.6` and the few-shot harness in `aether-arena/staging/`. +- Adapter is saved fp16 (~22 KB); quantize to int8 for the ~11 KB on-device form. +- Inference is real-time on CPU (the 75 K-param `micro` variant runs in 0.135 ms single-thread x86; + see [`docs/benchmarks/wifi-pose-efficiency-frontier.md`](../../docs/benchmarks/wifi-pose-efficiency-frontier.md)). diff --git a/aether-arena/calibration/calibrate.py b/aether-arena/calibration/calibrate.py new file mode 100644 index 00000000..31ed08ec --- /dev/null +++ b/aether-arena/calibration/calibrate.py @@ -0,0 +1,71 @@ +"""RuView per-room calibration — fit a ~11 KB LoRA adapter from a short labeled in-room capture. + + python calibrate.py --base pose_mmfi_best.pt --data room_calib.npz --out room_A.adapter.npz + +`room_calib.npz` must contain `X` [N,3,114,10] CSI amplitude and `Y` [N,17,2] (or [N,34]) keypoints +in [0,1] — the labeled calibration samples from the deployment room (~100–200 recommended; ≥20). +Outputs a tiny adapter (.npz, ~11 KB) that, loaded over the shared base at inference, recovers +SOTA-level pose for that room/person (ADR-150 §3.5–3.6). +""" +import argparse +import numpy as np +import torch +import torch.nn as nn + +from model import PoseNet, standardize + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--base", required=True, help="base checkpoint (pose_mmfi_best.pt)") + ap.add_argument("--data", required=True, help="labeled calibration .npz with X and Y") + ap.add_argument("--out", required=True, help="output adapter .npz") + ap.add_argument("--rank", type=int, default=8) + ap.add_argument("--iters", type=int, default=600) + ap.add_argument("--lr", type=float, default=8e-4) + ap.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + a = ap.parse_args() + + z = np.load(a.data) + X = torch.tensor(z["X"].astype(np.float32)) + Y = torch.tensor(z["Y"].reshape(len(z["Y"]), 34).astype(np.float32)) + n = len(X) + if n < 20: + print(f"WARNING: only {n} calibration samples — below ~20 the adapter may underperform " + f"zero-shot (ADR-150 §3.5). Recommend ~100–200.") + dev = a.device + + net = PoseNet().to(dev) + net.load_state_dict(torch.load(a.base, map_location=dev), strict=False) + net.add_lora(r=a.rank).to(dev) + for k, p in net.named_parameters(): + p.requires_grad = k.endswith(".A") or k.endswith(".B") + trainable = [p for p in net.parameters() if p.requires_grad] + n_tr = sum(p.numel() for p in trainable) + + Xs = standardize(X.to(dev)) + Yt = Y.to(dev) + opt = torch.optim.AdamW(trainable, lr=a.lr, weight_decay=0.0) + lossf = nn.SmoothL1Loss(beta=0.1) + bs = min(128, n) + net.train() + for it in range(a.iters): + bi = torch.randint(0, n, (bs,), device=dev) + xb = Xs[bi] + # light augmentation (subcarrier dropout + noise) — matches training-time regularization + m = (torch.rand(xb.shape[0], xb.shape[1], 1, 1, device=dev) > 0.15).float() + xb = xb * m + 0.03 * torch.randn_like(xb) * torch.rand(xb.shape[0], 1, 1, 1, device=dev) + opt.zero_grad() + lossf(net(xb), Yt[bi]).backward() + opt.step() + + adapter = net.lora_state() + nbytes = sum(v.astype(np.float16).nbytes for v in adapter.values()) + np.savez(a.out, **{k: v.astype(np.float16) for k, v in adapter.items()}, + _meta=np.array([a.rank, n, n_tr], dtype=np.int64)) + print(f"saved {a.out} | rank {a.rank} | {n_tr:,} params | ~{nbytes/1024:.1f} KB fp16 | " + f"from {n} labeled samples") + + +if __name__ == "__main__": + main() diff --git a/aether-arena/calibration/cog_calibrate.py b/aether-arena/calibration/cog_calibrate.py new file mode 100644 index 00000000..0f58fbcb --- /dev/null +++ b/aether-arena/calibration/cog_calibrate.py @@ -0,0 +1,120 @@ +"""Per-room calibration producer for the cog-pose-estimation **conv+MLP** model +(`pose_v1.safetensors`, 56 subcarriers x 20 frames). Companion to `calibrate.py` +(which targets the MM-Fi *transformer* model) — different model, different adapter +key layout, NOT interchangeable (ADR-150 §3.5). + +Fits a rank-r LoRA on the pose head (fc1, fc2) from a short labeled in-room capture and +writes a **safetensors** adapter with keys `fc1.a`/`fc1.b`/`fc2.a`/`fc2.b` (scale baked +into `b`) — exactly what `cog-pose-estimation run --adapter ` consumes. + + python cog_calibrate.py --base pose_v1.safetensors --data calib.npz --out room.safetensors + +`calib.npz`: `X` [N,56,20] CSI window + `Y` [N,17,2] (or [N,34]) keypoints in [0,1]. +""" +import argparse +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class CogPose(nn.Module): + """Mirrors cog-pose-estimation's PoseNet (Candle) exactly — same safetensors keys.""" + + def __init__(self): + super().__init__() + self.enc = nn.ModuleDict({ + "c1": nn.Conv1d(56, 64, 3, padding=1, dilation=1), + "c2": nn.Conv1d(64, 128, 3, padding=2, dilation=2), + "c3": nn.Conv1d(128, 128, 3, padding=4, dilation=4), + }) + self.head = nn.ModuleDict({"fc1": nn.Linear(128, 256), "fc2": nn.Linear(256, 34)}) + self.fc1_lora = None + self.fc2_lora = None + + def _lora(self, slot, x, y): + if slot is None: + return y + a, b = slot + return y + (x @ a) @ b + + def forward(self, x): # x: [B, 56, 20] + h = F.relu(self.enc["c1"](x)) + h = F.relu(self.enc["c2"](h)) + h = F.relu(self.enc["c3"](h)) + h = h.mean(2) # [B, 128] + z1 = self.head["fc1"](h) + z1 = self._lora(self.fc1_lora, h, z1) + h1 = F.relu(z1) + z2 = self.head["fc2"](h1) + z2 = self._lora(self.fc2_lora, h1, z2) + return torch.sigmoid(z2) # [B, 34] + + def add_lora(self, r=4): + self.fc1_lora = (nn.Parameter(torch.randn(128, r) * 0.02), nn.Parameter(torch.zeros(r, 256))) + self.fc2_lora = (nn.Parameter(torch.randn(256, r) * 0.02), nn.Parameter(torch.zeros(r, 34))) + for p in (*self.fc1_lora, *self.fc2_lora): + self.register_parameter(f"lora_{id(p)}", p) + return self + + +def load_base(net: CogPose, path: str): + from safetensors.torch import load_file + sd = load_file(path) + # remap "enc.c1.weight" -> module dict keys + mapped = {} + for k, v in sd.items(): + mapped[k.replace("enc.", "enc.").replace("head.", "head.")] = v + net.load_state_dict(mapped, strict=False) + return net + + +def fit(base: str, data: str, out: str, rank: int = 4, iters: int = 400, lr: float = 1e-3): + z = np.load(data) + X = torch.tensor(z["X"].astype(np.float32)) # [N,56,20] + Y = torch.tensor(z["Y"].reshape(len(z["Y"]), 34).astype(np.float32)) + n = len(X) + net = CogPose() + load_base(net, base) + net.add_lora(rank) + for p in net.parameters(): + p.requires_grad = False + lora = [*net.fc1_lora, *net.fc2_lora] + for p in lora: + p.requires_grad = True + opt = torch.optim.AdamW(lora, lr=lr, weight_decay=0.0) + lossf = nn.SmoothL1Loss(beta=0.1) + bs = min(64, n) + net.train() + for _ in range(iters): + bi = torch.randint(0, n, (bs,)) + opt.zero_grad() + lossf(net(X[bi]), Y[bi]).backward() + opt.step() + + alpha = 16.0 + scale = alpha / rank + a1, b1 = net.fc1_lora + a2, b2 = net.fc2_lora + tensors = { + "fc1.a": a1.detach().contiguous(), + "fc1.b": (b1.detach() * scale).contiguous(), # bake scale into b + "fc2.a": a2.detach().contiguous(), + "fc2.b": (b2.detach() * scale).contiguous(), + } + from safetensors.torch import save_file + save_file(tensors, out) + return out, sum(p.numel() for p in lora), n + + +if __name__ == "__main__": + ap = argparse.ArgumentParser() + ap.add_argument("--base", required=True) + ap.add_argument("--data", required=True) + ap.add_argument("--out", required=True) + ap.add_argument("--rank", type=int, default=4) + ap.add_argument("--iters", type=int, default=400) + a = ap.parse_args() + out, np_, n = fit(a.base, a.data, a.out, a.rank, a.iters) + print(f"saved {out} | {np_} LoRA params from {n} samples " + f"(keys fc1.a/fc1.b/fc2.a/fc2.b — load with cog-pose-estimation run --adapter)") diff --git a/aether-arena/calibration/infer.py b/aether-arena/calibration/infer.py new file mode 100644 index 00000000..2d5deb79 --- /dev/null +++ b/aether-arena/calibration/infer.py @@ -0,0 +1,49 @@ +"""Run calibrated WiFi-CSI pose inference: shared base + a per-room LoRA adapter. + + python infer.py --base pose_mmfi_best.pt --adapter room_A.adapter.npz --data frames.npz + +`frames.npz` contains `X` [N,3,114,10] CSI amplitude. Prints/saves [N,17,2] keypoints in [0,1]. +Omit --adapter to run the uncalibrated (zero-shot) base. With a room adapter, expect SOTA-level +accuracy in that room/person; without one, zero-shot degrades in unseen rooms (ADR-150 §3.6). +""" +import argparse +import numpy as np +import torch + +from model import PoseNet, standardize + + +def main(): + ap = argparse.ArgumentParser() + ap.add_argument("--base", required=True) + ap.add_argument("--adapter", default=None, help="per-room .adapter.npz (omit for zero-shot)") + ap.add_argument("--data", required=True, help=".npz with X [N,3,114,10]") + ap.add_argument("--out", default=None, help="optional .npy to save [N,17,2] keypoints") + ap.add_argument("--rank", type=int, default=8) + ap.add_argument("--device", default="cuda" if torch.cuda.is_available() else "cpu") + a = ap.parse_args() + dev = a.device + + net = PoseNet().to(dev) + net.load_state_dict(torch.load(a.base, map_location=dev), strict=False) + if a.adapter: + net.add_lora(r=a.rank).to(dev) + z = np.load(a.adapter) + net.load_lora({k: z[k].astype(np.float32) for k in z.files if k.endswith(".A") or k.endswith(".B")}) + net.eval() + + X = torch.tensor(np.load(a.data)["X"].astype(np.float32)).to(dev) + Xs = standardize(X) + out = [] + with torch.no_grad(): + for i in range(0, len(Xs), 4096): + out.append(net(Xs[i:i + 4096]).cpu().numpy()) + kp = np.concatenate(out).reshape(-1, 17, 2) + print(f"inferred {len(kp)} frames | adapter={'yes' if a.adapter else 'NONE (zero-shot)'}") + if a.out: + np.save(a.out, kp) + print(f"saved keypoints -> {a.out}") + + +if __name__ == "__main__": + main() diff --git a/aether-arena/calibration/model.py b/aether-arena/calibration/model.py new file mode 100644 index 00000000..142b1f0b --- /dev/null +++ b/aether-arena/calibration/model.py @@ -0,0 +1,107 @@ +"""WiFi-CSI pose model + LoRA adapter for the RuView calibration service. + +Architecture matches the published flagship checkpoint +[`ruvnet/wifi-densepose-mmfi-pose`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose) +(`pose_mmfi_best.pt`): transformer encoder + temporal attention pooling + skeleton-graph head. + +The calibration service freezes this base and fits a tiny per-room **LoRA adapter** (rank 8 on the +input projection + pose head ≈ 11 KB) from ~100–200 labeled in-room samples. Empirically that lifts +cross-subject 64→72% and cross-environment 11→73% (ADR-150 §3.3–3.6). +""" +import numpy as np +import torch +import torch.nn as nn + +# COCO-17 skeleton edges for the graph-refinement head. +EDGES = [(0, 1), (0, 2), (1, 3), (2, 4), (5, 6), (5, 7), (7, 9), (6, 8), (8, 10), + (5, 11), (6, 12), (11, 12), (11, 13), (13, 15), (12, 14), (14, 16)] +_A = np.eye(17, dtype=np.float32) +for _i, _j in EDGES: + _A[_i, _j] = _A[_j, _i] = 1.0 +_A = _A / _A.sum(1, keepdims=True) + + +class LoRA(nn.Module): + """Low-rank adapter wrapping a frozen Linear: y = W·x + (x·A·B)·(alpha/r).""" + + def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 16): + super().__init__() + self.base = base + for p in self.base.parameters(): + p.requires_grad = False + self.A = nn.Parameter(torch.zeros(base.in_features, r)) + self.B = nn.Parameter(torch.zeros(r, base.out_features)) + nn.init.normal_(self.A, std=0.02) + self.scale = alpha / r + + def forward(self, x): + return self.base(x) + (x @ self.A @ self.B) * self.scale + + +class GR(nn.Module): + """Skeleton-graph refinement: nudges joints toward anatomically consistent positions.""" + + def __init__(self, d=256, h=96): + super().__init__() + self.je = nn.Parameter(torch.randn(17, 32) * 0.02) + self.inp = nn.Linear(d + 34, h) + self.g1 = nn.Linear(h, h) + self.g2 = nn.Linear(h, h) + self.out = nn.Linear(h, 2) + self.register_buffer("A", torch.tensor(_A)) + + def forward(self, z, kp0): + B = z.shape[0] + f = torch.relu(self.inp(torch.cat( + [z.unsqueeze(1).expand(-1, 17, -1), self.je.unsqueeze(0).expand(B, -1, -1), kp0], -1))) + f = torch.relu(self.g1(torch.einsum('ij,bjh->bih', self.A, f))) + f = torch.relu(self.g2(torch.einsum('ij,bjh->bih', self.A, f))) + return kp0 + 0.3 * torch.tanh(self.out(f)) + + +class PoseNet(nn.Module): + """Flagship pose model. Input [B,3,114,10] CSI amplitude (per-sample standardized) -> [B,34].""" + + def __init__(self, na=3, nsc=114, nt=10, d=256, L=4, H=8): + super().__init__() + self.proj = nn.Linear(na * nsc, d) + self.pos = nn.Parameter(torch.randn(1, nt, d) * 0.02) + enc = nn.TransformerEncoderLayer(d, H, d * 2, dropout=0.2, batch_first=True, activation='gelu') + self.tf = nn.TransformerEncoder(enc, L) + self.att = nn.Linear(d, 1) + self.head = nn.Sequential(nn.Linear(d, 256), nn.GELU(), nn.Dropout(0.3), nn.Linear(256, 34)) + self.gr = GR(d) + self.na, self.nsc, self.nt = na, nsc, nt + + def forward(self, x): + B = x.shape[0] + t = x.permute(0, 3, 1, 2).reshape(B, self.nt, self.na * self.nsc) + h = self.tf(self.proj(t) + self.pos) + w = torch.softmax(self.att(h), 1) + z = (h * w).sum(1) + kp0 = torch.sigmoid(self.head(z)).reshape(B, 17, 2) + return self.gr(z, kp0).reshape(B, 34) + + def add_lora(self, r=8, alpha=16): + """Wrap the input projection + pose head with LoRA adapters (the ~11 KB calibration set).""" + self.proj = LoRA(self.proj, r, alpha) + self.head[0] = LoRA(self.head[0], r, alpha) + self.head[3] = LoRA(self.head[3], r, alpha) + return self + + def lora_state(self) -> dict: + """Extract just the LoRA A/B tensors (the per-room adapter to save).""" + return {k: v.detach().cpu().numpy() for k, v in self.state_dict().items() + if k.endswith(".A") or k.endswith(".B")} + + def load_lora(self, adapter: dict): + sd = self.state_dict() + for k, v in adapter.items(): + sd[k] = torch.tensor(v) + self.load_state_dict(sd) + return self + + +def standardize(x: torch.Tensor) -> torch.Tensor: + """Per-sample standardization used in training/inference.""" + return (x - x.mean((1, 2, 3), keepdim=True)) / (x.std((1, 2, 3), keepdim=True) + 1e-6) diff --git a/aether-arena/calibration/test_calibration.py b/aether-arena/calibration/test_calibration.py new file mode 100644 index 00000000..804307ca --- /dev/null +++ b/aether-arena/calibration/test_calibration.py @@ -0,0 +1,103 @@ +"""Self-contained regression test for the RuView calibration service. + +Exercises the committed CLI end-to-end on synthetic data (CPU, no GPU, no real checkpoint): + build a base -> calibrate.py fits an adapter -> infer.py runs base+adapter -> assert the + adapter is small, inference is shape-correct and finite, and the adapter actually changes output. + +Run: python test_calibration.py (or via pytest) +""" +import json +import subprocess +import sys +import tempfile +from pathlib import Path + +import numpy as np +import torch + +HERE = Path(__file__).parent +sys.path.insert(0, str(HERE)) +from model import PoseNet, standardize # noqa: E402 + + +def _make_base(path: Path): + torch.manual_seed(0) + net = PoseNet() + # Save without the deterministic gr.A buffer (mirrors the published checkpoint; + # calibrate.py/infer.py load with strict=False). + sd = {k: v for k, v in net.state_dict().items() if k != "gr.A"} + torch.save(sd, path) + + +def _make_data(path: Path, n: int, seed: int): + rng = np.random.default_rng(seed) + X = rng.standard_normal((n, 3, 114, 10)).astype(np.float32) + Y = rng.random((n, 17, 2)).astype(np.float32) # keypoints in [0,1] + np.savez(path, X=X, Y=Y) + + +def _run(*args): + r = subprocess.run( + [sys.executable, str(HERE / args[0]), *map(str, args[1:])], + capture_output=True, text=True, + ) + assert r.returncode == 0, f"{args[0]} failed:\n{r.stdout}\n{r.stderr}" + return r.stdout + + +def test_calibration_end_to_end(): + with tempfile.TemporaryDirectory() as d: + d = Path(d) + base = d / "base.pt" + calib = d / "calib.npz" + frames = d / "frames.npz" + adapter = d / "room.adapter.npz" + kp = d / "kp.npy" + + _make_base(base) + _make_data(calib, n=40, seed=1) # ≥20 → no underfit warning + _make_data(frames, n=16, seed=2) + + # 1) calibrate -> adapter + out = _run("calibrate.py", "--base", base, "--data", calib, "--out", adapter, + "--iters", "50", "--device", "cpu") + assert adapter.exists(), "adapter not written" + assert "saved" in out.lower() + sz = adapter.stat().st_size + assert sz < 200_000, f"adapter unexpectedly large ({sz} bytes)" + + # adapter contains the expected LoRA tensors (materialize + close so the + # Windows tempdir can be cleaned up — np.load keeps a lazy file handle). + with np.load(adapter) as z: + keys = [k for k in z.files if k.endswith(".A") or k.endswith(".B")] + assert keys, f"adapter has no LoRA tensors: {z.files}" + lora = {k: z[k].astype(np.float32) for k in keys} + + # 2) infer with adapter -> keypoints + _run("infer.py", "--base", base, "--adapter", adapter, "--data", frames, + "--out", kp, "--device", "cpu") + out_kp = np.load(kp) + assert out_kp.shape == (16, 17, 2), f"bad keypoint shape {out_kp.shape}" + assert np.isfinite(out_kp).all(), "non-finite keypoints" + assert (out_kp >= 0).all() and (out_kp <= 1).all(), "keypoints out of [0,1]" + + # 3) adapter must actually change the output vs the zero-shot base + with np.load(frames) as fz: + frames_x = fz["X"][:] + net = PoseNet() + net.load_state_dict(torch.load(base, map_location="cpu"), strict=False) + net.eval() + x = standardize(torch.tensor(frames_x)) + with torch.no_grad(): + base_kp = net(x).reshape(16, 17, 2).numpy() + net.add_lora() + net.load_lora(lora) + net.eval() + with torch.no_grad(): + cal_kp = net(x).reshape(16, 17, 2).numpy() + assert np.abs(base_kp - cal_kp).sum() > 1e-4, "adapter did not change output" + + +if __name__ == "__main__": + test_calibration_end_to_end() + print("PASS: calibration service end-to-end (calibrate -> adapter -> infer)") diff --git a/aether-arena/calibration/test_cog_calibration.py b/aether-arena/calibration/test_cog_calibration.py new file mode 100644 index 00000000..661e6122 --- /dev/null +++ b/aether-arena/calibration/test_cog_calibration.py @@ -0,0 +1,75 @@ +"""Regression test for the cog-pose adapter producer (cog_calibrate.py). + +Uses the in-repo `pose_v1.safetensors` (skips if absent). Verifies the produced adapter: + - has the exact keys/shapes the Rust `cog-pose-estimation --adapter` loader expects, + - reduces calibration fit error, + - actually changes inference output, + - is tiny. +Run: python test_cog_calibration.py (or via pytest) +""" +import os +import sys +import tempfile +from pathlib import Path + +import numpy as np +import torch +import torch.nn.functional as F + +HERE = Path(__file__).parent +sys.path.insert(0, str(HERE)) +import cog_calibrate as C # noqa: E402 + +BASE = HERE / "../../v2/crates/cog-pose-estimation/cog/artifacts/pose_v1.safetensors" + + +def test_cog_adapter_producer(): + if not BASE.exists(): + print(f"(skip — {BASE} not present)") + return + from safetensors.torch import load_file + + rng = np.random.default_rng(0) + n = 120 + X = rng.standard_normal((n, 56, 20)).astype("float32") + Y = (0.5 + 0.1 * X[:, :34, 0].reshape(n, 34)).clip(0, 1).astype("float32") + + with tempfile.TemporaryDirectory() as d: + calib = os.path.join(d, "calib.npz") + adapter = os.path.join(d, "room.safetensors") + np.savez(calib, X=X, Y=Y) + + net0 = C.CogPose() + C.load_base(net0, str(BASE)) + net0.eval() + with torch.no_grad(): + base_err = F.smooth_l1_loss(net0(torch.tensor(X)), torch.tensor(Y)).item() + + _, nparam, _ = C.fit(str(BASE), calib, adapter, rank=4, iters=400) + t = load_file(adapter) + + # exact Rust loader contract: a:[in,r], b:[r,out] + assert tuple(t["fc1.a"].shape) == (128, 4) + assert tuple(t["fc1.b"].shape) == (4, 256) + assert tuple(t["fc2.a"].shape) == (256, 4) + assert tuple(t["fc2.b"].shape) == (4, 34) + + net = C.CogPose() + C.load_base(net, str(BASE)) + net.add_lora(4) + with torch.no_grad(): + net.fc1_lora[0].copy_(t["fc1.a"]); net.fc1_lora[1].copy_(t["fc1.b"] / (16 / 4)) + net.fc2_lora[0].copy_(t["fc2.a"]); net.fc2_lora[1].copy_(t["fc2.b"] / (16 / 4)) + net.eval() + with torch.no_grad(): + cal_err = F.smooth_l1_loss(net(torch.tensor(X)), torch.tensor(Y)).item() + changed = (net0(torch.tensor(X[:8])) - net(torch.tensor(X[:8]))).abs().sum().item() + + assert cal_err < base_err, f"calibration did not reduce error ({base_err} -> {cal_err})" + assert changed > 1e-3, "adapter inert" + assert nparam < 5000, f"adapter unexpectedly large ({nparam} params)" + + +if __name__ == "__main__": + test_cog_adapter_producer() + print("PASS: cog adapter producer (Rust-loadable format, reduces error, active)") diff --git a/aether-arena/fixtures/expected_score.sha256 b/aether-arena/fixtures/expected_score.sha256 new file mode 100644 index 00000000..aefe9c14 --- /dev/null +++ b/aether-arena/fixtures/expected_score.sha256 @@ -0,0 +1 @@ +9c35e541d51f00998691b98948887ebca09b907d8eb29a113f97e792340456ba diff --git a/aether-arena/fixtures/smoke_pred.json b/aether-arena/fixtures/smoke_pred.json new file mode 100644 index 00000000..03a668be --- /dev/null +++ b/aether-arena/fixtures/smoke_pred.json @@ -0,0 +1 @@ +{"frames": [{"pred": [[0.4003, 0.2734], [0.5038, 0.4197], [0.2053, 0.4438], [0.4397, 0.685], [0.5796, 0.7645], [0.8001, 0.2195], [0.2789, 0.2833], [0.314, 0.5439], [0.511, 0.2259], [0.6008, 0.46], [0.4837, 0.3879], [0.3475, 0.5597], [0.6569, 0.3575], [0.437, 0.6539], [0.2341, 0.6038], [0.7331, 0.392], [0.5615, 0.4915]]}, {"pred": [[0.4669, 0.6066], [0.6012, 0.7873], [0.4124, 0.5997], [0.2832, 0.281], [0.2732, 0.3635], [0.2503, 0.4848], [0.6827, 0.715], [0.4336, 0.7165], [0.295, 0.3386], [0.5337, 0.3544], [0.4397, 0.5474], [0.5163, 0.5528], [0.7547, 0.6799], [0.4195, 0.4448], [0.2257, 0.2269], [0.384, 0.2176], [0.2419, 0.4332]]}, {"pred": [[0.5585, 0.283], [0.4325, 0.2934], [0.463, 0.4744], [0.4188, 0.3454], [0.215, 0.7565], [0.527, 0.2353], [0.7084, 0.6124], [0.3015, 0.6744], [0.4103, 0.3532], [0.7243, 0.6932], [0.3302, 0.4918], [0.2072, 0.3754], [0.7914, 0.4878], [0.7618, 0.4079], [0.323, 0.3386], [0.7104, 0.4997], [0.2673, 0.6077]]}, {"pred": [[0.6372, 0.4984], [0.4184, 0.6763], [0.4498, 0.7549], [0.2924, 0.303], [0.3069, 0.7022], [0.3954, 0.5098], [0.7836, 0.6071], [0.4733, 0.7114], [0.3407, 0.3793], [0.3408, 0.4678], [0.4156, 0.4911], [0.4525, 0.7519], [0.5117, 0.1985], [0.1893, 0.6784], [0.6281, 0.5346], [0.5175, 0.673], [0.36, 0.3665]]}, {"pred": [[0.5535, 0.6537], [0.568, 0.511], [0.4705, 0.5377], [0.6372, 0.7163], [0.5493, 0.7515], [0.2559, 0.4549], [0.2553, 0.6176], [0.2991, 0.6154], [0.7185, 0.7986], [0.4586, 0.5057], [0.2975, 0.4525], [0.3263, 0.3719], [0.5131, 0.4576], [0.557, 0.5268], [0.6572, 0.7736], [0.2146, 0.6526], [0.4662, 0.7371]]}, {"pred": [[0.2924, 0.7595], [0.2612, 0.2315], [0.2488, 0.7751], [0.2329, 0.7282], [0.4744, 0.4206], [0.3618, 0.267], [0.2477, 0.285], [0.3976, 0.3746], [0.494, 0.2874], [0.3596, 0.2112], [0.3311, 0.4692], [0.6912, 0.4727], [0.4434, 0.5233], [0.4139, 0.7048], [0.425, 0.3937], [0.2326, 0.631], [0.2655, 0.7116]]}, {"pred": [[0.3609, 0.3437], [0.285, 0.486], [0.7734, 0.5468], [0.3657, 0.4093], [0.4728, 0.5019], [0.1866, 0.3545], [0.2172, 0.2028], [0.5613, 0.5238], [0.6252, 0.7205], [0.7998, 0.2954], [0.242, 0.7063], [0.6259, 0.6883], [0.5148, 0.7141], [0.5577, 0.7434], [0.3233, 0.2131], [0.2652, 0.7066], [0.5753, 0.5885]]}, {"pred": [[0.6787, 0.6504], [0.6051, 0.2297], [0.2539, 0.3475], [0.6437, 0.7807], [0.4981, 0.6149], [0.5716, 0.2367], [0.6486, 0.3632], [0.2433, 0.369], [0.6061, 0.3731], [0.4955, 0.2591], [0.7676, 0.7602], [0.6899, 0.7716], [0.3143, 0.7707], [0.3031, 0.4997], [0.7076, 0.5133], [0.3382, 0.7196], [0.2002, 0.4871]]}]} \ No newline at end of file diff --git a/aether-arena/fixtures/smoke_split.json b/aether-arena/fixtures/smoke_split.json new file mode 100644 index 00000000..f81fd7ae --- /dev/null +++ b/aether-arena/fixtures/smoke_split.json @@ -0,0 +1 @@ +{"frames": [{"gt": [[0.3943, 0.2905], [0.5215, 0.4194], [0.2225, 0.4602], [0.4547, 0.6961], [0.5765, 0.7686], [0.7858, 0.2279], [0.2866, 0.2707], [0.3084, 0.549], [0.5286, 0.2377], [0.6082, 0.4566], [0.4719, 0.3799], [0.3465, 0.5447], [0.6377, 0.3728], [0.4509, 0.6543], [0.2235, 0.6009], [0.7253, 0.3882], [0.5479, 0.4737]], "vis": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "scale": 1.0}, {"gt": [[0.4845, 0.5985], [0.5883, 0.7959], [0.4315, 0.6012], [0.3008, 0.2703], [0.2776, 0.3486], [0.2483, 0.4695], [0.6916, 0.7184], [0.4153, 0.7305], [0.3057, 0.3392], [0.5535, 0.3576], [0.4216, 0.5398], [0.5093, 0.5706], [0.7397, 0.668], [0.4354, 0.4394], [0.2373, 0.2404], [0.404, 0.2315], [0.2609, 0.4182]], "vis": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "scale": 1.0}, {"gt": [[0.5684, 0.2891], [0.4185, 0.2737], [0.4796, 0.4903], [0.4056, 0.3589], [0.2139, 0.7706], [0.5259, 0.2162], [0.718, 0.6177], [0.3002, 0.6632], [0.3978, 0.3338], [0.7116, 0.6836], [0.336, 0.5106], [0.2168, 0.3677], [0.7739, 0.4683], [0.773, 0.4188], [0.318, 0.3226], [0.7043, 0.4877], [0.2509, 0.5964]], "vis": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "scale": 1.0}, {"gt": [[0.6501, 0.4868], [0.3995, 0.6805], [0.4408, 0.7681], [0.2762, 0.2907], [0.2877, 0.6959], [0.4102, 0.5292], [0.7825, 0.5898], [0.4603, 0.723], [0.3511, 0.3758], [0.3556, 0.4514], [0.4123, 0.4749], [0.4524, 0.7506], [0.5141, 0.2112], [0.2024, 0.6795], [0.6351, 0.5339], [0.5333, 0.6706], [0.3491, 0.3662]], "vis": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "scale": 1.0}, {"gt": [[0.537, 0.656], [0.5675, 0.5033], [0.4714, 0.52], [0.6195, 0.7259], [0.5357, 0.766], [0.273, 0.4653], [0.2439, 0.6017], [0.2927, 0.6297], [0.7297, 0.7805], [0.439, 0.4924], [0.2969, 0.4589], [0.3174, 0.3911], [0.5324, 0.4643], [0.5744, 0.5074], [0.673, 0.783], [0.2238, 0.6674], [0.4534, 0.7468]], "vis": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "scale": 1.0}, {"gt": [[0.2896, 0.7515], [0.2537, 0.2345], [0.2434, 0.763], [0.2502, 0.7137], [0.4723, 0.4035], [0.3607, 0.2775], [0.2657, 0.2969], [0.3872, 0.383], [0.5001, 0.3067], [0.3503, 0.2092], [0.3137, 0.4849], [0.6914, 0.4593], [0.4359, 0.504], [0.4056, 0.6994], [0.4428, 0.4085], [0.2424, 0.6445], [0.2507, 0.7048]], "vis": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "scale": 1.0}, {"gt": [[0.3692, 0.3453], [0.2945, 0.4675], [0.7836, 0.5282], [0.3857, 0.414], [0.4848, 0.5017], [0.203, 0.3585], [0.225, 0.2135], [0.5513, 0.5175], [0.6296, 0.7275], [0.7908, 0.2897], [0.2263, 0.7012], [0.6403, 0.6873], [0.5026, 0.701], [0.5504, 0.7357], [0.338, 0.2187], [0.2629, 0.7015], [0.5757, 0.6084]], "vis": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "scale": 1.0}, {"gt": [[0.6786, 0.649], [0.5956, 0.2396], [0.2447, 0.3593], [0.6439, 0.7854], [0.4874, 0.6102], [0.5857, 0.2465], [0.6459, 0.3827], [0.2364, 0.3613], [0.6054, 0.3745], [0.4798, 0.2711], [0.7869, 0.7618], [0.6919, 0.7809], [0.3259, 0.7674], [0.285, 0.5144], [0.6921, 0.5052], [0.3388, 0.7386], [0.2022, 0.495]], "vis": [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0], "scale": 1.0}]} \ No newline at end of file diff --git a/aether-arena/ledger/ledger.jsonl b/aether-arena/ledger/ledger.jsonl new file mode 100644 index 00000000..7767059c --- /dev/null +++ b/aether-arena/ledger/ledger.jsonl @@ -0,0 +1,5 @@ +{"benchmark": "AetherArena", "created": "2026-05-30", "kind": "genesis", "note": "Official Spatial-Intelligence Benchmark \u2014 append-only signed ledger. Entries are real harness scores only; no seeded numbers.", "prev_hash": "0000000000000000000000000000000000000000000000000000000000000000", "row_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "seq": 0, "spec": "ADR-149"} +{"abs_gain": "+9.38", "benchmark": "MM-Fi", "category": "pose", "caveat": "Protocol-matched MM-Fi random_split result; NOT solved real-world generalization. Random split has temporal/subject-adjacency effects common to this benchmark family. Leakage-free cross-subject is far lower (~11-27%) and is the real deployment frontier.", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20 (||right_shoulder-left_hip|| norm, 17 COCO kpts)", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer (4L/8H ~2M params, temporal-attention)", "prev_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "protocol": "random_split (ratio=0.8, seed=0)", "rel_gain": "+13.0%", "reproduce": "download MM-Fi -> parse_mmfi_zips.py -> train_tf_torso.py X.npy Y.npy split_random.npy (seed 0)", "row_hash": "76598d8e1320d5248f8cd854a8ffa22a99bd2a2f0e0e7f2d2b1df79af16001d5", "score_pct": 81.63, "scored_at": "2026-05-30", "seq": 1, "sota_ref": "MultiFormer 72.25 (CSI2Pose 68.41)", "submitter": "ruvnet", "tier": "Gold"} +{"abs_gain": "+11.34", "benchmark": "MM-Fi", "category": "pose", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer + skeleton-graph head + 3-ensemble + TTA", "note": "Best in-domain. Stacks attention-pooling + transformer + skeleton-graph refine + warmup + TTA + 3-model ensemble. Supersedes the 81.63 single-model entry.", "prev_hash": "76598d8e1320d5248f8cd854a8ffa22a99bd2a2f0e0e7f2d2b1df79af16001d5", "protocol": "random_split (0.8, seed 0)", "row_hash": "5780a4bc3e98eb0e30c1ecfa9091e57b280444fa1f21cd5146797e408580e4ab", "score_pct": 83.59, "scored_at": "2026-05-30", "seq": 2, "sota_ref": "MultiFormer 72.25 (CSI2Pose 68.41)", "submitter": "ruvnet", "tier": "Gold"} +{"benchmark": "MM-Fi", "category": "pose", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer", "note": "Leakage-free generalization to unseen people, shared rooms. Honest deployment-relevant number.", "prev_hash": "5780a4bc3e98eb0e30c1ecfa9091e57b280444fa1f21cd5146797e408580e4ab", "protocol": "cross_subject (official, val=S05,S10,..,S40)", "row_hash": "d989e4e1dbc0182610305fdfbde8b094413b87c913283a46bf41f4afba7a06fd", "score_pct": 64.04, "scored_at": "2026-05-30", "seq": 3, "sota_ref": "(no matched public ref)", "submitter": "ruvnet", "tier": "Silver"} +{"benchmark": "MM-Fi", "category": "pose", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer + CORAL domain alignment", "note": "The real deployment frontier (new room). CORAL transductive DG (+30% rel over control). Data-bound: MM-Fi has only 3 source rooms.", "prev_hash": "d989e4e1dbc0182610305fdfbde8b094413b87c913283a46bf41f4afba7a06fd", "protocol": "cross_environment (train E01-03 -> test E04, new room)", "row_hash": "bf370487bde88e198c13877956dab3c83766a6a24afef0b78b6ac7aa130bb207", "score_pct": 17.51, "scored_at": "2026-05-30", "seq": 4, "sota_ref": "(hard frontier; control 13.52)", "submitter": "ruvnet", "tier": "Bronze"} diff --git a/aether-arena/ledger/ledger_tools.py b/aether-arena/ledger/ledger_tools.py new file mode 100644 index 00000000..1da5cff4 --- /dev/null +++ b/aether-arena/ledger/ledger_tools.py @@ -0,0 +1,100 @@ +#!/usr/bin/env python3 +"""AetherArena append-only, tamper-evident results ledger (ADR-149 §2.3/§2.4). + +Each row is hash-chained to the previous one: ``row_hash = sha256(canonical_row ++ prev_hash)``. Any silent edit to an earlier row breaks every subsequent +``prev_hash`` link, so the ledger is append-only and verifiable by anyone — no +trust in the maintainer required. (Ed25519 row signing is the next hardening; +the chain already makes tampering detectable.) + +Usage: + python ledger_tools.py seed # (re)build ledger.jsonl with genesis + baseline + python ledger_tools.py verify # verify the whole chain -> exit 0 / 1 + python ledger_tools.py append '' # append one scored row +""" +import hashlib +import json +import sys +from pathlib import Path + +LEDGER = Path(__file__).parent / "ledger.jsonl" +GENESIS_PREV = "0" * 64 + + +def canonical(row: dict) -> bytes: + # Stable key order, no whitespace -> deterministic bytes for hashing. + body = {k: row[k] for k in sorted(row) if k != "row_hash"} + return json.dumps(body, separators=(",", ":"), sort_keys=True).encode() + + +def row_hash(row: dict) -> str: + return hashlib.sha256(canonical(row)).hexdigest() + + +def read_rows() -> list[dict]: + if not LEDGER.exists(): + return [] + return [json.loads(l) for l in LEDGER.read_text().splitlines() if l.strip()] + + +def append(entry: dict) -> dict: + rows = read_rows() + prev = rows[-1]["row_hash"] if rows else GENESIS_PREV + entry = dict(entry) + entry["seq"] = len(rows) + entry["prev_hash"] = prev + entry["row_hash"] = row_hash(entry) + with LEDGER.open("a") as f: + f.write(json.dumps(entry, sort_keys=True) + "\n") + return entry + + +def verify() -> bool: + rows = read_rows() + prev = GENESIS_PREV + for i, r in enumerate(rows): + if r.get("seq") != i: + print(f"FAIL: row {i} seq mismatch ({r.get('seq')})") + return False + if r.get("prev_hash") != prev: + print(f"FAIL: row {i} prev_hash broken — ledger was edited") + return False + if r.get("row_hash") != row_hash(r): + print(f"FAIL: row {i} row_hash mismatch — row was tampered") + return False + prev = r["row_hash"] + print(f"OK: {len(rows)} rows, chain intact") + return True + + +def seed(): + """Rebuild with the genesis row only — an EMPTY board. + + Benchmark-first: no placeholder/hand-entered numbers ever sit on the + leaderboard. Every result row is produced by the real scoring pipeline + (load model -> run inference -> score against the private eval split -> + proof hash). The board starts empty and awaits the first real harness score, + including RuView's own — which gets no special seeding. + """ + if LEDGER.exists(): + LEDGER.unlink() + append({ + "kind": "genesis", + "benchmark": "AetherArena", + "spec": "ADR-149", + "note": "Official Spatial-Intelligence Benchmark — append-only signed ledger. " + "Entries are real harness scores only; no seeded numbers.", + "created": "2026-05-30", + }) + + +if __name__ == "__main__": + cmd = sys.argv[1] if len(sys.argv) > 1 else "verify" + if cmd == "seed": + seed(); verify() + elif cmd == "verify": + sys.exit(0 if verify() else 1) + elif cmd == "append": + print(json.dumps(append(json.loads(sys.argv[2])), indent=2)) + else: + print(__doc__); sys.exit(2) diff --git a/aether-arena/schema/aa-submission.toml b/aether-arena/schema/aa-submission.toml new file mode 100644 index 00000000..fd968d30 --- /dev/null +++ b/aether-arena/schema/aa-submission.toml @@ -0,0 +1,41 @@ +# AetherArena submission manifest (ADR-149 §2.2). +# Accompanies a model artifact pushed to the AA Hugging Face Space. +# This file is the contract the Space validates before quarantine + scoring. + +[submission] +# Free-form display name shown on the leaderboard. +name = "my-spatial-model" +# Hugging Face repo or URL of the model artifact (.safetensors / .rvf / LoRA adapter). +model_ref = "hf://your-org/your-model" +# Submitter handle (HF username / org). Used to sign the ledger row. +submitter = "your-hf-username" +# SPDX license of the submitted model. +license = "Apache-2.0" + +[category] +# One of: pose | presence | tracking | vitals | multi-task +# v0 ranks: pose, presence (tracking/vitals activate when ground truth lands). +primary = "pose" + +[input] +# Which ADR-145 FeatureSet the model consumes. v0 input is RF/WiFi CSI. +# F0 = CSI amplitude/phase F1 = +CIR F2 = +Doppler F3 = +BFLD +feature_set = "F0" +# Tensor I/O contract so the scorer can feed the model correctly. +input_shape = [114, 2] # subcarriers × {amp, phase} (example) +output_shape = [17, 2] # 17 keypoints × {x, y} normalised [0,1] +# Normalisation expected on the input ("none" | "zscore" | "minmax"). +normalization = "zscore" + +[runtime] +# Inference entrypoint inside the artifact (framework-specific). +framework = "candle" # candle | onnx | torch +# Optional: target the edge-latency category with a declared device class. +device_class = "cpu" # cpu | pi5 | gpu + +# Notes: +# - You submit a MODEL, never predictions on data you hold. +# - Scoring runs against a PRIVATE MM-Fi held-out split in a no-network, +# read-only sandbox. You cannot see the eval data. +# - The resulting score is a signed, append-only ledger row carrying a +# determinism proof hash and the pinned harness_version. diff --git a/aether-arena/space/README.md b/aether-arena/space/README.md new file mode 100644 index 00000000..c88945d1 --- /dev/null +++ b/aether-arena/space/README.md @@ -0,0 +1,37 @@ +--- +title: AetherArena — Spatial-Intelligence Benchmark +emoji: 📡 +colorFrom: indigo +colorTo: purple +sdk: gradio +sdk_version: 5.9.1 +python_version: "3.12" +app_file: app.py +pinned: true +license: cc-by-nc-4.0 +tags: + - benchmark + - leaderboard + - wifi-sensing + - spatial-intelligence + - pose-estimation +--- + +# AetherArena ("AA") — The Official Spatial-Intelligence Benchmark + +> Public leaderboard. Private evaluation split. Open scorer. Signed results. + +The field's standard yardstick for camera-free **spatial intelligence** (pose, presence, +occupancy, tracking, vitals) from RF/WiFi and, over time, mmWave / UWB / multimodal. + +- **Project-agnostic** — any team, framework, or modality enters; RuView donated the seed + scorer and is scored like everyone else. +- **Benchmark-first** — the board starts empty; every row is a real scoring-pipeline + **witness** (`inputs_sha256` + `proof_sha256` + `harness_version`) in an append-only, + hash-chained, tamper-evident ledger. +- **Reproducible** — the scorer is open; reproduce any proof hash + repeatability locally. + +Spec: [ADR-149](https://github.com/ruvnet/RuView/blob/main/docs/adr/ADR-149-public-community-leaderboard-huggingface.md). +Source + open scorer: https://github.com/ruvnet/RuView/tree/main/aether-arena + +Non-commercial (CC BY-NC 4.0): the v0 eval split derives from MM-Fi (CC BY-NC); AA is operated non-commercially. diff --git a/aether-arena/space/app.py b/aether-arena/space/app.py new file mode 100644 index 00000000..7f7db81d --- /dev/null +++ b/aether-arena/space/app.py @@ -0,0 +1,161 @@ +"""AetherArena ("AA") — The Official Spatial-Intelligence Benchmark. + +Hugging Face Space (Gradio) — the public face of the benchmark (ADR-149). +This Space is the presentation + submission layer; the heavy scoring runs in the +pinned RuView harness (CI / scorer container), and results land in the append-only, +hash-chained **witness ledger** shown here. + +Benchmark-first: the board starts EMPTY. No seeded or hand-entered numbers — every +row is a real scoring-pipeline witness (inputs_sha256 + proof_sha256 + harness_version). +""" +import hashlib +import json +from pathlib import Path + +import gradio as gr + +LEDGER = Path(__file__).parent / "ledger.jsonl" +GENESIS_PREV = "0" * 64 + + +def _rows(): + if not LEDGER.exists(): + return [] + return [json.loads(l) for l in LEDGER.read_text().splitlines() if l.strip()] + + +def _canon(row: dict) -> bytes: + body = {k: row[k] for k in sorted(row) if k != "row_hash"} + return json.dumps(body, separators=(",", ":"), sort_keys=True).encode() + + +def verify_chain(): + rows, prev = _rows(), GENESIS_PREV + for i, r in enumerate(rows): + if r.get("prev_hash") != prev or r.get("row_hash") != hashlib.sha256(_canon(r)).hexdigest(): + return f"❌ Ledger chain BROKEN at row {i} — tampering detected." + prev = r["row_hash"] + return f"✅ Witness ledger chain intact — {len(rows)} row(s), append-only." + + +def leaderboard(category: str): + results = [r for r in _rows() if r.get("kind") == "result" and (category == "all" or r.get("category") == category)] + if not results: + return [["— no entries yet —", "", "", "", "", ""]] + results.sort(key=lambda r: r.get("score_pct") or 0, reverse=True) + return [[ + r.get("submitter", "?"), + r.get("model_ref", "?"), + f"{r.get('benchmark','?')} / {r.get('protocol','?')}", + r.get("metric", "?"), + f"{r.get('score_pct', 0):.2f}%", + f"{r.get('tier','?')} (vs {r.get('sota_ref','?')})", + ] for r in results] + + +FOUR_PART = "### Public leaderboard. Private evaluation split. Open scorer. Signed results." + +ABOUT = """ +**AetherArena** is the official, project-agnostic **Spatial-Intelligence Benchmark** — +camera-free pose, presence, occupancy, tracking, and vitals from RF/WiFi (and, over +time, mmWave / UWB / radar / multimodal). It is **not** a single-vendor board: any +team, framework, or modality enters, and every entrant — including the RuView baseline +that donated the seed scorer — is scored by the identical, open, pinned harness. + +The scorer reuses RuView's released `wifi-densepose-train` acceptance harness +(`ruview_metrics` + ablation). You submit a **model, not predictions**; it is scored +against a **private** MM-Fi held-out split; one **witness** row (inputs hash + proof +hash + harness version) is appended to a **hash-chained, tamper-evident ledger**. + +**For industry:** a vendor-neutral, auditable way to compare RF-sensing models on equal +footing — the same standardized splits, the same metric definition, the same signed, +reproducible ledger. No more "trust our number on our split." Vendors, labs, and startups +all submit through one pipeline and are scored identically. + +**Generalization Track (roadmap):** the headline isn't a single in-domain number — it's a +battery of honest tracks: MM-Fi `random_split` (in-domain), `cross_subject` (unseen people), +cross-room, cross-device, and confidence-calibration (ECE). Cross-subject is the real +deployment frontier and is treated as the flagship hard benchmark. + +Spec: ADR-149. v0 ranks **pose, presence, edge-latency, determinism**. Tracking & +vitals activate when their ground truth lands; **privacy-leakage** is gated until the +membership-inference attacker ships. Source + the open scorer: +https://github.com/ruvnet/RuView/tree/main/aether-arena +""" + +SUBMIT = """ +### Submit a model + +1. Write a manifest — [`schema/aa-submission.toml`](https://github.com/ruvnet/RuView/blob/main/aether-arena/schema/aa-submission.toml): + declare your model ref, category, the ADR-145 feature set (F0 CSI … F3 BFLD), and the tensor I/O contract. +2. Provide your model artifact (`.safetensors` / `.rvf` / LoRA adapter). +3. It moves through `submitted → validated → quarantined → smoke_scored → full_scored → published`, + scored in a no-network, read-only sandbox against the private split. +4. Your signed witness row appears on the leaderboard. + +**You submit a model, never predictions** — predictions on data you hold prove nothing. +""" + +VERIFY = """ +### Verify it's fair (you don't have to trust us) + +The scorer is open and reproducible. Reproduce the determinism proof + repeatability locally: + +```bash +git clone https://github.com/ruvnet/RuView && cd RuView/v2 +# determinism gate (same as CI): +cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features +# repeatability — N runs, one identical proof hash: +cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --repeat 16 +# verify the append-only witness ledger chain: +cd ../aether-arena/ledger && python3 ledger_tools.py verify +``` + +A stranger must be able to: submit → get a deterministic score → see the signed row → +rerun the scorer locally → understand why the rank is fair. That is the launch gate (ADR-149 §7). +""" + +with gr.Blocks(title="AetherArena — Spatial-Intelligence Benchmark") as demo: + gr.Markdown("# 📡 AetherArena (AA)\n## The Official, Vendor-Neutral Benchmark for WiFi / RF Spatial Sensing") + gr.Markdown(FOUR_PART) + gr.Markdown( + "**An open industry benchmark — for everyone, not any one vendor.** Submit any model, any framework, " + "any modality. Every entrant — academic, startup, or incumbent — is scored *identically*: standardized " + "protocols (MM-Fi `random_split` / `cross_subject`), matched metrics (torso-PCK@20, the published " + "definition), and an auditable, hash-chained **witness ledger** anyone can verify and reproduce.\n\n" + "**Why it exists:** WiFi/RF-sensing results are reported with inconsistent splits, metrics, and no " + "auditability — so numbers aren't comparable. AetherArena fixes the *measurement*: one protocol, one " + "metric, one signed ledger, one-command reproduction. The benchmark is the product; the leaderboard is " + "just the scoreboard. (Reference implementation seeded by RuView, ADR-149.)" + ) + chain = gr.Markdown(verify_chain()) + + with gr.Tab("🏆 Leaderboard"): + gr.Markdown( + "### Current standings — MM-Fi WiFi-CSI 2D pose, torso-PCK@20\n" + "Ranked, protocol- & metric-matched results. Each row carries its own caveats in the ledger " + "(e.g. `random_split` has temporal-adjacency leakage that inflates *all* methods equally — the " + "leakage-free `cross_subject` track is the real deployment frontier). **Submit yours — top the board.**" + ) + cat = gr.Dropdown(["all", "pose", "presence"], value="all", label="Category") + tbl = gr.Dataframe( + headers=["Submitter", "Model", "Benchmark / Protocol", "Metric", "Score", "Tier (vs prior SOTA)"], + value=leaderboard("all"), interactive=False, wrap=True, + ) + cat.change(leaderboard, cat, tbl) + gr.Markdown( + "*Vendor-neutral & benchmark-first: every row is a real, metric- and protocol-matched result — " + "no seeded or vendor-favored numbers. Integrity is enforced, not promised: the current top entry's " + "score was self-corrected down from an inflated metric (91.86% bbox → 81.63% torso) before it could " + "be published. The same scorer and ledger apply to every submitter.*" + ) + + with gr.Tab("📤 Submit"): + gr.Markdown(SUBMIT) + with gr.Tab("🔬 Verify"): + gr.Markdown(VERIFY) + with gr.Tab("ℹ️ About"): + gr.Markdown(ABOUT) + +if __name__ == "__main__": + demo.launch(server_name="0.0.0.0", server_port=7860) diff --git a/aether-arena/space/ledger.jsonl b/aether-arena/space/ledger.jsonl new file mode 100644 index 00000000..7767059c --- /dev/null +++ b/aether-arena/space/ledger.jsonl @@ -0,0 +1,5 @@ +{"benchmark": "AetherArena", "created": "2026-05-30", "kind": "genesis", "note": "Official Spatial-Intelligence Benchmark \u2014 append-only signed ledger. Entries are real harness scores only; no seeded numbers.", "prev_hash": "0000000000000000000000000000000000000000000000000000000000000000", "row_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "seq": 0, "spec": "ADR-149"} +{"abs_gain": "+9.38", "benchmark": "MM-Fi", "category": "pose", "caveat": "Protocol-matched MM-Fi random_split result; NOT solved real-world generalization. Random split has temporal/subject-adjacency effects common to this benchmark family. Leakage-free cross-subject is far lower (~11-27%) and is the real deployment frontier.", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20 (||right_shoulder-left_hip|| norm, 17 COCO kpts)", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer (4L/8H ~2M params, temporal-attention)", "prev_hash": "940bdc6f0f5dd00f4d89e13a8fa843bab3c9ddf1b8051f426a1701e730249231", "protocol": "random_split (ratio=0.8, seed=0)", "rel_gain": "+13.0%", "reproduce": "download MM-Fi -> parse_mmfi_zips.py -> train_tf_torso.py X.npy Y.npy split_random.npy (seed 0)", "row_hash": "76598d8e1320d5248f8cd854a8ffa22a99bd2a2f0e0e7f2d2b1df79af16001d5", "score_pct": 81.63, "scored_at": "2026-05-30", "seq": 1, "sota_ref": "MultiFormer 72.25 (CSI2Pose 68.41)", "submitter": "ruvnet", "tier": "Gold"} +{"abs_gain": "+11.34", "benchmark": "MM-Fi", "category": "pose", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer + skeleton-graph head + 3-ensemble + TTA", "note": "Best in-domain. Stacks attention-pooling + transformer + skeleton-graph refine + warmup + TTA + 3-model ensemble. Supersedes the 81.63 single-model entry.", "prev_hash": "76598d8e1320d5248f8cd854a8ffa22a99bd2a2f0e0e7f2d2b1df79af16001d5", "protocol": "random_split (0.8, seed 0)", "row_hash": "5780a4bc3e98eb0e30c1ecfa9091e57b280444fa1f21cd5146797e408580e4ab", "score_pct": 83.59, "scored_at": "2026-05-30", "seq": 2, "sota_ref": "MultiFormer 72.25 (CSI2Pose 68.41)", "submitter": "ruvnet", "tier": "Gold"} +{"benchmark": "MM-Fi", "category": "pose", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer", "note": "Leakage-free generalization to unseen people, shared rooms. Honest deployment-relevant number.", "prev_hash": "5780a4bc3e98eb0e30c1ecfa9091e57b280444fa1f21cd5146797e408580e4ab", "protocol": "cross_subject (official, val=S05,S10,..,S40)", "row_hash": "d989e4e1dbc0182610305fdfbde8b094413b87c913283a46bf41f4afba7a06fd", "score_pct": 64.04, "scored_at": "2026-05-30", "seq": 3, "sota_ref": "(no matched public ref)", "submitter": "ruvnet", "tier": "Silver"} +{"benchmark": "MM-Fi", "category": "pose", "harness_version": 1, "kind": "result", "metric": "torso-PCK@20", "modality": "wifi-csi", "model_ref": "RuView CSI-Transformer + CORAL domain alignment", "note": "The real deployment frontier (new room). CORAL transductive DG (+30% rel over control). Data-bound: MM-Fi has only 3 source rooms.", "prev_hash": "d989e4e1dbc0182610305fdfbde8b094413b87c913283a46bf41f4afba7a06fd", "protocol": "cross_environment (train E01-03 -> test E04, new room)", "row_hash": "bf370487bde88e198c13877956dab3c83766a6a24afef0b78b6ac7aa130bb207", "score_pct": 17.51, "scored_at": "2026-05-30", "seq": 4, "sota_ref": "(hard frontier; control 13.52)", "submitter": "ruvnet", "tier": "Bronze"} diff --git a/aether-arena/space/requirements.txt b/aether-arena/space/requirements.txt new file mode 100644 index 00000000..3045897d --- /dev/null +++ b/aether-arena/space/requirements.txt @@ -0,0 +1 @@ +gradio==5.9.1 diff --git a/archive/v1/data/proof/expected_cir_features.sha256 b/archive/v1/data/proof/expected_cir_features.sha256 index 7d9d7cb1..6d55615d 100644 --- a/archive/v1/data/proof/expected_cir_features.sha256 +++ b/archive/v1/data/proof/expected_cir_features.sha256 @@ -1 +1 @@ -120bd7b1f549f57f3773971a389c48c2bdd99b4ab1f205935867a16e95583995 +304d54690af468dc6cbf0f2a1332f109cf187d5e2eab454efd8554cebc45bdeb diff --git a/docs/adr/ADR-149-public-community-leaderboard-huggingface.md b/docs/adr/ADR-149-public-community-leaderboard-huggingface.md new file mode 100644 index 00000000..532224bb --- /dev/null +++ b/docs/adr/ADR-149-public-community-leaderboard-huggingface.md @@ -0,0 +1,289 @@ +# ADR-149: AetherArena ("AA") — The Official Spatial-Intelligence Benchmark (Hugging Face) + +> **Scope note:** AetherArena is a **standalone, project-agnostic benchmark** for spatial intelligence — open to *any* project, team, or modality, not a RuView-branded board. RuView contributes the initial scoring harness and enters as one baseline among others; it gets no special treatment. This ADR lives in the RuView repo only because RuView is donating the seed harness — the benchmark itself is independent. + +| Field | Value | +|-------|-------| +| **Status** | Accepted | +| **Date** | 2026-05-30 | +| **Deciders** | ruv | +| **Gate decisions** | Name **locked**: `ruvnet/aether-arena` ("AA"), positioned as the official cross-project Spatial-Intelligence Benchmark. v0 ranked metrics **locked**: pose, presence, edge-latency, determinism. Dataset legality **resolved**: MM-Fi (CC BY-NC 4.0) only for v0; Wi-Pose dropped (research-use, no redistribution). | +| **Codebase target** | New repo `ruvnet/aether-arena` (leaderboard + HF Space); reuses `wifi-densepose-train` (`src/ruview_metrics.rs`, `src/ablation.rs`, `src/eval.rs`, `src/proof.rs`) and `wifi-densepose-cli` as the scoring engine | +| **Relates to** | ADR-011 (Deterministic Proof Harness), ADR-015 (Public Dataset Training Strategy — MM-Fi / Wi-Pose), ADR-024 (Contrastive CSI Embedding / HF model release), ADR-027 (Cross-Environment Domain Generalization / MERIDIAN), ADR-031 (RuView Sensing-First RF Mode — `RuViewTier` acceptance), ADR-079 (Camera-Supervised Pose Fine-tune — PCK@20), ADR-120 / ADR-141 (BFLD Privacy), ADR-145 (Ablation Eval Harness — the scoring substrate) | + +--- + +## 1. Context + +### 1.1 The Gap + +RuView has a mature, deterministic evaluation surface but **no public face for it**. Two assets already exist: + +1. **A grading harness.** `wifi-densepose-train/src/ruview_metrics.rs` rolls pose (PCK@0.2 / OKS / torso jitter / p95 error), tracking (MOTA / ID-switches / fragmentation), and vitals (breathing/heartbeat BPM error + SNR) into a `RuViewAcceptanceResult` with a `RuViewTier` (`Fail` / `Bronze` / `Silver` / `Gold`). ADR-145's `src/ablation.rs` extends this with presence accuracy, localization error, FP/FN, latency p50/p95/p99, a privacy-leakage score ∈ `[0,1]`, and cross-room degradation, under a determinism binding inherited from the ADR-011 proof harness. + +2. **A determinism substrate.** `proof.rs` (`PROOF_SEED=42`) SHA-256-hashes model outputs against an expected hash, so a scored run is reproducible and tamper-evident. + +What is missing is a **public, multi-entrant ranking**. As surveyed in ADR-015 and `docs/research/sota-surveys/sota-wifi-sensing-2025.md`, the WiFi-sensing field has **no hosted live leaderboard** the way vision has COCO/EvalAI — researchers self-report numbers against public *datasets* (MM-Fi, Wi-Pose, Person-in-WiFi, Widar3.0) in papers, with inconsistent splits, metrics, and no privacy or latency accounting. RuView's own pose number (PCK@20 ≈ 2.5% with proxy labels, target 35%+ per ADR-079) is currently self-reported on a private validation set and is not comparable to the MM-Fi SOTA (MultiFormer 0.7225). + +### 1.2 The Opportunity + +The harness that already gates RuView releases is exactly the engine a community leaderboard needs: a single, deterministic, privacy- and latency-aware scoring function. Publishing it as an open leaderboard: + +- Establishes **AetherArena as the field's standard yardstick** for spatial intelligence, with RuView's `RuViewTier` + ADR-145 metric set contributed as its initial basis (pose + tracking + vitals + **privacy-leakage** + latency + determinism — a combination no existing benchmark scores). The standard is AA's; RuView donates the seed. +- Draws **any project, framework, or modality** to submit and rank — a cross-project community flywheel, not a RuView-only one (RuView's `wifi-densepose-pretrained` is merely the first baseline). +- Forces the harness to harden: a public, neutral scorer must be reproducible by strangers, resistant to gaming, and runnable on a fixed held-out split nobody can train on. + +### 1.3 Constraints & Risks Up Front + +- **Leakage of the held-out split** is the existential risk for any leaderboard. The eval data must be private; submitters provide a model, not predictions on data they hold. +- **Compute cost.** Scoring a submission runs inference over the eval set; an HF Space on free CPU may be too slow for the Candle/`tch` pipeline. Tiering of compute (CPU smoke vs GPU full score) is required. +- **Privacy / consent of the eval data.** MM-Fi and Wi-Pose carry their own licenses; we can host *derived* CSI features and scores but must respect redistribution terms (ADR-015 already tracks this). +- **Trust.** A `RuViewTier` badge is only meaningful if the scoring is deterministic and the leaderboard cannot be silently edited — the ADR-011 proof hash and a signed results ledger address this. + +--- + +## 2. Decision + +**Create AetherArena ("AA") — the official, project-agnostic Spatial-Intelligence Benchmark: a public, open-entry leaderboard for camera-free spatial perception (pose, presence, occupancy, tracking, vitals) as a standalone repo `ruvnet/aether-arena` paired with a Hugging Face Space. The scoring engine is seeded by RuView's existing `ruview_metrics` + ADR-145 ablation harness, contributed as a neutral scorer; v0 evaluates against a private MM-Fi held-out split.** + +AA is **not a RuView leaderboard**. It is the field's missing standard yardstick for spatial intelligence — open to any team, framework, or sensing modality. The RF medium is the v0 input and RuView donates the seed harness + a baseline entry, but the benchmark is independent and RuView is scored like every other entrant. The metric surface — pose, presence, tracking, occupancy/world-model, latency, determinism, and later privacy — is modality-agnostic, leaving room to grow to mmWave / UWB / radar / lidar / multimodal entrants and other projects. + +The leaderboard does **not** fork or re-implement the scoring logic. It is a thin orchestration + presentation layer over the published `wifi-densepose-cli` scorer, so the public number a model earns is identical to the number RuView uses internally to gate releases. **This makes the leaderboard governance, not marketing.** + +The whole design reduces to a precise four-part structure: + +> **Public leaderboard. Private evaluation split. Open scorer. Signed results.** + +- **Public leaderboard** — anyone can see the ranking and submit. +- **Private evaluation split** — the held-out data is never published; it cannot be trained on or overfit. +- **Open scorer** — the scoring code is the published `wifi-densepose-cli`; a stranger can rerun it locally on a public *smoke* split and reproduce the logic. +- **Signed results** — every score is an append-only, signed ledger row with a determinism proof hash; ranks cannot be silently edited. + +### 2.1 Name — DECIDED: `ruvnet/aether-arena` ("AA") + +**Locked.** Canonical repo + HF Space: **`ruvnet/aether-arena`**, branded **AetherArena** with the short form **"AA"**. + +- **"Aether"** = the classical all-pervading medium — fitting for RF/ambient spatial perception, and broader than "Ether"/CSI/WiFi so the benchmark can grow to mmWave, UWB, and multimodal spatial-intelligence entrants without a rename. +- **"Arena"** = open competitive entry. +- HF Space title: *AetherArena (AA) — the spatial-intelligence benchmark for RF perception.* +- `ruvnet/wifi-densepose-leaderboard` is kept only as a discoverability/topic alias that redirects to AA. + +(Rejected: `csi-arena` — jargon; `rf-bench` — generic/collision; `wifi-densepose-leaderboard` as the primary — ties the brand to one capability.) + +### 2.2 Architecture + +``` + Submitter ruvnet/aether-arena RuView harness + ───────── ────────────────── ────────────── + push model.safetensors ──► HF Space (Gradio): submit form ┌─ wifi-densepose-cli score + + model card (adapter, │ • validates manifest │ ├─ load model snapshot + input contract, license) │ • queues job ──► │ ├─ replay private MM-Fi/ + │ • runs scorer in container │ │ Wi-Pose split (PROOF_SEED) + │ • appends signed result │ ├─ ruview_metrics → RuViewTier + ▼ │ ├─ ablation.rs → p50/p95, + leaderboard.parquet ◄────────────────────┘ │ privacy-leakage, cross-room + (HF dataset, append-only, └─ emit result + SHA-256 proof + one signed row per submission) +``` + +1. **Submission contract.** A submitter pushes a model artifact (`model.safetensors` / `.rvf` / LoRA adapter) plus a `ruview-arena.toml` manifest declaring: input feature set (which ADR-145 `FeatureSet` it consumes — F0 CSI / F1 CIR / F2 Doppler / F3 BFLD), tensor I/O contract, license, and optional category (pose / presence / tracking / vitals / multi-task). +2. **Scoring.** The Space runs the **published `wifi-densepose-cli`** in a pinned container against a **private held-out split** of MM-Fi / Wi-Pose (and RuView's own paired-capture set per ADR-079). Output is the existing `RuViewAcceptanceResult` + the ADR-145 scalar set, plus the ADR-011 SHA-256 reproducibility hash. +3. **Ledger.** Each scored submission appends **one signed row** to an append-only HF dataset (`ruvnet/aether-arena-results`, Parquet): `{submitter, model_ref, category, feature_set, tier, pck20, oks, mota, vitals_bpm_err, latency_p50, latency_p95, privacy_leakage, cross_room_deg, proof_sha256, scored_at, harness_version}`. Append-only + signed = no silent edits. +4. **Presentation.** Gradio leaderboard with category tabs (Pose / Presence / Tracking / Vitals / Edge-latency / **Privacy**), `RuViewTier` badges, and a "privacy-respecting" filter (leakage ≤ threshold) — the differentiator no other WiFi benchmark has. + +### 2.2.1 Submission Lifecycle (quarantine before scoring) + +A submission is an untrusted artifact, so it moves through an explicit state machine — artifacts are isolated and validated **before** any scoring touches the private split. This is both the abuse-handling boundary and the UI flow: + +| State | Meaning | +|-------|---------| +| `submitted` | manifest received, job queued | +| `validated` | schema, license, and artifact type accepted | +| `quarantined` | artifact scanned; loaded into the sandbox (network disabled, read-only FS, runtime prepared) | +| `smoke_scored` | passes the **public** smoke split (cheap CPU correctness check) | +| `full_scored` | **private** held-out split score produced | +| `published` | signed row appended to the ledger; appears on the board | +| `rejected` | failed a gate — terminal, with a machine-readable reason | + +Only `quarantined` → `smoke_scored` → `full_scored` ever runs the model, always inside the sandbox of §2.4. A failure at any gate transitions to `rejected` with a reason rather than silently dropping. + +### 2.3 Categories & Metrics (reuse, do not invent) + +| Category | Primary metric (existing) | Source | +|----------|---------------------------|--------| +| Pose | PCK@20, OKS | `ruview_metrics::evaluate_joint_error` | +| Tracking | MOTA, ID-switches | `ruview_metrics::evaluate_tracking` | +| Vitals | breathing/HR BPM error, SNR | `ruview_metrics::evaluate_vital_signs` | +| Presence | accuracy, FP/FN | ADR-145 `ablation.rs` | +| Edge latency | p50 / p95 / p99 ms | ADR-145 `LatencyProfile` | +| **Privacy** | leakage score ∈ `[0,1]` (membership-inference) | ADR-145 §10 | +| Cross-room | degradation ratio | ADR-027 / ADR-145 | +| Overall | `RuViewTier` Bronze/Silver/Gold + `arena_score` (§2.5) | `determine_tier()` | + +### 2.3.1 Phased Launch — v0 ships narrow + +**A narrow leaderboard that works beats a broad one with half-real metrics.** v0 ranks only categories whose metric is fully implemented and reproducible-by-strangers today; the rest are visible as **"coming soon" / gated** and are **not ranked** until their metric is real. + +| Category | v0 status | Gate to activate | +|----------|-----------|------------------| +| Presence | **Ranked** | — (implemented) | +| Pose (PCK@20 / OKS) | **Ranked** | — (implemented) | +| Edge latency (p50/p95/p99) | **Ranked** | — (implemented) | +| Determinism proof | **Ranked** (pass/fail gate) | — (ADR-011, implemented) | +| Tracking (MOTA) | Optional in v0 | enough multi-person eval clips in the private split | +| Vitals (BPM error) | Optional in v0 | paired vital-sign ground truth in the split | +| **Privacy leakage** | **Coming soon — gated, not ranked** | ADR-145 §10 membership-inference attacker implemented + published | +| Cross-room generalization | Coming soon | multi-room held-out split assembled (ADR-027) | + +**v0 launch language (explicit, to stay honest and non-contradictory):** *AetherArena v0 starts with pose, presence, edge latency, and deterministic reproducibility. Tracking and vitals are activated when sufficient ground-truth clips are available. Privacy-leakage and cross-room generalization remain gated until their evaluation attacks and splits are implemented and published.* Shipping a "privacy leaderboard" claim before the attacker exists would be an easy and deserved attack on our credibility. + +### 2.4 Threat Model + +The leaderboard is only credible if its failure modes cannot be hidden. Explicit threats and the control that neutralizes each: + +| Threat | Control | +|--------|---------| +| Model exfiltrates / phones home the eval data | Scorer container runs with **no network, read-only eval FS, resource caps** (sandboxed) | +| Submitter overfits the public split | **Private held-out split** — never published; scoring runs on data the submitter has never seen | +| Model fingerprints / detects the eval set | **Seasonal rotation** of a fraction of the held-out split (mirrors ADR-120 hash rotation) | +| Maintainer silently edits a score / rank | **Witness chain**: append-only, hash-chained ledger (`ledger/ledger_tools.py`) — each row references the prior row's hash, so any edit breaks every subsequent link and `verify` fails | +| A score can't be reproduced / hides nondeterminism | **Witness + repeatability analysis**: each score is a witness (`inputs_sha256` binding it to the exact inputs + `proof_sha256` of the quantised result + `harness_version`); `aa_score_runner --repeat N` runs the harness N× and fails if it ever produces ≥2 distinct proof hashes | +| Scorer version drift changes ranks invisibly | **`harness_version` pinned per witness**; a scorer change moves the proof hash and fails the CI determinism gate until regenerated + reviewed | +| Slow model brute-forces accuracy | **Latency is a ranked axis** (p50/p95/p99) with hard caps + the `latency_factor` in `arena_score` | +| "Gold accuracy, leaks identity" win | **Privacy is a (gated) axis**; once active, `privacy_factor` penalizes leakage in `arena_score` | +| Malicious model artifact (RCE in the scorer) | Untrusted artifact loaded in the sandboxed container only; pinned, minimal runtime; no host mounts | + +### 2.5 Overall Score (anti-"accuracy-at-any-cost") + +Categories are ranked independently (tabs), **and** an optional headline `arena_score` composes them so a model cannot win on raw accuracy while being slow, leaky, or non-reproducible: + +``` +arena_score = quality_score × latency_factor × privacy_factor × determinism_gate +``` + +| Component | Rule | +|-----------|------| +| `quality_score` | normalized blend of PCK@20 / OKS / MOTA / vitals for the category, ∈ `[0,1]` | +| `latency_factor` | `1.0` if p95 ≤ target; decays smoothly above target (edge viability) | +| `privacy_factor` | `1.0 − privacy_leakage` once the Privacy axis is active; **fixed at `1.0` in v0** (privacy gated/unranked) | +| `determinism_gate` | `1.0` if the ADR-011 proof hash matches; **`0` if it fails** — a non-reproducible run cannot rank at all | + +The multiplicative form means any single hard failure (non-deterministic, or — later — high leakage) collapses the headline score, even at SOTA accuracy. In v0, `privacy_factor` is pinned to `1.0` so the headline number is honest about what is actually measured. + +**`arena_score` is a gate, not the only headline.** Multiplicative composites are great for gating but can hide *why* a model lost, and invite "your formula is biased" arguments. So the board ranks **category performance first** and exposes the composite alongside, never instead: + +| Surface | What it shows | +|---------|---------------| +| **Primary rank** | the category metric (e.g. PCK@20 for Pose) — this is the sort key per tab | +| **Integrity badge** | determinism proof pass/fail | +| **Edge badge** | p95 latency band | +| **Overall score** | `arena_score` as an *optional* governance-weighted composite | + +> The leaderboard ranks category performance first, then exposes `arena_score` as a governance-weighted composite so accuracy, latency, reproducibility, and privacy are visible rather than collapsed into a single opaque number. + +### 2.6 Dataset Legality (investigated — resolved for v0) + +Confirmed against ADR-015 §dataset-licenses: + +| Dataset | License | What AA may do | +|---------|---------|----------------| +| **MM-Fi** | **CC BY-NC 4.0** | ✅ v0 eval source. Non-commercial use + derivatives **permitted with attribution**. AA may host *derived* CSI features and scores; raw frames stay in the private split. AA must be operated **non-commercially** and carry MM-Fi attribution. | +| **Wi-Pose** | **"Research use"** (no clean redistribution grant) | ⚠️ **Not hosted.** Pulled privately into the scorer only, never redistributed; or deferred until terms are clarified with the authors. **Dropped from v0.** | +| Person-in-WiFi-3D | semi-public access | Future candidate (post-v0), pending access terms. | + +**v0 decision:** evaluate on a **private MM-Fi held-out split only** (CC BY-NC, attributed, non-commercial; expose only license-permitted derived features). Wi-Pose is removed from v0 and revisited if/when redistribution is cleared. This keeps the existential "can we even host this" risk at zero for launch. + +> **Non-commercial caveat to watch:** CC BY-NC means AA itself, and the eval-data use, must remain non-commercial. Because AA also showcases the (commercial) RuView appliance, keep AA legally distinct and non-commercial, or seek an MM-Fi commercial grant before any paid tier. Flagged for the maintainer. + +### 2.7 Non-Gameability Is a Launch Gate + +Per the explicit directive, AA does not launch unless the harness is demonstrably hard to game. The controls (private split §2.4, seasonal rotation §2.4, model-not-prediction submission §2.2, sandbox §2.4, pinned `harness_version` §2.4, signed append-only ledger §2.3-§2.4, multiplicative `arena_score` §2.5, `determinism_gate=0` on proof-hash failure §2.5) are **not optional hardening — they are acceptance criteria** (see §7). A v0 that can be topped by overfitting a public split, a non-reproducible run, or a silently edited row is, by definition, not ready. + +### 2.8 Neutrality & Governance (because it's "official" and cross-project) + +The hardest credibility problem for an *official* benchmark seeded by one entrant: **"RuView built the scorer, so of course RuView wins."** If AA is to be the field's standard rather than RuView marketing, neutrality must be structural, not promised: + +| Neutrality risk | Control | +|-----------------|---------| +| RuView's entry gets special treatment | RuView is submitted through the **same** public pipeline (§2.2.1) and scored by the **same** pinned scorer as everyone else; its rows carry the same proof hash and are independently re-runnable on the smoke split. | +| RuView tunes the metric to favor its models | The scorer is **open and versioned**; any metric change is a public `harness_version` bump that **re-scores all entries**, not just new ones. Metric changes go through a public changelog. | +| "Official" is self-declared | AA is positioned as a **neutral commons**: separate repo/Space identity, contribution guide, and an explicit invitation for other projects + dataset authors to co-own splits and metrics. RuView is the *donor of the seed harness*, not the owner of the standard. | +| Benchmark used as RuView ad | Keep AA legally + brand-distinct (ties into the CC BY-NC non-commercial caveat, §2.6); the README leads with the standard, not the product. | +| Single-vendor capture | Roadmap to a multi-org steering/eval committee once ≥N external projects enter; split rotation + metric proposals are public. | + +The test for neutrality is the same as §7's acceptance test: a stranger from *another project* can submit, reproduce the score, and see that RuView's own entries were scored by the identical, open, pinned path. + +--- + +## 3. Consequences + +### 3.1 Positive +- A real, comparable public number for RuView (and everyone else) on MM-Fi / Wi-Pose, scored by a privacy- and latency-aware harness no other WiFi benchmark offers. +- Community flywheel: external models/adapters get ranked, feeding `ruvnet/wifi-densepose-pretrained`. +- Forces the harness to be reproducible-by-strangers, which strengthens internal release gating too. + +### 3.2 Negative / Costs +- **New repo + HF Space to maintain**, incl. a scoring container and queue. Ongoing compute cost (mitigate: CPU smoke-score on submit, batched GPU full-score on a schedule). +- **Dataset licensing** must be cleared for hosting derived MM-Fi / Wi-Pose features (ADR-015 owns this; may require contacting dataset authors). +- **Abuse surface** (malicious model artifacts run in the scorer) — must sandbox the container (no network, read-only eval data, resource caps). + +### 3.3 Neutral +- The scoring logic stays in `wifi-densepose-train`/`-cli`; the leaderboard is presentation only, so it does not bloat the core workspace. + +--- + +## 4. Alternatives Considered + +1. **Submit RuView to existing venues only (MM-Fi GitHub, Papers-with-Code).** Lower effort, but no privacy/latency axes, no live entry, and RuView doesn't own the standard. *Complementary, not exclusive — we should still post MM-Fi numbers.* +2. **A static numbers page in the RuView README.** Zero infra, but not multi-entrant and not a leaderboard. +3. **EvalAI / Kaggle competition.** Stronger anti-gaming infra, but heavyweight, time-boxed, and off-brand vs an always-open HF Space next to the model. + +--- + +## 5. Open Questions + +1. **Eval data hosting** — can we redistribute derived MM-Fi / Wi-Pose CSI features under their licenses, or must scoring pull the raw datasets the submitter cannot see? (Owner: ADR-015 follow-up.) +2. **Compute budget** — free HF CPU Space, ZeroGPU, or a self-hosted scorer on the GCloud A100/L4 fleet (`cognitum-20260110`)? +3. **Name lock** — confirm `aether-arena` vs `wifi-densepose-leaderboard`. +4. **Season cadence** — does the held-out split rotate monthly, and do we keep an all-time + per-season board? +5. **Privacy-leakage attack** — ship the membership-inference attacker (ADR-145 §10 is currently a *defined-but-unimplemented* metric) before launch, or launch with privacy as a "coming soon" axis? + +--- + +## 6. Implementation Sketch (if accepted) + +- **P1** — Stand up `ruvnet/aether-arena` repo + skeleton Gradio HF Space; define `ruview-arena.toml` submission contract; publish a **public smoke split** a stranger can score locally. +- **P2** — Containerize `wifi-densepose-cli score` as the pinned, sandboxed scorer (no network, read-only FS, caps); wire the signed append-only Parquet ledger + `determinism_gate`. +- **P3 — v0 LAUNCH (narrow).** Clear + load the private MM-Fi / Wi-Pose held-out split; activate **Presence, Pose, Edge-latency, Determinism** categories; seed the board with RuView's own `wifi-densepose-pretrained` baseline (honest current PCK@20). Tracking/Vitals optional. Privacy + Cross-room shown as **gated / coming soon**. +- **P4** — *(post-launch, gated)* Implement the ADR-145 §10 privacy-leakage membership-inference attacker; only then activate + rank the **Privacy** category and switch `privacy_factor` on in `arena_score`. +- **P5** — Assemble the multi-room split → activate **Cross-room**. Submit RuView's MM-Fi number to Papers-with-Code in parallel (alternative #1). + +## 7. Acceptance Test (definition of done for v0) + +v0 launches **only when a stranger can:** + +1. **Submit** a model (artifact + `ruview-arena.toml`) through the Space with no insider help, +2. **Get a deterministic score** back (same model + same harness version → same numbers), +3. **See the signed row** appended to the public results ledger, +4. **Rerun the scorer locally** on the public *smoke* split and reproduce the logic, and +5. **Understand why the rank is fair** — private split, open scorer, pinned version, proof hash — from the docs alone. + +If any of these five fails, v0 is not ready. + +## 8. Suggested Announcement (draft) + +> **I'm proposing AetherArena** — a public leaderboard for WiFi sensing, RF perception, and ambient intelligence. +> +> The problem with this field is not just model quality. It is *measurement* quality. Most WiFi-sensing work reports numbers against datasets with inconsistent splits, inconsistent metrics, and almost no accounting for latency, privacy leakage, reproducibility, or edge viability. +> +> AetherArena fixes that. Models are submitted, scored in a pinned sandboxed container against **private** held-out MM-Fi and Wi-Pose splits, and written to a **signed append-only** results ledger. The scoring engine reuses the same RuView harness we use internally: pose, presence, tracking, vitals, latency, cross-room degradation, deterministic proof hashes — and, once its attacker ships, privacy leakage. +> +> The goal is not to make RuView look good. The goal is to make the *category* measurable. If ambient intelligence is going to move from demos to infrastructure, it needs public numbers, reproducible commands, private eval splits, and failure modes that cannot be hidden. + +### Strategic note — three layers of the credibility story + +| Layer | Asset | +|-------|-------| +| Retrieval credibility | ruflo BEIR harness | +| Sensing credibility | **AetherArena (this ADR)** | +| Product credibility | RuView appliance + Arista-style deployments | diff --git a/docs/adr/ADR-150-rf-foundation-encoder.md b/docs/adr/ADR-150-rf-foundation-encoder.md new file mode 100644 index 00000000..bb80ab25 --- /dev/null +++ b/docs/adr/ADR-150-rf-foundation-encoder.md @@ -0,0 +1,260 @@ +# ADR-150: RuView RF Foundation Encoder — pose-preserving, subject/room/device-invariant CSI embedding + +| Field | Value | +|-------|-------| +| **Status** | Proposed | +| **Date** | 2026-05-30 | +| **Deciders** | ruv | +| **Codebase target** | New `wifi-densepose-rfencoder` (or `nn/src/rf_foundation.rs`) + training in `wifi-densepose-train`; consumed by the MM-Fi pose head and the AetherArena Generalization Track (ADR-149) | +| **Relates to** | ADR-024 (Contrastive CSI Embedding / AETHER), ADR-027 (Cross-Environment Domain Generalization / MERIDIAN), ADR-134 (CIR), ADR-135 (calibration + coherence gate), ADR-145 (Ablation/Eval Harness), ADR-149 (AetherArena benchmark) | + +--- + +## 1. Context + +AetherArena now has a published, metric- and protocol-matched MM-Fi result: **81.63% torso-PCK@20 in-domain (random_split), exceeding MultiFormer's 72.25%** ([#876](https://github.com/ruvnet/RuView/issues/876)). But the **leakage-free cross-subject** number collapses to **~11.6% torso-PCK** (27% under the looser bbox metric). That gap is the real deployment frontier — homes, elder care, festivals, unseen bodies. + +Naïve fixes already tested and **failed**: a subject-adversarial (DANN) embedding did not move cross-subject (baseline 27.26% → DANN 27.54% bbox; torso 11.57%). Bigger capacity *hurt* (transformer cross-subject 24.8% < conv 27.3%) — extra parameters overfit seen subjects. + +**Conclusion:** a *generic* "better feature vector" will not help. The lever is an embedding trained for the **right invariance** — one that preserves pose while removing subject, room, and device signatures, and that *exposes* channel instability rather than hiding it. + +### 1.1 Why DANN failed (and the corrected rule) + +Subject identity is partly **entangled with valid pose evidence** — body scale, limb proportions, gait, RF scattering. Blindly erasing subject info also erases information the pose decoder needs. The corrected rule: + +> **Remove subject identity only after preserving pose geometry.** Supervised *pose-contrast across subjects* beats naïve adversarial identity removal. + +The frontier objective is **not** `same-subject = positive`. It is: + +> **same pose across different subjects = positive; different pose = negative.** + +## 2. Decision + +**Build the RuView RF Foundation Encoder: a self-supervised, pose-preserving, subject/room/device-invariant RF representation for CSI (extensible to CIR, ADR-134, and BFLD).** Positioned as a **platform primitive**, not a benchmark trick. + +### 2.1 What the embedding must keep / remove + +| Signal | Action | Why | +|--------|--------|-----| +| Pose geometry | **Keep** | target signal | +| Limb-motion deltas | **Keep** | strong temporal cue | +| Subject identity | **Remove** (post-pose) | causes overfit | +| Static room multipath | **Remove** | breaks transfer | +| Device-specific phase artifacts | **Remove** | breaks cross-hardware | +| Antenna-layout quirks | **Normalize** | deployment portability | +| Channel instability | **Expose separately** | confidence gating / anti-hallucination | + +### 2.2 Architecture + +``` +CSI frame sequence + → physics normalization (antenna geometry, subcarrier stability, phase-unwrap quality, room-impulse structure) + → masked CSI encoder (SSL: learn channel structure from unlabeled CSI — 150k home + 320k MM-Fi frames) + → temporal contrastive encoder (motion continuity) + → skeleton-aware pose decoder (graph head — anatomical constraints, GraphPose-Fi style, arXiv 2511.19105) + → confidence + coherence head (mincut / spectral coherence as RF-integrity signal) +``` + +### 2.3 Training objectives (loss stack) + +``` +L_total = L_pose + + 0.20 · L_masked_csi # learn channel structure (unlabeled) + + 0.10 · L_temporal_contrast # motion continuity + + 0.20 · L_pose_contrast # same-pose-across-subjects = positive ← the frontier + + 0.05 · L_subject_decorrelation # remove identity only where it conflicts with pose + + 0.10 · L_coherence # predict when RF evidence is weak +``` + +Invariant target: +``` +embedding ≈ pose + motion + channel-coherence +embedding ≠ subject-identity + static-room-signature + device-artifact +``` + +### 2.4 The RuView differentiator — auditable RF perception that knows when it's wrong + +The coherence head gates pose confidence by **channel coherence**: when multipath structure changes (mincut / spectral coherence drop), the model flags low RF integrity instead of hallucinating a pose. This is the **anti-hallucination** component most WiFi-pose papers lack, and it turns RuView from a model into sensing infrastructure. (Ties to ADR-135 coherence gate.) + +## 3. Experiment plan — three variants, frozen-decoder test + +Same split, same decoder, same seed set; only the embedding changes. + +| Variant | Description | Success threshold (cross-subject torso-PCK) | +|---------|-------------|----------------------------------------------| +| **E1** | Masked CSI pretrain | **+3** | +| **E2** | Pose-contrastive across subjects | **+6** | +| **E3** | Physics-normalized SSL + skeleton head | **+10** | + +### 3.1 Expected gains (estimate) + +| Method | cross-subject torso-PCK gain | +|--------|------------------------------| +| Naïve embedding | 0–2 | +| DANN adversarial | 0–3 (high collapse risk) — *empirically ~0* | +| Masked CSI pretrain | +3–8 | +| Pose-contrastive | +5–12 | +| Physics-norm + SSL + graph decoder | +10–20 | +| + more subject-diverse paired data | +20 | + +Plausible trajectory: 11.6% → **20–25% near term**, **30–40% with enough subject/environment diversity**. That is a stronger research claim than squeezing random-split from 81.6% → 88%. + +### 3.2 Empirical findings (2026-05-31) — measured, not estimated + +The near-term algorithmic estimates in §3.1 were **tested directly on the official MM-Fi +cross-subject split** (256,608 train / 64,152 test, same TF pipeline). Measured results: + +| Method | §3.1 estimate | **Measured** | Verdict | +|--------|--------------:|-------------:|---------| +| Baseline (in-harness) | — | 63.13% (doc TTA 64.04) | reference | +| Mixup | n/a | **+0.7** → 63.79% | ✅ small | +| Mixup + TTA + 3-seed ensemble | n/a | **+0.9** → **64.92%** | ✅ **best** | +| Per-antenna instance-norm + SpecAugment | n/a | **−4.6** → 58.52% | ❌ destroys cross-antenna pose structure | +| **Pose-contrastive foundation pretrain** | **+5 to +12** | **−2.3** → 62.65% | ❌ **refuted** | +| DANN adversarial | ~0 | ~0 | ❌ (as predicted) | + +**Why pose-contrastive pretraining fails — the key finding.** The supervised-contrastive +pretraining loss (positives = same pose-cluster, spanning subjects) **never left the +uniform-similarity floor `ln(B)`** — across cluster granularities K∈{48,256}, batch sizes +{768,1024}, and 3 seeds. The same encoder trivially aligns *temporally-adjacent* frames +(temporal-triplet SSL reached 82%), so the optimizer works; it simply **cannot pull same-pose +CSI from different subjects together — that invariance is not present in the data to be learned.** + +**Implication for this ADR.** The 18-pt in-domain↔cross-subject gap (83.6% → best 64.9%) is +**fundamental subject-distribution shift in CSI, not an algorithmic gap.** No invariance-learning +method tested moves it; only variance-reduction (mixup + ensemble) gives <1 pt. This **promotes +"more subject-diverse paired data" (§3.1 last row, §6 alt 3) from complementary to the *primary* +lever** and **demotes pure-SSL-on-existing-data** as a near-term cross-subject win. The encoder is +still worth building for masked-CSI representation reuse and the coherence integrity head, but the +cross-subject acceptance gate (§4, ≥6 pts) is **unlikely to be met without new multi-subject +capture** (fleet: `cognitum-seed-1` + multi-room, see `CLAUDE.local.md`). Recommend re-scoping +phase 1 around data collection before further loss-stack engineering. + +### 3.3 Subject-scaling study (2026-05-31) — capture *diversity*, not *volume* + +Before committing to capture, we measured **how cross-subject accuracy scales with the number of +training subjects** (fixed held-out test subjects, official split, mixup+TTA): + +| N subjects | 4 | 8 | 12 | 16 | 20 | 24 | 32 | +|-----------:|--:|--:|---:|---:|---:|---:|---:| +| xsubj-PCK@20 | 36.7 | 57.7 | 58.3 | 61.1 | 62.7 | 63.3 | **63.7** | + +The curve **saturates**: 4→8 subjects = **+21 pts**, but 24→32 = **+0.45 pts**. Asymptote ≈ 64–65%, +still ~19 pts under in-domain. **Key correction to the "more data" recommendation:** simply capturing +*more people from the same distribution* will **not** close the gap — subject-count returns vanish +past ~16–20 subjects. The residual is **device/room/protocol shift** (MM-Fi's cross-subject split is +partly cross-environment by construction). **Re-scoped phase-1 capture target: maximize DIVERSITY +(rooms, devices, antenna geometries, traffic protocols), not headcount** — and pair it with few-shot +target-domain adaptation (a handful of labeled frames from the deployment room), which the saturation +curve implies will beat any amount of additional source subjects. This makes the encoder's +*domain-invariance* objective (vs the failed subject-invariance one) the design priority. + +### 3.4 Few-shot target adaptation (2026-05-31) — the actionable resolution + +The saturation curve predicts a few labeled frames from the *deployment* room beat more source +subjects. Confirmed. Base trained on all 32 source subjects (63.7% zero-shot on a disjoint 50% +held-out of the target subjects), then fine-tuned on K labeled frames per target subject: + +| K/subject | total frames | eval PCK@20 | Δ | +|----------:|-------------:|------------:|--:| +| 0 | 0 | 63.7% | — | +| 20 | 160 | 68.1% | +4.3 | +| **50** | **400** | **72.2%** | **+8.5 (≈ prior SOTA)** | +| 200 | 1,600 | 76.1% | +12.4 | +| 1000 | 8,000 | 78.3% | +14.6 | + +**Few-shot calibration dominates source volume.** §3.3 showed +24 source subjects (~190K frames) +buys +6 pts; here **200 target frames/subject (1,600 frames) buys +12.4 pts**. This **re-scopes the +ADR's acceptance gate and deployment story**: the cross-subject gate (§4, ≥6 pts) is *trivially* met +by ~50–200 labeled frames of in-room calibration — no foundation encoder or mass capture required for +the deployment win. **Recommended product behavior:** ship a **~30-second on-site calibration** (a few +hundred labeled frames per room/person) that recovers most of the gap. The foundation encoder's value +shifts from "close cross-subject zero-shot" (data says: hard) to "make the few-shot adaptation faster / +need fewer calibration frames" — a better-posed, achievable objective. **This supersedes the §3.2 +pessimism: the frontier is not closed by algorithms or bulk data, but it *is* cheaply closed at +deployment time by few-shot calibration.** + +> **Task-general (2026-05-31).** The same mechanism was verified on a *second* MM-Fi task — +> 27-class **action recognition** (which the MM-Fi paper never benchmarked for WiFi). Zero-shot +> cross-subject collapses to ~10% (near-chance), and few-shot calibration recovers it: 50 samples → +> 36%, 200 → 59%, 1000 → 76%. Action needs more calibration than pose (classification vs regression), +> but the pattern is identical. **Few-shot in-room calibration is the universal deployment answer for +> WiFi sensing generalization, not a pose-specific result.** (Optimization report §36.) + +### 3.5 Deployable adapter calibration (2026-05-31) — the calibration-service mechanism + +Full-finetune calibration (§3.4) means a 2.3 MB model copy per room. Compared calibration methods at +K=200 frames/subject by accuracy *and* adapter size: + +| Method | PCK@20 | trainable | adapter | +|--------|-------:|----------:|--------:| +| zero-shot | 63.6% | — | — | +| **LoRA rank-8** | **72.5%** | 11,200 | **~11 KB** | +| head+graph only | 72.7% | 121,828 | 119 KB | +| frozen-trunk | 73.5% | 212,453 | 207 KB | +| full finetune | 76.2% | 2.32 M | 2.3 MB | + +**A ~11 KB LoRA adapter recovers +8.9 pts (→72.5%, ≈ prior SOTA) at 0.5 % the model size.** This is +the concrete mechanism for the **RuView calibration service** the project wanted: ship the shared +base once; each room contributes a 30-second labeled calibration → a **~11 KB per-room LoRA adapter** +→ SOTA-level cross-subject pose, thousands of rooms on one base. Accuracy/size knob: +LoRA 11 KB @ 72.5 % → frozen-trunk 207 KB @ 73.5 % → full 2.3 MB @ 76.2 %. **Net for this ADR:** the +encoder/adapter split is validated empirically — a frozen shared trunk + tiny per-room LoRA is the +deployable path, and the foundation-encoder objective should be "make this adapter even smaller / +need fewer calibration frames." + +**Calibration data requirement (measured, 3 seeds):** the 11 KB LoRA needs **~100–200 labeled +samples/room** to reach ~72% (knee at ~50 → 70%); below ~20 samples it can't fit and may *hurt* +(5 samples → 61% < zero-shot 64%). So the evidence-complete **calibration-service spec** is: +ship shared base → collect **~100–200 labeled samples on-site** → fit a **~11 KB LoRA** → +**~72% cross-subject** (SOTA-level). The encoder's research goal is now precisely posed: push that +~100–200-sample requirement down and/or lift the >72% ceiling per fixed calibration budget. + +### 3.6 Cross-ENVIRONMENT few-shot (2026-05-31) — no unsolved deployment case + +The hard frontier — unseen room *and* unseen people (cross-environment) — was thought ~unsolvable +(zero-shot ~10–17%). Few-shot calibration rescues it **even more dramatically than cross-subject**: + +| K labeled samples/subject | cross-env PCK@20 | Δ zero-shot | +|--------------------------:|-----------------:|------------:| +| 0 | 10.6% | — | +| **5** | **60.1%** | **+49.5** | +| 20 | 66.0% | +55.5 | +| 50 | 70.0% | +59.4 | +| 200 | 73.1% | +62.5 | +| 1000 | 75.4% | +64.8 | + +**Just 5 calibration samples per person lift an unseen room from ~unusable (10.6%) to 60%.** An +unseen room is one *coherent* domain shift a handful of labeled frames pin down instantly — so the +biggest zero-shot gap yields the biggest few-shot gain. **Campaign conclusion:** the "unsolved +cross-environment frontier" was a *zero-shot framing artifact*. With the ~11 KB LoRA calibration +mechanism (§3.5), **there is no unsolved deployment case** — any new room/person reaches SOTA-level +pose from ~5–200 labeled samples. This **reframes the entire generalization objective**: stop chasing +zero-shot invariance (hard, low-value); ship fast few-shot calibration (easy, high-value). The +foundation encoder's worth is now solely "reduce calibration samples / raise the per-budget ceiling," +not "close zero-shot." Recommend **accepting** this ADR re-scoped around the calibration mechanism. + +## 4. Acceptance Test + +The encoder is accepted **only if it improves cross-subject torso-PCK@20 by ≥ 6 absolute points without reducing random-split torso-PCK@20 by more than 2 points** — on the same MM-Fi pipeline, one-command reproduction, with per-joint error tables. Results land as AetherArena witness rows (ADR-149), nothing published until reviewed. + +## 5. Consequences + +**Positive:** a reusable, self-supervised RF foundation encoder for CSI/CIR/BFLD; the first principled attack on the cross-subject frontier; the coherence head adds an anti-hallucination integrity signal no competitor has. + +**Negative / risk:** SSL pretraining requires matching the production CSI→feature pipeline (ADR-149 §SSL note flagged the resampling-replication risk); the multi-loss stack needs careful weight tuning (DANN showed loss-imbalance can collapse training); physics normalization must be validated not to discard pose-relevant deltas. + +**Neutral:** the in-domain head is unchanged; the encoder slots in front of the existing pose decoder. + +## 6. Alternatives Considered + +1. **Bigger model only** — tested; *hurts* cross-subject (overfits seen subjects). +2. **Naïve DANN subject-adversarial** — tested; no gain, collapse risk; entangles pose evidence. +3. **More data only (camera/ADR-079)** — complementary and ultimately necessary, but slow and out-of-band; the encoder extracts more from existing data first. + +## 7. Open Questions + +1. Physics-normalization spec — exact antenna/subcarrier/phase terms, validated to preserve pose deltas. +2. Masked-CSI SSL on the production feature pipeline (resampling match — see ADR-149). +3. Where the coherence/mincut integrity signal is computed (reuse ADR-135 coherence gate vs new head). +4. CIR (ADR-134) / BFLD fusion into the same encoder — phase 3. diff --git a/docs/benchmarks/homecore-vs-home-assistant.md b/docs/benchmarks/homecore-vs-home-assistant.md new file mode 100644 index 00000000..3176a4c4 --- /dev/null +++ b/docs/benchmarks/homecore-vs-home-assistant.md @@ -0,0 +1,98 @@ +# RuView HOMECORE vs Home Assistant — Performance & Capability Benchmark + +**Measured:** 2026-05-31 · Windows 11, Docker Desktop 28.5.1 (WSL2 Linux engine) · single host. +**Reproduce:** `python aether-arena/staging/run_homecore_bench.py` and `python aether-arena/staging/run_ha_bench.py`. + +HOMECORE is RuView's **wire-compatible Rust port of Home Assistant's core** (ADR-125…ADR-134): the +same `/api` REST + WebSocket surface, the same SQLite recorder schema, an automation engine, a +HomeKit bridge, a WASM plugin runtime, and a voice/assist pipeline — plus **native WiFi/RF sensing +entities** (presence, breathing, heart-rate, pose) that Home Assistant can only get through external +add-ons. Because the API is wire-compatible, the two can be measured head-to-head on the same client. + +> **Read this honestly.** HOMECORE (`0.1.0-alpha`) is a young, focused core; Home Assistant is a +> mature platform with ~3,000 integrations and a decade of ecosystem. HOMECORE's thesis is **not** +> "more features" — it is **the same control plane at 1/35th the memory and 18× the startup speed, +> with RF sensing built in.** The numbers below quantify exactly that trade. + +## Performance (measured) + +| Metric | RuView HOMECORE `0.1.0-alpha` | Home Assistant `stable` | Advantage | +|--------|------------------------------:|------------------------:|-----------| +| **Cold start → API/web ready** | **0.55 s** | 9.72 s | **18× faster** | +| **Idle resident memory (RSS)** | **10.1 MB** | 359 MB | **35× leaner** | +| **Distribution size** | **4.7 MB** (single static binary) | 610 MB (container image) | **130× smaller** | +| **Idle CPU** | 0.0 % | 0.0 % | tie | +| **REST latency p50** | 2.13 ms | 2.95 ms | comparable¹ | +| **REST latency p95** | 22.9 ms | 27.3 ms | comparable¹ | +| **REST latency p99** | 26.2 ms | 28.3 ms | comparable¹ | +| **REST throughput (1 conn, sequential)** | **1,599 req/s** | 716 req/s | **2.2×** | +| **Recorder DB after boot** | 36.9 KB | 4.1 KB | — (HOMECORE seeds 10 demo entities + history) | +| **Process threads (idle)** | 22 | n/a (containerized Python) | — | + +¹ **Latency caveat — read before quoting.** The two latency rows are *not* the same endpoint. +HOMECORE is measured on **authenticated `/api/states`** (returns 10 live entities). Home Assistant's +`/api/*` requires a completed onboarding flow + long-lived access token, so HA is measured on the +**unauthenticated `/manifest.json`** served by the same aiohttp stack. Both are single-connection, +300-sample, sequential. Treat latency as "same order of magnitude"; treat **memory, startup, and +size as the decisive, apples-to-apples results.** Throughput is endpoint-confounded the same way — +the 2.2× is directional, not a controlled isolate. + +### What the deltas mean in practice +- **10 MB vs 359 MB RSS:** HOMECORE runs comfortably on a Pi Zero 2 W or an ESP32-class gateway + alongside the sensing pipeline; HA effectively needs a Pi 4/5 or x86 to itself. +- **0.55 s vs 9.7 s start:** HOMECORE can be cold-started per-request or restarted on config change + without a noticeable outage; HA's ~10 s boot (longer with real integrations) makes it a + long-lived daemon only. +- **4.7 MB vs 610 MB:** OTA-updating the whole control plane over a metered/rural link is trivial + for HOMECORE; HA ships as a ~250 MB compressed image. + +## Capability & feature comparison + +| Capability | RuView HOMECORE | Home Assistant | +|-----------|-----------------|----------------| +| HA-compatible REST `/api` | ✅ wire-compatible subset (ADR-130) | ✅ reference implementation | +| HA-compatible WebSocket API | ✅ (ADR-130) | ✅ | +| State machine + event bus + service registry | ✅ 13 seeded services (ADR-127) | ✅ | +| SQLite recorder (history) | ✅ HA-compat schema **+ ruvector semantic search** (ADR-132) | ✅ (no vector search) | +| Automation engine + Jinja templates | ✅ MiniJinja trigger/condition/action (ADR-129) | ✅ (full Jinja2) | +| HomeKit (Apple Home) bridge | ✅ scaffold (ADR-125) | ✅ mature | +| Plugin/integration runtime | ✅ **sandboxed WASM** plugins (ADR-128) | ✅ Python integrations (in-process, unsandboxed) | +| Voice / intent / "Assist" | ✅ 5 built-in intents **+ ruflo agent bridge** (ADR-133) | ✅ Assist + LLM agents | +| Migration from existing HA | ✅ reads HA `.storage/` + `automations.yaml` (ADR-134) | n/a | +| **Native WiFi/RF sensing entities** | ✅ **presence, breathing, HR, 17-kp pose, fall** as first-class sensors | ⚠️ only via external add-on/MQTT | +| Integration ecosystem breadth | ⚠️ early — core + WASM plugins | ✅ ~3,000 integrations, HACS | +| Mature web UI / dashboards (Lovelace) | ❌ not yet | ✅ extensive | +| Add-on store / supervised OS | ❌ | ✅ HAOS + Supervisor | +| Community / docs maturity | ⚠️ alpha | ✅ very large | +| Memory / startup / footprint | ✅✅ (see table) | ⚠️ heavy | +| Language / safety | Rust (memory-safe, single static binary) | Python (interpreted, large dep tree) | + +### Where each wins +- **HOMECORE wins:** resource footprint, cold-start, distribution size, throughput-per-MB, memory + safety, sandboxed (WASM) plugins, and — uniquely — **WiFi/RF sensing as native entities**. Ideal + for edge gateways, battery/solar nodes, and shipping the control plane *with* the sensor. +- **Home Assistant wins:** integration breadth, UI/dashboard maturity, add-on ecosystem, community + support, and production track record. Ideal as a full-house hub on a Pi 4/5+ or x86. + +## Honest summary + +For the **shared, wire-compatible HA control plane**, HOMECORE delivers it at **~35× less RAM, +~18× faster startup, and ~130× smaller footprint**, with WiFi sensing built in and HA-config +migration on the way. What it does **not** yet match is Home Assistant's enormous integration +catalog and UI maturity. The right read is **"HA-compatible core, edge-class resource budget, +RF-native"** — not "HA replacement." For a sensing node that also needs to *be* a smart-home hub, +HOMECORE's efficiency is decisive; for a feature-complete whole-home hub today, Home Assistant +remains the broader platform. + +## Reproduction & method + +- **HOMECORE:** `v2/target/release/homecore-server.exe` (`0.1.0-alpha.0`), bound to `127.0.0.1:8124`, + SQLite file recorder, dev-token auth (`Authorization: Bearer …`). Startup = `Popen` → first `200` + on `/api/`. RSS/CPU via `psutil` after a 2 s settle. 300-sample sequential latency on `/api/states`. +- **Home Assistant:** `ghcr.io/home-assistant/home-assistant:stable` in Docker, `-p 8125:8123`, + fresh `/config`. Startup = container start → first `<500` on `/manifest.json`. RSS/CPU via + `docker stats --no-stream` after a 20 s settle. 300-sample sequential latency on `/manifest.json`. +- Both runs are single-host, single-connection, no concurrency tuning. Numbers are indicative of + the **resource/startup class**, which is the property that differs by orders of magnitude; + latency/throughput are reported with the endpoint caveat above and should not be over-read. +- Harness scripts: `aether-arena/staging/run_homecore_bench.py`, `aether-arena/staging/run_ha_bench.py`. diff --git a/docs/benchmarks/mmfi-wifi-sensing-study.md b/docs/benchmarks/mmfi-wifi-sensing-study.md new file mode 100644 index 00000000..55b1b6fa --- /dev/null +++ b/docs/benchmarks/mmfi-wifi-sensing-study.md @@ -0,0 +1,166 @@ +# WiFi-CSI Sensing on MM-Fi — a complete, honest study + +**Scope:** what works, what doesn't, and what actually ships — for 2D human **pose** and **action +recognition** from WiFi Channel State Information on the public [MM-Fi](https://github.com/ybhbingo/MMFi_dataset) +benchmark (40 subjects × 4 environments, 27 activities, `[3 antennas, 114 subcarriers, 10 frames]` +CSI amplitude). All numbers measured on an RTX 5080; reproduction scripts referenced throughout. + +> **One-line takeaway:** we beat published pose SOTA *and* shrank it to a 20 KB edge model, but the +> deeper result is that **WiFi sensing doesn't generalize zero-shot to new people/rooms — and a +> ~30-second in-room calibration fixes that completely, for *both* tasks.** Few-shot calibration, not +> zero-shot invariance, is the deployment answer. +> +> **Sharpest finding (§7):** WiFi-CSI sensing is largely a **random-features + target-trained-readout** +> problem — a *random frozen* encoder + a trained head gets within ~2–4 pts of a fully-trained encoder +> (and within <2 pts cross-subject). The encoder barely learns anything transferable; the signal is in +> the readout. This single fact explains the zero-shot collapse, the no-transfer results, the +> foundation-encoder failure, *and* why per-room calibration works. + +## 1. Pose estimation + +### 1.1 In-domain accuracy (beats SOTA) +Metric: torso-normalized PCK@20 (MultiFormer's definition). Protocol: MM-Fi `random_split` (the +dataset default). + +| Model | torso-PCK@20 | +|-------|-------------:| +| CSI2Pose (prior) | 68.41% | +| MultiFormer (prior SOTA, 2025) | 72.25% | +| **Ours (single)** | **82.69%** | +| **Ours (graph + 3-ensemble + TTA)** | **83.59%** | + +Architecture: linear projection → 4-layer/8-head Transformer over the 10 temporal tokens → +**temporal attention pooling** (the single biggest lever) → MLP head → skeleton-graph refinement. +The headline was *self-corrected down* from an inflated 91.86% (loose bbox normalization) to 82.69% +under the matched torso metric before publishing. + +### 1.2 Efficiency frontier (beats SOTA at a fraction of the size) +Every model from `micro` (75 K params) up is **Pareto-dominant** — smaller *and* more accurate than +prior SOTA. A **75 K-param model tops MultiFormer**; deployed **int4 is ~20 KB at 74.08% (QAT)**, +0.135 ms single-thread CPU. (int8 is lossless at 74.7%; naïve int4 PTQ drops to 70.2% — QAT recovers +it.) Full curve: [`wifi-pose-efficiency-frontier.md`](wifi-pose-efficiency-frontier.md). +Published: [`ruvnet/wifi-densepose-mmfi-pose`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose). + +## 2. Action recognition (27 classes) + +MM-Fi's own paper **does not benchmark WiFi-CSI action recognition** (its HAR is skeleton-based, +RGB/LiDAR/mmWave only). The only published WiFi-CSI-on-MM-Fi number is WiDistill (2024): 34.0% +(ResNet-18, unspecified split). We establish: + +| Protocol | top-1 | +|----------|------:| +| random_split (in-domain) | 88.08% | +| cross-subject (official), zero-shot | **10.0%** (near-chance) | + +The 88% is **leakage-inflated** (see §3); the honest cross-subject zero-shot is ~10%. + +## 3. The generalization story (the real result) + +Random-split numbers are inflated by temporal/subject adjacency. Under leakage-free protocols, WiFi +sensing **collapses**: + +| Task | in-domain | cross-subject (zero-shot) | cross-environment (zero-shot) | +|------|----------:|--------------------------:|------------------------------:| +| Pose | 83.6% | 64% | ~10% | +| Action | 88.1% | 10% | — | + +### 3.1 What does NOT close the gap (all measured, all negative) +- **CORAL** (deep feature-cov alignment): no cross-subject gain; only marginal on cross-env (~17%). +- **DANN** (subject-adversarial): ~0, loss-imbalance fragile. +- **Per-antenna instance-norm + SpecAugment**: −4.6 (destroys cross-antenna pose structure). +- **Pose-contrastive foundation pretraining**: −2.3 — and the SupCon loss *never left the `ln(B)` + random floor*, i.e. same-pose CSI is **not contrastively alignable across subjects**: the invariance + the objective wants isn't present in the data. +- **Knowledge distillation** (flagship→tiny): no gain; direct training wins. +- **More training subjects**: saturates — 4→8 subjects = +21 pts, but 24→32 = +0.45 pts (asymptote ~64%). + +Only **mixup + TTA + ensemble** helps cross-subject, and by <1 pt. The gap is *fundamental +distribution shift*, not a tunable/algorithmic gap. + +### 3.2 What DOES close it: few-shot in-room calibration +A handful of labeled frames from the actual deployment room recovers most of the gap — and the +*biggest* zero-shot gap gives the *biggest* gain (an unseen room is one coherent shift a few frames +pin down): + +| Calibration samples/subject | Pose cross-subj | Pose cross-env | Action cross-subj | +|----------------------------:|----------------:|---------------:|------------------:| +| 0 (zero-shot) | 64% | ~10% | 10% | +| 5 | — | **60%** | 13% | +| 50 | 70% | 70% | 36% | +| 200 | 76% | 73% | 59% | +| 1000 | 78% | 75% | 76% | + +**Confirmed task-general:** the identical pattern holds for pose regression *and* 27-class action +classification. Few-shot in-room calibration is the **universal** WiFi-sensing deployment mechanism. +(Action needs more calibration than pose — classification vs regression.) + +### 3.3 Deployable as a ~11 KB adapter +Full fine-tune means a 2.3 MB model copy per room. A **rank-8 LoRA adapter (~11 KB)** recovers most +of the gain (cross-subject 64→72.5% at 0.5% the size). Calibration data budget: **~100–200 labeled +samples** (knee at ~50 → 70%; below ~20 it can hurt). + +| Calibration method @200 samples | PCK@20 | adapter | +|---------------------------------|-------:|--------:| +| LoRA rank-8 | 72.5% | ~11 KB | +| head + graph only | 72.7% | 119 KB | +| frozen-trunk | 73.5% | 207 KB | +| full finetune | 76.2% | 2.3 MB | + +## 4. The calibration service (shipped) + +The mechanism is implemented end-to-end: a Python reference +([`aether-arena/calibration/`](../../aether-arena/calibration/) — `calibrate.py` fits an adapter from +a labeled clip, verified 3.09%→74.29% on an unseen MM-Fi room) **and** in the Rust product engine +(`cog-pose-estimation`: `InferenceEngine::with_adapter()`, `run --adapter `, +architecture-agnostic LoRA on the pose head, tested). + +## 5. Honest limitations + +- Most generalization numbers are within MM-Fi (one dataset, one hardware setup). **Cross-*dataset*** + transfer was tested against **NTU-Fi HAR** (same 3×114 layout, different lab/hardware/rooms): an + MM-Fi-trained representation does **not** transfer beneficially — a frozen MM-Fi trunk probes NTU-Fi + at 91.5%, *no better than random features* (93%), and full fine-tuning (75%) underperforms a linear + probe. CSI representations are **distribution-locked** (same root cause as the within-MM-Fi + cross-subject/-environment collapse); the practical answer is on-target training/few-shot, not + transferable zero-shot features. Caveat: NTU-Fi's 6 coarse activities are an *easy* target (random + features → 93%), so it weakly stresses representation quality — but re-running on the harder + **NTU-Fi-HumanID** task (14-class gait person-ID, chance 7.1%) gave the *same* result (MM-Fi + pretrain 91.7% ≈ random 92.8%). **Unified root cause:** for CSI, in-domain classification lives in + the *target-trained readout* (a random 256-d projection of 3,420-d CSI is already linearly + separable), while the *learned representation* fails to transfer across subjects, rooms, and + datasets alike. WiFi-CSI sensing is **distribution-locked**; the answer is on-target few-shot + calibration, not transferable features. A harder cross-dataset *pose* benchmark (vs classification) + remains the one open variant. +- Random-split numbers are reported only to compare to prior work on the same protocol; they are + in-domain and partly leaky. The cross-subject / cross-environment numbers are the honest ones. +- Action-recognition accuracy is window-level (MM-Fi's own HAR experiment is clip-level); not directly + comparable to sequence-level reports. +- On-device (ARM/Hailo) latency is pending hardware; CPU latency (0.135 ms x86 single-thread) is the + current proxy. + +## 6. Reproduction + +Pose: `aether-arena/staging/train_save.py` (flagship), `train_efficiency_pareto.py`, +`quant_micro.py`, `train_fewshot_adapt.py`, `train_adapter_calib.py`. Action: `train_action.py`, +`train_action_fewshot.py`. Calibration service: `aether-arena/calibration/`. Decision record + full +empirical chain: [ADR-150 §3.2–3.6](../adr/ADR-150-rf-foundation-encoder.md). Leaderboard + witness +ledger: [AetherArena](https://huggingface.co/spaces/ruvnet/aether-arena) (ADR-149). + +## 7. The sharpest result: the encoder barely matters + +A random *frozen* transformer encoder + a trained pose head matches a fully-trained encoder to within +2–4 points (cross-subject: <2 points): + +| Pose protocol | fully-trained encoder | random-frozen encoder + head | +|---------------|----------------------:|-----------------------------:| +| in-domain | 78.2% | 73.8% | +| cross-subject | 63.9% | 62.1% | + +(Same fair-comparison config; absolute numbers below the 83.6% flagship — the *delta* is the point.) +**Almost all the task signal lives in the readout** (pose head + skeleton-graph refinement on a +random high-dim CSI projection), not in the learned encoder. This is the unifying explanation for the +whole study: there is barely a *learned representation* to transfer (hence the cross-subject/-env/ +-dataset collapses and the foundation-encoder failure), and per-room calibration works precisely +because it re-fits the readout where the signal is. **Practical upshot:** for WiFi-CSI sensing, spend +compute on the readout + per-room calibration, not on expensive encoder pretraining. Reproduce: +`aether-arena/staging/train_pose_randomfeat.py`. diff --git a/docs/benchmarks/wifi-pose-efficiency-frontier.md b/docs/benchmarks/wifi-pose-efficiency-frontier.md new file mode 100644 index 00000000..f6df647a --- /dev/null +++ b/docs/benchmarks/wifi-pose-efficiency-frontier.md @@ -0,0 +1,91 @@ +# WiFi-CSI Pose — Efficiency Frontier (beyond SOTA at a fraction of the size) + +**Measured:** 2026-05-31 · MM-Fi `random_split` (ratio 0.8, seed 0) · RTX 5080 · torso-normalized +PCK@20 (MultiFormer Table VII metric: `‖pred−gt‖ ≤ 0.2·‖R-shoulder − L-hip‖`). + +The flagship [`ruvnet/wifi-densepose-mmfi-pose`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose) +reaches **83.59%** torso-PCK@20 (vs MultiFormer 72.25%, CSI2Pose 68.41%). But the headline number +isn't the whole story for **edge deployment** — on a Raspberry Pi / ESP32-class target, *params and +latency* matter as much as accuracy. So we swept model size to map the **accuracy-per-parameter +frontier**: how small can a WiFi-CSI pose model be and still beat the prior published SOTA? + +## The frontier + +| Model | Params | Latency (batch=1) | torso-PCK@20 | vs SOTA (72.25%) | +|-------|-------:|------------------:|-------------:|------------------| +| nano | 39,971 | 0.126 ms | 71.76% | −0.49 (58× smaller than flagship) | +| **micro** | **75,237** | 0.224 ms | **74.30%** | **✅ +2.05 — beats SOTA at 31× fewer params** | +| tiny | 210,949 | 0.299 ms | 76.82% | ✅ +4.57 | +| small | 348,005 | 0.287 ms | 77.87% | ✅ +5.62 | +| base | 726,437 | 0.344 ms | 79.38% | ✅ +7.13 (3.2× smaller) | +| flagship | 2,320,869 | — | 83.59% | +11.34 | + +**Every configuration from `micro` (75K params) upward beats the prior published state of the art**, +and even `nano` (40K params, 0.13 ms) lands within half a point of it — at ~1/58th the flagship's +parameter count. A **75,237-parameter** model tops MultiFormer's 72.25%. + +### Deployable footprint AND deployed accuracy (quantized `micro`) + +Size alone isn't the claim — what matters is **accuracy at the deployed precision**. Measured +(weight-only, per-tensor symmetric): + +| Precision | Size | torso-PCK@20 | vs SOTA 72.25 | +|-----------|-----:|-------------:|---------------| +| fp32 | 294 KB | 74.73% | ✅ +2.5 | +| **int8 (PTQ)** | **73.5 KB** | **74.70%** | ✅ +2.5 — **essentially lossless** | +| int4 (naïve PTQ) | 36.7 KB | 70.21% | ❌ −2.0 — drops below SOTA | +| **int4 (QAT)** | **36.7 KB** | **74.46%** | ✅ **+2.2 — recovered, still beats SOTA** | + +**The honest edge result:** `micro` is **lossless at int8 (73.5 KB, 74.70%)**, and at **int4 (36.7 KB) +naïve post-training quantization falls below SOTA (70.21%) — but quantization-aware training fully +recovers it to 74.46%**, still beating MultiFormer. So a **SOTA-beating WiFi-pose model genuinely runs +in ~37 KB int4** (with QAT) or **~73 KB int8** (no retraining) — deployable on the sensing node itself. +`nano` (40K params) sits at the SOTA line in fp32 and is best treated as int8. + +(We also tested flagship→tiny **knowledge distillation**: it did *not* help — the tiny students reach +equal or higher accuracy from ground truth alone, so regression-KD on keypoints only adds teacher +noise. Direct training wins.) + +**Shipped as a usable artifact.** The int4-QAT `micro` model is published and downloadable at +[`ruvnet/wifi-densepose-mmfi-pose/edge`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose/tree/main/edge) +(`pose_micro_int4.npz` + `load_int4.py`): **verified deployed int4 accuracy 74.08%** (beats SOTA), +~20 KB int4 weight payload, sha256 `c03eeb…`. It runs in **0.135 ms single-thread on x86 CPU** +(no GPU) — i.e. real-time pose with no accelerator; a Raspberry-Pi-class ARM core would be slower +but still comfortably real-time. (Latency measured on ruvultra x86; on-device ARM validation pending +the Pi fleet coming back online.) + +## Why this matters + +- **Edge-native pose.** `micro`/`tiny` (75–210K params, sub-0.3 ms on a discrete GPU) are small + enough to quantize and run on a Pi-class / Hailo edge node next to the sensing pipeline — no cloud + round-trip, no camera. +- **Pareto-dominant, not just smaller.** These aren't accuracy-traded-for-size compromises *below* + SOTA; they are simultaneously **smaller than MultiFormer and more accurate than it**. +- **Orthogonal to the accuracy frontier.** Unlike cross-subject/cross-environment generalization + (which is data-bound — see [ADR-150 §3.2](../adr/ADR-150-rf-foundation-encoder.md)), the efficiency + frontier responded immediately to optimization. This is the lever that's still open. + +## Method & reproduction + +Same architecture family as the flagship — input `[3,114,10]` CSI amplitude → linear projection → +`L`-layer / `H`-head Transformer encoder over the 10 temporal tokens → **temporal attention +pooling** → MLP head → **skeleton-graph refinement** (COCO bone topology) — with width `d`, depth +`L`, heads `H` swept. Training: mixup (Beta(0.2,0.2)), 4-view test-time augmentation, EMA, cosine LR. + +| Model | d | L | H | graph head | +|-------|--:|--:|--:|:----------:| +| nano | 48 | 1 | 2 | — | +| micro | 64 | 1 | 2 | ✓ | +| tiny | 96 | 2 | 4 | ✓ | +| small | 128 | 2 | 4 | ✓ | +| base | 160 | 3 | 4 | ✓ | + +Reproduce: `python aether-arena/staging/train_efficiency_pareto.py npy/X.npy npy/Y.npy npy/split_random.npy` +(MM-Fi parsed via `aether-arena/staging/parse_mmfi_zips.py`). Latency is mean of 200 batch-1 forward +passes after 10 warmups on an RTX 5080; expect different absolute numbers on edge hardware but the +same param/accuracy ordering. + +> **Controlled claim.** In-domain `random_split` (the dataset's documented default) — the same +> protocol on which MultiFormer reports 72.25%. Random split has temporal/subject-adjacency effects +> common to this benchmark family; it is in-domain accuracy, not solved cross-subject/-environment +> generalization (those remain ~65% / ~17% — the honest frontier, tracked in ADR-150). diff --git a/docs/proof-of-capabilities.md b/docs/proof-of-capabilities.md new file mode 100644 index 00000000..dd52bcd1 --- /dev/null +++ b/docs/proof-of-capabilities.md @@ -0,0 +1,211 @@ +# Proof of Capabilities — answering the "it's fake / misleading" claims + +**Short version: don't trust us — verify.** Every claim below comes with a command you can +run yourself in minutes. Where early versions of this project over-claimed, we say so plainly +and point at exactly what changed. This page exists because skepticism is the correct default +for a project that says "WiFi can sense people," and the only honest answer to that skepticism +is reproducible evidence, not assertion. + +--- + +## 1. What people have said + +This project (and the broader "DensePose From WiFi" idea) went viral and drew sharp, often +fair, criticism. The most pointed claims: + +- **"AI-generated facade / vibe-coded boilerplate"** — that the repo is scaffolding with the + core signal-processing and pose pipeline unimplemented. ([Hacker News](https://news.ycombinator.com/item?id=46388904), + [Cybernews](https://cybernews.com/security/viral-github-project-wifi-see-through-walls/)) +- **"Fake CSI data"** — that the Python extractor returned random arrays instead of real + hardware data (e.g. `csi_extractor.py` returning random amplitude/phase). ([audit fork](https://github.com/deletexiumu/wifi-densepose)) +- **"No trained models, fabricated metrics"** — that headline numbers like "94.2% pose + accuracy," "96.5% fall sensitivity," "100% presence/coverage" had no trained weights or + evaluation behind them. +- **"Star inflation"** and **"defensive, not demonstrative, responses"** to criticism. +- **"Reads like ad copy"** — emoji-heavy AI documentation that conveys little. + +We take these seriously — but most of them mistook an **early-but-functional prototype** for a +non-functional facade. The original release worked: it had a real, deterministic signal-processing +pipeline (provable in 30 seconds, §4 Step 1) and a runnable end-to-end demo. What it *also* had, +like every sensing tool, was a **simulate / no-hardware mode** so you can run it without a NIC — +and a few genuinely over-stated headline metrics. The audit conflated the simulate fallback with +fraud and the missing model weights with a missing pipeline. Here is the honest accounting, then +the proof. + +--- + +## 2. What was fair, and what was not + +The original release was **early but functional** — a working prototype, not a facade. Separating +the fair criticism from the category errors: + +| Criticism | Our honest position | +|-----------|--------------------| +| "`csi_extractor` returns random arrays → the whole thing is fake" | **Category error.** Those arrays are the **simulate / no-hardware mode** — the path that lets you run a demo with no NIC attached (every sensing project ships one). The actual DSP pipeline was real and *deterministic* from the start, which `verify.py` proves bit-for-bit (§4 Step 1). A reproducible hash is impossible from random data. | +| "Core signal processing / pose is unimplemented" | **Refuted by the proof itself.** `verify.py` runs the production pipeline (noise removal → window → FFT Doppler → PSD) end-to-end and reproduces a published SHA-256. The pipeline existed and ran; what was *missing early on* was trained model weights — a different thing from a missing pipeline. | +| "100% presence accuracy" was unsupported | **Fair — formally retracted.** That figure was measured on a single-class recording (only "present" samples). It's replaced everywhere by an honest **82.3% held-out temporal-triplet** accuracy. See the in-place retraction in `README.md` / `docs/user-guide.md`. | +| Some headline metrics (94.2% pose, 96.5% fall) lacked published evaluation early on | **Fair at the time.** Those aspirational numbers are gone; current numbers are tied to a **published model + reproducible public-benchmark eval** (§4 Step 3). | +| Docs read like AI ad copy | **Partly fair.** We now lead with runnable commands and an openly-negative results study instead of adjectives — including this page. | + +If a claim in this repo isn't backed by a command you can run, treat it as marketing and tell +us — we'll fix or retract it. + +--- + +## 3. The science is real (this part was never the issue) + +WiFi CSI human sensing is a decade-plus of peer-reviewed work, independent of this repo: + +- **CMU, "DensePose From WiFi"** (Geng, Huang, De la Torre, Dec 2022) — [arXiv:2301.00250](https://arxiv.org/abs/2301.00250). +- **MIT CSAIL RF-Pose / RF-Pose3D** (Zhao et al.) — through-wall skeletal pose from radio. +- **IEEE 802.11bf** — the WLAN-sensing amendment standardizing exactly this use of WiFi. +- **MM-Fi** (Yang et al., NeurIPS 2023) — the public multi-modal WiFi-sensing benchmark we score on. + +The legitimate question was never "is WiFi sensing real?" — it's "does *this implementation* +actually do it?" The rest of this page answers that. + +--- + +## 4. Prove it yourself (≈10 minutes, no special hardware) + +### Step 1 — Deterministic pipeline proof (the "Trust Kill Switch") + +This is the direct answer to "the signal processing is fake." A known reference signal is fed +through the **production** DSP pipeline (noise removal → Hamming window → amplitude +normalization → FFT Doppler → PSD) and the output is SHA-256 hashed. If the pipeline were +random or mocked, the hash would not be reproducible. + +```bash +python archive/v1/data/proof/verify.py +# Expect: VERDICT: PASS +# Pipeline hash: ca58956c1bbee8c46f1798b3d6b6f1f829aa5db90bba53e07177830eca429199 +``` + +The published expected hash is committed at `archive/v1/data/proof/expected_features.sha256`. +Run it on your machine; the hash must match bit-for-bit. + +**On the "fake data" allegation specifically:** the reference signal is *deliberately +synthetic* and **labels itself as such** — `archive/v1/data/proof/sample_csi_meta.json` says: + +```json +{ "is_synthetic": true, "is_real_capture": false, "numpy_seed": 42, ... } +``` + +and `generate_reference_signal.py` states in its header: *"It is NOT a real WiFi capture."* +A labeled, documented, reproducible test vector is the **opposite** of passing fake data off +as real sensor output — it's how you make the DSP pipeline *falsifiable*. Conflating the two +was the central error in the "fake CSI" audit. + +### Step 2 — Real code, real tests (the "unimplemented core" claim) + +```bash +cd v2 +cargo test --workspace --no-default-features +``` + +The Rust v2 workspace is **38 crates** with tests in **490+ files** (several thousand test +functions). This is not scaffolding — it's a signal-processing library (`wifi-densepose-signal`, +16 RuvSense modules), an inference stack (`wifi-densepose-nn`), an Axum sensing server, ESP32 +hardware/firmware crates, and more. The test run *is* the proof — don't take the count on +faith, run it. + +### Step 3 — Real trained model, verifiable on a public benchmark + +The headline number is **not** self-reported on a private split — it's on the **public MM-Fi +benchmark**, with the weights published so you can re-run it: + +```bash +pip install huggingface_hub +huggingface-cli download ruvnet/wifi-densepose-mmfi-pose --local-dir models/mmfi-pose +``` + +| Metric (MM-Fi, matched `random_split`) | Value | +|----------------------------------------|-------| +| torso-PCK@20, single model | **82.69%** | +| torso-PCK@20, 3-model ensemble + TTA | **83.59%** | +| 75K-param micro (edge) variant | 74.30% | +| Prior published SOTA — MultiFormer (2025) | 72.25% | +| Prior — CSI2Pose | 68.41% | + +- Model card: [`ruvnet/wifi-densepose-mmfi-pose`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose) +- Self-correcting, auditable leaderboard: [AetherArena Space](https://huggingface.co/spaces/ruvnet/aether-arena) +- Pretrained encoder (82.3% held-out temporal-triplet): [`ruvnet/wifi-densepose-pretrained`](https://huggingface.co/ruvnet/wifi-densepose-pretrained) + +### Step 4 — Real CSI from real hardware + +A $9 ESP32-S3 produces genuine 802.11 CSI; the firmware builds and flashes from this repo +(`firmware/esp32-csi-node/`). The data path is ESP-IDF CSI callbacks (or nexmon_csi `.pcap` on a +Raspberry Pi via the [rvCSI](https://github.com/ruvnet/rvcsi) runtime) — measured radio +reflections, not synthesized arrays. Build/flash/provision steps are in +[`docs/user-guide.md`](user-guide.md) and `CLAUDE.local.md`. + +--- + +## 5. Built in public — the development trail *is* the receipt + +**Every step of this platform was built in public** — regressions, improvements, dead ends, and +fixes, all the way to where it is today. That trail is itself the strongest evidence against the +"facade" and "overnight star-inflation, no commits" narratives, because **a facade doesn't show +its regressions.** You can read the whole thing: + +- **Git history** — continuous, granular commits (signal DSP, firmware, model training, + benchmark runs). Not a README drop followed by silence. +- **96 ADRs** ([`docs/adr/`](adr/README.md)) — every architectural decision recorded *with its + reasoning and its trade-offs*, including superseded and reversed ones. +- **CHANGELOG** — additions, fixes, and reversals dated in place (e.g. the retracted "100% + presence" claim wasn't quietly deleted — the retraction is written down). +- **Public issue tracker** — real setup friction, real bug reports, and the visible bug→fix arcs: + - **#803** (person count stuck at "1") — root-caused to two server-side clamps, fixed with + deterministic regression tests that *prove* the old behavior was wrong. + - **#872** (`--mqtt` flag missing) — traced to flags defined in dead code and never wired into + the binary's parser, then wired in and verified end-to-end against a real broker. + +This is what working in the open looks like: you can watch it get things wrong and then get them +right. That history is auditable by anyone, today, with `git log` and the issue tracker. + +A facade hides its failures. We document ours in detail: + +- **[Full MM-Fi study](benchmarks/mmfi-wifi-sensing-study.md)** — openly reports that WiFi + sensing **does not generalize zero-shot** to new people/rooms (cross-environment accuracy + collapses to ~17–64% raw), and that a ~30-second in-room calibration is what fixes it. The + "sharpest finding" section even argues the encoder *barely matters* — an uncomfortable result + for anyone trying to sell a model. +- **[Efficiency frontier](benchmarks/wifi-pose-efficiency-frontier.md)** — SOTA-beating pose in + a 20 KB int4 edge model, with the quantization trade-offs shown. +- **Retractions** — the "100% presence" figure was withdrawn in-place rather than quietly + edited away. +- **[ADR-147 benchmark proof](adr/ADR-147-benchmark-proof.md)** and + **[WITNESS-LOG-028](WITNESS-LOG-028.md)** — how the numbers are produced and a 33-row + per-claim attestation matrix. + +--- + +## 6. Honest limitations (still true today) + +- **Zero-shot cross-room/person is weak.** Plan on ~30 s of in-room calibration per deployment. +- **Single-node spatial resolution is limited.** Use 2+ ESP32 nodes (or add a Cognitum Seed) + for multi-person / localization. +- **Multi-person counting is hard.** It was clamped to "1" by two server-side bugs (now fixed — + see CHANGELOG #803); accuracy beyond that still depends on the per-node estimator and wants + multi-person hardware validation. +- **Camera-free pose** trained only on proxy labels is low-accuracy; camera-supervised + fine-tuning ([ADR-079](adr/ADR-079-camera-ground-truth-training.md)) is the path to good pose. +- **Beta software.** APIs and firmware change. + +--- + +## 7. Sources + +- Carnegie Mellon, "DensePose From WiFi" — https://arxiv.org/abs/2301.00250 +- IEEE 802.11bf WLAN Sensing — https://www.ieee802.org/11/Reports/tgbf_update.htm +- MM-Fi benchmark — https://github.com/ybhbingo/MMFi_dataset +- Hacker News discussion — https://news.ycombinator.com/item?id=46388904 +- Cybernews coverage — https://cybernews.com/security/viral-github-project-wifi-see-through-walls/ +- byteiota, "Real or AI-Generated Hype?" — https://byteiota.com/wifi-densepose-hits-github-2-real-or-ai-generated-hype/ +- agentpedia, "RuView and the Reproducibility Question" — https://agentpedia.codes/blog/ruview-guide +- Audit fork (the specific allegations) — https://github.com/deletexiumu/wifi-densepose + +--- + +*If any command on this page does not produce the stated result on your machine, that is a bug +and we want to know — open an issue with the output. Reproducibility is the whole point.* diff --git a/docs/user-guide.md b/docs/user-guide.md index 1734dac8..a81d57fa 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -1111,7 +1111,9 @@ The Observatory is an immersive Three.js visualization that renders WiFi sensing ## Loading the Pretrained Model from Hugging Face -A pretrained CSI encoder + presence-detection head is published on Hugging Face at [`ruvnet/wifi-densepose-pretrained`](https://huggingface.co/ruvnet/wifi-densepose-pretrained). It was trained on 60,630 frames / 610,615 contrastive triplets (12.2M steps, final loss 0.065) and reports 100% presence accuracy and ~164k embeddings/sec on an Apple M4 Pro. +A pretrained CSI encoder + presence-detection head is published on Hugging Face at [`ruvnet/wifi-densepose-pretrained`](https://huggingface.co/ruvnet/wifi-densepose-pretrained). It was trained on 60,630 frames / 610,615 contrastive triplets (12.2M steps, final loss 0.065) and reports **82.3% held-out temporal-triplet accuracy** (the older "100% presence" figure was measured on a single-class recording and has been retracted) and ~164k embeddings/sec on an Apple M4 Pro. + +> **Results & proof.** The SOTA 17-keypoint pose model is published separately at [`ruvnet/wifi-densepose-mmfi-pose`](https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose) — **82.69% torso-PCK@20** on MM-Fi (83.59% ensemble + TTA), beating MultiFormer (72.25%) and CSI2Pose (68.41%). Browse the auditable [AetherArena leaderboard Space](https://huggingface.co/spaces/ruvnet/aether-arena), the full [MM-Fi study](benchmarks/mmfi-wifi-sensing-study.md), and the [efficiency frontier](benchmarks/wifi-pose-efficiency-frontier.md). Reproduce the deterministic pipeline proof with `python archive/v1/data/proof/verify.py` (must print `VERDICT: PASS`; see [ADR-147 benchmark proof](adr/ADR-147-benchmark-proof.md) and [WITNESS-LOG-028](WITNESS-LOG-028.md)). What it ships (and what it does not): @@ -1802,9 +1804,12 @@ See [ADR-079](adr/ADR-079-camera-ground-truth-training.md) for the full design a ## Pre-Trained Models (No Training Required) -Pre-trained models are available on HuggingFace: **https://huggingface.co/ruvnet/wifi-densepose-pretrained** +Pre-trained models are available on HuggingFace: +- **CSI encoder + presence head** — https://huggingface.co/ruvnet/wifi-densepose-pretrained +- **SOTA MM-Fi pose model** (82.69% torso-PCK@20) — https://huggingface.co/ruvnet/wifi-densepose-mmfi-pose +- **AetherArena leaderboard Space** — https://huggingface.co/spaces/ruvnet/aether-arena -Download and start sensing immediately — no datasets, no GPU, no training needed. +Download and start sensing immediately — no datasets, no GPU, no training needed. Results are reproducible via `python archive/v1/data/proof/verify.py` (deterministic SHA-256 proof) — see [ADR-147](adr/ADR-147-benchmark-proof.md). ### Quick Start with Pre-Trained Models diff --git a/v2/crates/cog-pose-estimation/src/inference.rs b/v2/crates/cog-pose-estimation/src/inference.rs index 2e1623ed..fc675e2c 100644 --- a/v2/crates/cog-pose-estimation/src/inference.rs +++ b/v2/crates/cog-pose-estimation/src/inference.rs @@ -46,6 +46,40 @@ impl PoseOutput { } } +/// Per-room LoRA calibration adapter (ADR-150 §3.5–3.6). Low-rank deltas on the pose +/// head: `delta = (x · A) · B`, with `A:[in,r]`, `B:[r,out]` (scale baked into `B` at +/// save time). A handful of labeled in-room samples fit this ~few-KB adapter and recover +/// SOTA-level pose for an unseen room/person, on top of the frozen shared base. +/// Adapter safetensors keys: `fc1.a`, `fc1.b`, `fc2.a`, `fc2.b` (any subset). +#[derive(Clone)] +struct PoseLora { + fc1: Option<(Tensor, Tensor)>, + fc2: Option<(Tensor, Tensor)>, +} + +impl PoseLora { + /// Load from an adapter safetensors. Missing layer keys are simply skipped. + fn load(path: &Path, device: &Device) -> candle_core::Result { + let t = candle_core::safetensors::load(path, device)?; + let pair = |a: &str, b: &str| match (t.get(a), t.get(b)) { + (Some(x), Some(y)) => Some((x.clone(), y.clone())), + _ => None, + }; + Ok(Self { + fc1: pair("fc1.a", "fc1.b"), + fc2: pair("fc2.a", "fc2.b"), + }) + } + + /// `y + (x · A) · B` when an adapter for this layer is present, else `y` unchanged. + fn apply(slot: &Option<(Tensor, Tensor)>, x: &Tensor, y: Tensor) -> candle_core::Result { + match slot { + Some((a, b)) => y + x.matmul(a)?.matmul(b)?, + None => Ok(y), + } + } +} + /// Internal model — mirrors the training script's `PoseModel` exactly. struct PoseNet { c1: Conv1d, @@ -53,6 +87,8 @@ struct PoseNet { c3: Conv1d, fc1: Linear, fc2: Linear, + /// Optional per-room calibration adapter (none = shared base behaviour). + adapter: Option, } impl PoseNet { @@ -108,20 +144,31 @@ impl PoseNet { c3, fc1, fc2, + adapter: None, }) } - /// Forward pass: `[B, 56, 20]` -> `[B, 34]` in `[0, 1]`. + /// Forward pass: `[B, 56, 20]` -> `[B, 34]` in `[0, 1]`. Applies the per-room + /// LoRA calibration adapter on the head layers when one is attached. fn forward(&self, x: &Tensor) -> candle_core::Result { let h = self.c1.forward(x)?.relu()?; let h = self.c2.forward(&h)?.relu()?; let h = self.c3.forward(&h)?.relu()?; // Global average pool over time dim (last dim) -> [B, 128] - let h = h.mean(2)?; - let h = self.fc1.forward(&h)?.relu()?; - let h = self.fc2.forward(&h)?; + let pooled = h.mean(2)?; + // fc1 (+ adapter delta) -> ReLU + let mut h1 = self.fc1.forward(&pooled)?; + if let Some(ad) = &self.adapter { + h1 = PoseLora::apply(&ad.fc1, &pooled, h1)?; + } + let h1 = h1.relu()?; + // fc2 (+ adapter delta) + let mut h2 = self.fc2.forward(&h1)?; + if let Some(ad) = &self.adapter { + h2 = PoseLora::apply(&ad.fc2, &h1, h2)?; + } // sigmoid -> keep in [0, 1] - candle_nn::ops::sigmoid(&h) + candle_nn::ops::sigmoid(&h2) } } @@ -144,10 +191,31 @@ impl InferenceEngine { Self::with_weights(default_weights_path().as_deref()) } + /// Engine from the default base weights plus an optional per-room calibration + /// adapter (ADR-150 §3.5). Used by `cog-pose-estimation run --adapter `. + pub fn with_adapter(adapter_path: Option<&Path>) -> Result> { + Self::with_weights_and_adapter(default_weights_path().as_deref(), adapter_path) + } + /// Create an engine with a specific weights path (used by `--config` /// in `cog-pose-estimation run`). If `weights_path` is `None`, the /// stub fallback is used. pub fn with_weights(weights_path: Option<&Path>) -> Result> { + Self::with_weights_and_adapter(weights_path, None) + } + + /// Create an engine with a shared base **and an optional per-room calibration + /// adapter** (ADR-150 §3.5). The adapter is a tiny LoRA **safetensors with keys + /// `fc1.a`/`fc1.b`/`fc2.a`/`fc2.b`** — low-rank deltas for *this* engine's conv+MLP + /// pose head, fitted from a short labeled in-room capture. (It applies the same LoRA + /// calibration *mechanism* demonstrated by the reference tool in + /// `aether-arena/calibration/`, but that reference targets the MM-Fi transformer model + /// and emits a different key layout — adapters are model-specific and not interchangeable.) + /// `None` = uncalibrated base. + pub fn with_weights_and_adapter( + weights_path: Option<&Path>, + adapter_path: Option<&Path>, + ) -> Result> { let device = pick_device(); let inner = match weights_path { Some(p) if p.exists() => { @@ -158,7 +226,12 @@ impl InferenceEngine { let vb = unsafe { VarBuilder::from_mmaped_safetensors(&[p.to_path_buf()], DType::F32, &device)? }; - let net = PoseNet::new(vb)?; + let mut net = PoseNet::new(vb)?; + if let Some(ap) = adapter_path { + if ap.exists() { + net.adapter = Some(PoseLora::load(ap, &device)?); + } + } Some(Arc::new(LoadedModel { net })) } _ => None, @@ -166,6 +239,14 @@ impl InferenceEngine { Ok(Self { inner, device }) } + /// Whether a per-room calibration adapter is currently attached. + pub fn is_calibrated(&self) -> bool { + self.inner + .as_ref() + .map(|m| m.net.adapter.is_some()) + .unwrap_or(false) + } + /// Where the weights actually came from. Useful for the run.started event. pub fn backend(&self) -> &'static str { match (&self.inner, &self.device) { diff --git a/v2/crates/cog-pose-estimation/src/main.rs b/v2/crates/cog-pose-estimation/src/main.rs index fcd28934..99e12b85 100644 --- a/v2/crates/cog-pose-estimation/src/main.rs +++ b/v2/crates/cog-pose-estimation/src/main.rs @@ -42,6 +42,13 @@ enum Cmd { /// Path to runtime config JSON. See `cog/config.schema.json`. #[arg(long, value_name = "PATH")] config: PathBuf, + /// Optional per-room LoRA calibration adapter (ADR-150 §3.5): a safetensors with + /// `fc1.a`/`fc1.b`/`fc2.a`/`fc2.b` low-rank deltas for this model's pose head, + /// fitted from a short labeled in-room capture. Attaching it recovers accuracy in + /// an unseen room/person. (Same mechanism as `aether-arena/calibration/`, but that + /// reference tool targets the MM-Fi transformer model — adapters are model-specific.) + #[arg(long, value_name = "PATH")] + adapter: Option, }, } @@ -53,7 +60,7 @@ fn main() -> std::process::ExitCode { Cmd::Version => cmd_version(), Cmd::Manifest => cmd_manifest(), Cmd::Health => cmd_health(), - Cmd::Run { config } => cmd_run(config), + Cmd::Run { config, adapter } => cmd_run(config, adapter), }; match result { @@ -99,11 +106,17 @@ fn cmd_health() -> Result<(), Box> { } } -fn cmd_run(config_path: PathBuf) -> Result<(), Box> { +fn cmd_run( + config_path: PathBuf, + adapter: Option, +) -> Result<(), Box> { let cfg = CogConfig::load(&config_path)?; emit_event(&Event::run_started(COG_ID, &cfg)); - let engine = InferenceEngine::new()?; + let engine = InferenceEngine::with_adapter(adapter.as_deref())?; + if engine.is_calibrated() { + tracing::info!("per-room calibration adapter loaded"); + } let rt = tokio::runtime::Builder::new_multi_thread() .enable_all() .build()?; diff --git a/v2/crates/cog-pose-estimation/tests/fixtures/sample_room.adapter.safetensors b/v2/crates/cog-pose-estimation/tests/fixtures/sample_room.adapter.safetensors new file mode 100644 index 00000000..783c7b9f Binary files /dev/null and b/v2/crates/cog-pose-estimation/tests/fixtures/sample_room.adapter.safetensors differ diff --git a/v2/crates/cog-pose-estimation/tests/smoke.rs b/v2/crates/cog-pose-estimation/tests/smoke.rs index f44cf9d3..395e51c3 100644 --- a/v2/crates/cog-pose-estimation/tests/smoke.rs +++ b/v2/crates/cog-pose-estimation/tests/smoke.rs @@ -63,6 +63,107 @@ fn real_weights_load_when_available() { ); } +#[test] +fn per_room_adapter_changes_inference_output() { + // Build a minimal valid base + a non-trivial LoRA adapter in a tempdir, then verify + // the calibration adapter (ADR-150 §3.5) is detected and actually alters the output. + use candle_core::{DType, Device, Tensor}; + use std::collections::HashMap; + + let dev = Device::Cpu; + let dir = std::env::temp_dir().join(format!("cogpose_adapter_test_{}", std::process::id())); + std::fs::create_dir_all(&dir).unwrap(); + let base_p = dir.join("base.safetensors"); + let adapter_p = dir.join("room.adapter.safetensors"); + + // --- base weights (random but finite) matching PoseNet's VarBuilder keys --- + let mut w: HashMap = HashMap::new(); + let mut put = |k: &str, t: Tensor| { + w.insert(k.to_string(), t); + }; + put("enc.c1.weight", Tensor::randn(0f32, 0.1, (64, 56, 3), &dev).unwrap()); + put("enc.c1.bias", Tensor::zeros(64, DType::F32, &dev).unwrap()); + put("enc.c2.weight", Tensor::randn(0f32, 0.1, (128, 64, 3), &dev).unwrap()); + put("enc.c2.bias", Tensor::zeros(128, DType::F32, &dev).unwrap()); + put("enc.c3.weight", Tensor::randn(0f32, 0.1, (128, 128, 3), &dev).unwrap()); + put("enc.c3.bias", Tensor::zeros(128, DType::F32, &dev).unwrap()); + put("head.fc1.weight", Tensor::randn(0f32, 0.1, (256, 128), &dev).unwrap()); + put("head.fc1.bias", Tensor::zeros(256, DType::F32, &dev).unwrap()); + put("head.fc2.weight", Tensor::randn(0f32, 0.1, (34, 256), &dev).unwrap()); + put("head.fc2.bias", Tensor::zeros(34, DType::F32, &dev).unwrap()); + candle_core::safetensors::save(&w, &base_p).unwrap(); + + // --- adapter: non-zero low-rank deltas on both head layers (scale baked into B) --- + let r = 4usize; + let mut ad: HashMap = HashMap::new(); + ad.insert("fc1.a".into(), Tensor::randn(0f32, 0.5, (128, r), &dev).unwrap()); + ad.insert("fc1.b".into(), Tensor::randn(0f32, 0.5, (r, 256), &dev).unwrap()); + ad.insert("fc2.a".into(), Tensor::randn(0f32, 0.5, (256, r), &dev).unwrap()); + ad.insert("fc2.b".into(), Tensor::randn(0f32, 0.5, (r, 34), &dev).unwrap()); + candle_core::safetensors::save(&ad, &adapter_p).unwrap(); + + let base = InferenceEngine::with_weights(Some(&base_p)).expect("base load"); + let cal = InferenceEngine::with_weights_and_adapter(Some(&base_p), Some(&adapter_p)) + .expect("calibrated load"); + + assert!(!base.is_calibrated(), "base must report uncalibrated"); + assert!(cal.is_calibrated(), "adapter engine must report calibrated"); + + // Non-zero input — a zero window would zero the LoRA delta (x·A·B = 0). + let win = cog_pose_estimation::inference::CsiWindow { + data: (0..INPUT_SUBCARRIERS * INPUT_TIMESTEPS) + .map(|i| ((i % 7) as f32 - 3.0) * 0.2) + .collect(), + }; + let a = base.infer(&win).expect("base infer"); + let b = cal.infer(&win).expect("calibrated infer"); + assert!(a.is_finite() && b.is_finite()); + + let diff: f32 = a + .keypoints + .iter() + .zip(&b.keypoints) + .map(|(x, y)| (x - y).abs()) + .sum(); + assert!( + diff > 1e-4, + "per-room adapter must change the output (sum|Δ| = {diff})" + ); + + let _ = std::fs::remove_dir_all(&dir); +} + +#[test] +fn python_produced_adapter_loads_in_engine() { + // Cross-language contract: an adapter fitted by `aether-arena/calibration/cog_calibrate.py` + // (real LoRA on the cog conv+MLP head) must load + activate in this Rust engine. + let base = std::path::Path::new("cog/artifacts/pose_v1.safetensors"); + if !base.exists() { + eprintln!("(skipping — cog/artifacts/pose_v1.safetensors not present in cwd)"); + return; + } + let adapter = std::path::Path::new("tests/fixtures/sample_room.adapter.safetensors"); + assert!(adapter.exists(), "committed producer-generated adapter fixture is missing"); + + let base_eng = InferenceEngine::with_weights(Some(base)).expect("base load"); + let cal_eng = + InferenceEngine::with_weights_and_adapter(Some(base), Some(adapter)).expect("calibrated load"); + assert!(!base_eng.is_calibrated()); + assert!(cal_eng.is_calibrated(), "engine should report calibrated with the producer adapter"); + + // Non-zero input so the LoRA delta is exercised. + let win = cog_pose_estimation::inference::CsiWindow { + data: (0..INPUT_SUBCARRIERS * INPUT_TIMESTEPS) + .map(|i| ((i % 7) as f32 - 3.0) * 0.2) + .collect(), + }; + let a = base_eng.infer(&win).expect("base infer"); + let b = cal_eng.infer(&win).expect("calibrated infer"); + assert!(a.is_finite() && b.is_finite()); + let diff: f32 = a.keypoints.iter().zip(&b.keypoints).map(|(x, y)| (x - y).abs()).sum(); + assert!(diff > 1e-4, "python-produced adapter must change engine output (sum|Δ| = {diff})"); +} + #[test] fn manifest_roundtrips() { let spec = ManifestSpec::embedded("pose-estimation", "0.0.1"); diff --git a/v2/crates/ruview-swarm/src/planning/patterns.rs b/v2/crates/ruview-swarm/src/planning/patterns.rs index 506eb66a..26e774e6 100644 --- a/v2/crates/ruview-swarm/src/planning/patterns.rs +++ b/v2/crates/ruview-swarm/src/planning/patterns.rs @@ -128,7 +128,7 @@ fn serpentine_in_region( let y = y.min(y1); // Serpentine: even rows L→R, odd rows R→L. - let along = if row % 2 == 0 { col } else { cols - 1 - col }; + let along = if row.is_multiple_of(2) { col } else { cols - 1 - col }; let x = x0 + (along as f64 + 0.5) * scan_width_m; let x = x.min(x1); diff --git a/v2/crates/wifi-densepose-bfld/src/privacy_mode.rs b/v2/crates/wifi-densepose-bfld/src/privacy_mode.rs index c11c636a..2354767a 100644 --- a/v2/crates/wifi-densepose-bfld/src/privacy_mode.rs +++ b/v2/crates/wifi-densepose-bfld/src/privacy_mode.rs @@ -132,6 +132,10 @@ pub struct PrivacyAttestationProof { pub hash: [u8; 32], } +// `compute` is only reachable through `PrivacyModeRegistry` (the std-gated +// audit log); without `std` there is no caller, so gate it to match and avoid +// a dead-code error under `--no-default-features` + `-D warnings`. +#[cfg(feature = "std")] impl PrivacyAttestationProof { fn compute(mode: PrivacyMode, prev_hash: [u8; 32]) -> Self { let action_bits = mode.action_bits(); diff --git a/v2/crates/wifi-densepose-bfld/tests/crate_readme.rs b/v2/crates/wifi-densepose-bfld/tests/crate_readme.rs index fdea4df1..dd23a817 100644 --- a/v2/crates/wifi-densepose-bfld/tests/crate_readme.rs +++ b/v2/crates/wifi-densepose-bfld/tests/crate_readme.rs @@ -50,6 +50,10 @@ fn readme_references_companion_adrs_118_through_123() { fn readme_quickstart_uses_canonical_public_api() { // The quickstart snippets must reference the actual operator-facing // surface — drift here would mislead first-time users. + // Normalize line endings so the multi-line needle below is robust to a + // CRLF checkout (Windows / `core.autocrlf=true`); the README renders + // identically either way on crates.io. + let readme = README.replace("\r\n", "\n"); for needle in [ "BfldPipeline::new", "BfldConfig::new", @@ -62,7 +66,7 @@ fn readme_quickstart_uses_canonical_public_api() { "BfldPipelineHandle::spawn", "PipelineInput", ] { - assert!(README.contains(needle), "quickstart missing canonical API: {needle}"); + assert!(readme.contains(needle), "quickstart missing canonical API: {needle}"); } } diff --git a/v2/crates/wifi-densepose-sensing-server/examples/mqtt_publisher.rs b/v2/crates/wifi-densepose-sensing-server/examples/mqtt_publisher.rs index 1e81c5f9..b1a9ee0d 100644 --- a/v2/crates/wifi-densepose-sensing-server/examples/mqtt_publisher.rs +++ b/v2/crates/wifi-densepose-sensing-server/examples/mqtt_publisher.rs @@ -47,7 +47,7 @@ use tokio::sync::broadcast; #[cfg(feature = "mqtt")] use tracing::info; #[cfg(feature = "mqtt")] -use wifi_densepose_sensing_server::cli::Args; +use wifi_densepose_sensing_server::cli::MqttArgs; #[cfg(feature = "mqtt")] use wifi_densepose_sensing_server::mqtt::{ config::MqttConfig, @@ -61,7 +61,15 @@ use wifi_densepose_sensing_server::mqtt::{ async fn main() -> Result<(), Box> { tracing_subscriber::fmt::init(); - let args = Args::parse(); + let args = { + use clap::Parser; + #[derive(Parser)] + struct W { + #[command(flatten)] + m: MqttArgs, + } + W::parse().m + }; if !args.mqtt { eprintln!("This example requires --mqtt. Aborting."); diff --git a/v2/crates/wifi-densepose-sensing-server/src/cli.rs b/v2/crates/wifi-densepose-sensing-server/src/cli.rs index 0a773626..7ec447bd 100644 --- a/v2/crates/wifi-densepose-sensing-server/src/cli.rs +++ b/v2/crates/wifi-densepose-sensing-server/src/cli.rs @@ -3,6 +3,89 @@ use clap::Parser; use std::path::PathBuf; +/// MQTT publisher (HA auto-discovery) + privacy-mode flags, shared via +/// `#[command(flatten)]` by both `cli::Args` and the binary's `main::Args` +/// so the `--mqtt*` flags reach the actual `Args::parse()` the server uses +/// (the publisher in `mqtt::` is keyed off this group). ADR-115 §3.8/§3.10. +#[derive(clap::Args, Debug, Clone)] +pub struct MqttArgs { + /// Enable MQTT publisher with HA auto-discovery + #[arg(long, env = "RUVIEW_MQTT")] + pub mqtt: bool, + + /// MQTT broker host + #[arg(long, env = "RUVIEW_MQTT_HOST", default_value = "localhost")] + pub mqtt_host: String, + + /// MQTT broker port (defaults: 1883 plain / 8883 with TLS) + #[arg(long, env = "RUVIEW_MQTT_PORT")] + pub mqtt_port: Option, + + /// MQTT username + #[arg(long, env = "RUVIEW_MQTT_USERNAME")] + pub mqtt_username: Option, + + /// Environment variable holding the MQTT password + #[arg(long, default_value = "MQTT_PASSWORD")] + pub mqtt_password_env: String, + + /// MQTT client ID (default: wifi-densepose-) + #[arg(long, env = "RUVIEW_MQTT_CLIENT_ID")] + pub mqtt_client_id: Option, + + /// Discovery topic prefix (ADR-115 §9.2 — accepted: `homeassistant`) + #[arg(long, env = "RUVIEW_MQTT_PREFIX", default_value = "homeassistant")] + pub mqtt_prefix: String, + + /// Enable TLS to the broker + #[arg(long, env = "RUVIEW_MQTT_TLS")] + pub mqtt_tls: bool, + + /// CA bundle for TLS + #[arg(long, value_name = "PATH")] + pub mqtt_ca_file: Option, + + /// Client certificate for mTLS + #[arg(long, value_name = "PATH")] + pub mqtt_client_cert: Option, + + /// Client key for mTLS + #[arg(long, value_name = "PATH")] + pub mqtt_client_key: Option, + + /// Discovery refresh interval (seconds) + #[arg(long, default_value = "600")] + pub mqtt_refresh_secs: u64, + + /// Vitals publish rate (Hz) — HR/BR + #[arg(long, default_value = "0.2")] + pub mqtt_rate_vitals: f64, + + /// Motion publish rate (Hz) + #[arg(long, default_value = "1.0")] + pub mqtt_rate_motion: f64, + + /// Person count publish rate (Hz) + #[arg(long, default_value = "1.0")] + pub mqtt_rate_count: f64, + + /// RSSI publish rate (Hz) + #[arg(long, default_value = "0.1")] + pub mqtt_rate_rssi: f64, + + /// Publish pose keypoints over MQTT (off by default for bandwidth) + #[arg(long)] + pub mqtt_publish_pose: bool, + + /// Pose publish rate (Hz) when --mqtt-publish-pose is set + #[arg(long, default_value = "1.0")] + pub mqtt_rate_pose: f64, + + /// Strip biometrics (HR/BR/pose) before any MQTT/Matter publish (ADR-115 §3.10). + #[arg(long, env = "RUVIEW_PRIVACY_MODE")] + pub privacy_mode: bool, +} + /// CLI arguments for the sensing server. #[derive(Parser, Debug)] #[command(name = "sensing-server", about = "WiFi-DensePose sensing server")] diff --git a/v2/crates/wifi-densepose-sensing-server/src/main.rs b/v2/crates/wifi-densepose-sensing-server/src/main.rs index 03e459ea..7fd8f4ec 100644 --- a/v2/crates/wifi-densepose-sensing-server/src/main.rs +++ b/v2/crates/wifi-densepose-sensing-server/src/main.rs @@ -108,6 +108,13 @@ struct Args { #[arg(long)] disable_host_validation: bool, + /// MQTT publisher (HA auto-discovery) + privacy-mode flags (ADR-115). + /// Flattened so `--mqtt*` reach the binary's parser and the publisher + /// in `mqtt::` is actually started (fixes #872). Uses the *lib* crate's + /// `MqttArgs` type so it's compatible with `mqtt::config::from_args`. + #[command(flatten)] + mqtt_opts: wifi_densepose_sensing_server::cli::MqttArgs, + /// Data source: auto, wifi, esp32, simulate #[arg(long, default_value = "auto")] source: String, @@ -3017,6 +3024,80 @@ fn estimate_persons_from_correlation(frame_history: &VecDeque>) -> usiz } } +/// Map a DynamicMinCut occupancy estimate (`estimate_persons_from_correlation`, +/// 0–3) onto a target score whose steady state round-trips back through +/// `score_to_person_count` to the *same* count (issue #803). +/// +/// The CSI path EMA-smooths this target and re-discretises it via +/// `score_to_person_count`. The previous `corr_persons / 3.0` mapping put a +/// 2-person estimate at 0.667 — just under the 0.70 up-threshold — so the +/// smoothed score could never climb past 1, pinning the per-node count to 1 +/// even when the min-cut cleanly separated two people. These anchors sit +/// inside the hysteresis bands so a *sustained* estimate converges to the +/// matching count while transient noise stays gated by the EMA: +/// 1 → 0.40 (below the 0.55 down-threshold) +/// 2 → 0.74 (between the 0.70 up- and 0.78 down-thresholds → reachable +/// both climbing from 1 and falling from 3) +/// 3 → 0.96 (above the 0.92 up-threshold) +fn corr_persons_to_score(corr_persons: usize) -> f64 { + match corr_persons { + 0 => 0.20, + 1 => 0.40, + 2 => 0.74, + _ => 0.96, + } +} + +#[cfg(test)] +mod corr_persons_round_trip_tests { + //! Issue #803 — a sustained min-cut occupancy estimate must survive the + //! CSI path's EMA + `score_to_person_count` re-discretisation instead of + //! collapsing back to 1. + use super::*; + + /// Replays the CSI-loop smoothing (`score = score*0.92 + target*0.08`) + /// followed by `score_to_person_count`, exactly as the per-node path does, + /// and returns the steady-state reported count. + fn converge(corr_persons: usize) -> usize { + let mut score = 0.0f64; + let mut count = 1usize; + for _ in 0..400 { + let target = corr_persons_to_score(corr_persons); + score = score * 0.92 + target * 0.08; + count = score_to_person_count(score, count); + } + count + } + + #[test] + fn sustained_one_person_estimate_reports_one() { + assert_eq!(converge(1), 1); + } + + #[test] + fn sustained_two_person_estimate_reports_two() { + assert_eq!(converge(2), 2, "#803: min-cut=2 must round-trip to count 2"); + } + + #[test] + fn sustained_three_person_estimate_reports_three() { + assert_eq!(converge(3), 3); + } + + #[test] + fn old_div3_mapping_would_pin_two_people_to_one() { + // Regression-documents the bug: 2/3 = 0.667 never crosses the 0.70 + // up-threshold, so the old mapping reported 1 for two people. + let mut score = 0.0f64; + let mut count = 1usize; + for _ in 0..400 { + score = score * 0.92 + (2.0 / 3.0) * 0.08; + count = score_to_person_count(score, count); + } + assert_eq!(count, 1, "old corr_persons/3.0 mapping was the #803 bug"); + } +} + /// Convert smoothed person score to discrete count with hysteresis. /// /// Uses asymmetric thresholds: higher threshold to *add* a person, lower to @@ -3062,6 +3143,92 @@ fn score_to_person_count(smoothed_score: f64, prev_count: usize) -> usize { } } +/// Combine the activity-score-derived aggregate count with the count-aware +/// per-node estimates (issue #803). +/// +/// The aggregate `s.person_count()` is driven by `smoothed_person_score`, an +/// EMA-smoothed *activity* score (amplitude variance / motion / spectral +/// energy). That score saturates near a single occupant — one moving person +/// can max it out — so it cannot discriminate occupancy *count*, leaving the +/// reported value pinned at 1. Meanwhile the per-node paths already derive a +/// genuinely count-aware estimate (ESP32 firmware `n_persons`, or the +/// DynamicMinCut `corr_persons`) and stash it in `NodeState::prev_person_count` +/// — but that value was being discarded by the aggregator. +/// +/// This takes the larger of the two. It can only ever *raise* the count when a +/// node has positively estimated more occupants, so it never regresses the +/// single-person case (a lone occupant yields `node_max == 1`). +fn aggregate_person_count( + activity_count: usize, + node_states: &std::collections::HashMap, +) -> usize { + let node_max = node_states + .values() + .map(|n| n.prev_person_count) + .max() + .unwrap_or(0); + activity_count.max(node_max) +} + +#[cfg(test)] +mod aggregate_person_count_tests { + //! Issue #803 — the saturating activity score must not clamp a + //! count-aware per-node estimate back down to 1. + use super::*; + use std::collections::HashMap; + + fn node_with_count(c: usize) -> NodeState { + let mut n = NodeState::new(); + n.prev_person_count = c; + n + } + + #[test] + fn empty_nodes_fall_back_to_activity_count() { + let nodes: HashMap = HashMap::new(); + assert_eq!(aggregate_person_count(1, &nodes), 1); + assert_eq!(aggregate_person_count(0, &nodes), 0); + } + + #[test] + fn node_estimate_raises_a_saturated_activity_count() { + // The activity score saturates at 1, but a node positively reports 2. + let mut nodes = HashMap::new(); + nodes.insert(1u8, node_with_count(2)); + assert_eq!( + aggregate_person_count(1, &nodes), + 2, + "a node reporting 2 must not be discarded by the activity count" + ); + } + + #[test] + fn activity_count_wins_when_higher_than_nodes() { + // Never *lower* a confident activity-derived count to a stale node value. + let mut nodes = HashMap::new(); + nodes.insert(1u8, node_with_count(1)); + assert_eq!(aggregate_person_count(3, &nodes), 3); + } + + #[test] + fn takes_max_across_multiple_nodes() { + let mut nodes = HashMap::new(); + nodes.insert(1u8, node_with_count(1)); + nodes.insert(2u8, node_with_count(3)); + nodes.insert(3u8, node_with_count(2)); + assert_eq!(aggregate_person_count(1, &nodes), 3); + } + + #[test] + fn single_occupant_is_never_inflated() { + // Regression guard: a lone occupant (every node sees 1) stays 1. + let mut nodes = HashMap::new(); + nodes.insert(1u8, node_with_count(1)); + nodes.insert(2u8, node_with_count(1)); + assert_eq!(aggregate_person_count(1, &nodes), 1); + } +} + /// Generate a single person's skeleton with per-person spatial offset and phase stagger. /// /// `person_idx`: 0-based index of this person. @@ -4620,11 +4787,17 @@ async fn udp_receiver_task(state: SharedState, udp_port: u16) { ); s.smoothed_person_score = s.smoothed_person_score * 0.90 + score * 0.10; - let count = s.person_count(); + // #803: don't let the saturating activity score + // discard count-aware per-node estimates. + let count = + aggregate_person_count(s.person_count(), &s.node_states); s.prev_person_count = count; count.max(1) // presence=true => at least 1 } - None => fallback_count.unwrap_or(0).max(1), + None => { + aggregate_person_count(fallback_count.unwrap_or(0), &s.node_states) + .max(1) + } } } else { s.prev_person_count = 0; @@ -4942,7 +5115,11 @@ async fn udp_receiver_task(state: SharedState, udp_port: u16) { // DynamicMinCut person estimation from subcarrier correlation. let corr_persons = estimate_persons_from_correlation(&ns.frame_history); - let raw_score = corr_persons as f64 / 3.0; + // #803: map the min-cut count onto a threshold-aligned score + // so it round-trips back to the same count. The old + // `corr_persons / 3.0` left 2 people at 0.667 — under the + // 0.70 up-threshold — so the count was pinned at 1. + let raw_score = corr_persons_to_score(corr_persons); ns.smoothed_person_score = ns.smoothed_person_score * 0.92 + raw_score * 0.08; if classification.presence { let count = @@ -4996,11 +5173,17 @@ async fn udp_receiver_task(state: SharedState, udp_port: u16) { ); s.smoothed_person_score = s.smoothed_person_score * 0.90 + score * 0.10; - let count = s.person_count(); + // #803: don't let the saturating activity score + // discard count-aware per-node estimates. + let count = + aggregate_person_count(s.person_count(), &s.node_states); s.prev_person_count = count; count.max(1) } - None => fallback_count.unwrap_or(0).max(1), + None => { + aggregate_person_count(fallback_count.unwrap_or(0), &s.node_states) + .max(1) + } } } else { s.prev_person_count = 0; @@ -5985,6 +6168,84 @@ async fn main() { // consumed by `/ws/introspection`. Same ring size as `tx` (256) — slow // clients drop oldest, identical backpressure shape. let (intro_tx, _) = broadcast::channel::(256); + + // #872: actually start the MQTT publisher when `--mqtt` is set. The publisher + // (mqtt::) consumes a typed VitalsSnapshot stream; we bridge the existing JSON + // sensing broadcast into it with a defensive serde_json::Value mapping (absent + // fields default — never publish wrong values). Gated on the `mqtt` feature + // (the Docker image is built `--features mqtt`); without it `--mqtt` WARNs and + // no-ops, matching the documented contract. + if args.mqtt_opts.mqtt { + #[cfg(feature = "mqtt")] + { + use wifi_densepose_sensing_server::mqtt; + let mcfg = std::sync::Arc::new(mqtt::config::MqttConfig::from_args(&args.mqtt_opts)); + match mcfg.validate() { + Ok(()) => { + let node_id = mcfg.client_id.clone(); + let builder = mqtt::publisher::OwnedDiscoveryBuilder { + discovery_prefix: mcfg.discovery_prefix.clone(), + node_id: node_id.clone(), + node_friendly_name: Some("RuView".to_string()), + sw_version: env!("CARGO_PKG_VERSION").to_string(), + model: "RuView WiFi Sensing".to_string(), + via_device: None, + }; + let (vtx, vrx) = broadcast::channel::(64); + let (host, port) = (mcfg.host.clone(), mcfg.port); + mqtt::publisher::spawn(mcfg, builder, vrx); + let mut jrx = tx.subscribe(); + tokio::spawn(async move { + while let Ok(json) = jrx.recv().await { + let Ok(v) = serde_json::from_str::(&json) else { + continue; + }; + let cls = &v["classification"]; + let vit = &v["vital_signs"]; + let presence = cls["presence"].as_bool().unwrap_or(false); + let n_persons = v["persons"] + .as_array() + .map(|a| a.len() as u32) + .or_else(|| v["estimated_persons"].as_u64().map(|x| x as u32)) + .unwrap_or(0); + let motion = match cls["motion_level"].as_str() { + Some("none") | Some("still") | Some("idle") | Some("") => 0.0, + Some(_) => 1.0, + None => 0.0, + }; + let snap = mqtt::state::VitalsSnapshot { + node_id: node_id.clone(), + timestamp_ms: (v["timestamp"].as_f64().unwrap_or(0.0) * 1000.0) as i64, + presence, + motion, + presence_score: if presence { + cls["confidence"].as_f64().unwrap_or(1.0) + } else { + 0.0 + }, + breathing_rate_bpm: vit["breathing_rate_bpm"].as_f64(), + heartrate_bpm: vit["heart_rate_bpm"].as_f64(), + n_persons, + rssi_dbm: v["nodes"][0]["rssi_dbm"].as_f64(), + vital_confidence: cls["confidence"].as_f64().unwrap_or(0.0), + ..Default::default() + }; + let _ = vtx.send(snap); + } + }); + tracing::info!("MQTT publisher started -> {host}:{port}"); + } + Err(e) => tracing::error!("MQTT config invalid: {e}; publisher not started"), + } + } + #[cfg(not(feature = "mqtt"))] + tracing::warn!( + "--mqtt set but this binary was built without the `mqtt` feature; the publisher is a \ + no-op. Use the official Docker image (built `--features mqtt`) or rebuild with \ + `cargo build -p wifi-densepose-sensing-server --features mqtt`." + ); + } + let state: SharedState = Arc::new(RwLock::new(AppStateInner { latest_update: None, rssi_history: VecDeque::new(), diff --git a/v2/crates/wifi-densepose-sensing-server/src/mqtt/config.rs b/v2/crates/wifi-densepose-sensing-server/src/mqtt/config.rs index 6d0cca53..a430125b 100644 --- a/v2/crates/wifi-densepose-sensing-server/src/mqtt/config.rs +++ b/v2/crates/wifi-densepose-sensing-server/src/mqtt/config.rs @@ -63,7 +63,7 @@ impl MqttConfig { /// `hostname()` via the `gethostname` crate if `mqtt_client_id` was /// not supplied — we don't add a dep here, we let the publisher /// supply the default lazily. - pub fn from_args(args: &crate::cli::Args) -> Self { + pub fn from_args(args: &crate::cli::MqttArgs) -> Self { let password = std::env::var(&args.mqtt_password_env).ok(); let port = args.mqtt_port.unwrap_or(if args.mqtt_tls { 8883 } else { 1883 }); let tls = build_tls(args); @@ -135,7 +135,7 @@ impl MqttConfig { } } -fn build_tls(args: &crate::cli::Args) -> TlsConfig { +fn build_tls(args: &crate::cli::MqttArgs) -> TlsConfig { if !args.mqtt_tls { return TlsConfig::Off; } @@ -186,8 +186,14 @@ mod tests { use super::*; use clap::Parser; - fn parse(args: &[&str]) -> crate::cli::Args { - crate::cli::Args::parse_from(std::iter::once("sensing-server").chain(args.iter().copied())) + fn parse(args: &[&str]) -> crate::cli::MqttArgs { + use clap::Parser; + #[derive(Parser)] + struct W { + #[command(flatten)] + m: crate::cli::MqttArgs, + } + W::parse_from(std::iter::once("sensing-server").chain(args.iter().copied())).m } #[test] diff --git a/v2/crates/wifi-densepose-signal/src/ruvsense/cir.rs b/v2/crates/wifi-densepose-signal/src/ruvsense/cir.rs index 3eca4a8f..79a3dc86 100644 --- a/v2/crates/wifi-densepose-signal/src/ruvsense/cir.rs +++ b/v2/crates/wifi-densepose-signal/src/ruvsense/cir.rs @@ -169,7 +169,9 @@ impl CirConfig { num_taps: 156, delay_bins: 156, pilot_indices: HT20_PILOTS, - lambda: 0.05, + // ADR-134 P2: tuned for sparse multipath — stronger L1 concentrates + // energy on physical taps (with the windowed dominant ratio in `estimate`). + lambda: 0.08, max_iters: 100, tolerance: 1e-4, ranging_min_bw_hz: 40e6, @@ -186,7 +188,7 @@ impl CirConfig { num_taps: 342, delay_bins: 342, pilot_indices: HT40_PILOTS, - lambda: 0.03, + lambda: 0.08, // ADR-134 P2 tuned (see ht20) max_iters: 100, tolerance: 1e-4, ranging_min_bw_hz: 40e6, @@ -203,7 +205,9 @@ impl CirConfig { num_taps: 726, delay_bins: 726, pilot_indices: HE20_PILOTS, - lambda: 0.03, + // HE20 has the finest delay resolution (more leakage bins) -> needs + // stronger L1 to reach the dominant-ratio floor. ADR-134 P2. + lambda: 0.18, max_iters: 100, tolerance: 1e-4, ranging_min_bw_hz: 40e6, @@ -420,8 +424,15 @@ impl CirEstimator { .map(|(i, _)| i) .unwrap_or(0); + // Dominant-tap energy fraction. On the 3× super-resolved grid a single + // physical tap leaks across ~3 adjacent bins, so the dominant *physical* + // tap is the magnitude summed over a ±1-bin window around the peak — using + // a single bin under-counts its energy and crushes the ratio (ADR-134 P2). let dominant_tap_ratio = if tap_sum > 1e-12 { - x[dominant_tap_idx].norm() / tap_sum + let lo = dominant_tap_idx.saturating_sub(1); + let hi = (dominant_tap_idx + 1).min(x.len() - 1); + let dom_window: f32 = x[lo..=hi].iter().map(|c| c.norm()).sum(); + dom_window / tap_sum } else { 0.0 }; @@ -441,7 +452,11 @@ impl CirEstimator { let active_tap_count = x.iter().filter(|c| c.norm() >= cutoff).count(); // RMS delay spread: √(Σ τ²P(τ)/ΣP(τ) − τ̄²), with P(τ) = |tap|². - let power: Vec = x.iter().map(|c| (c.norm() as f64).powi(2)).collect(); + // Only causal delays [0, G/2) contribute: the ISTA delay grid is circular + // (Φ is DFT-like), so bins ≥ G/2 are aliased *negative* (non-causal) delays — + // an alias of the near-zero dominant tap otherwise inflates the spread (ADR-134 P2). + let causal_bins = x.len() / 2; + let power: Vec = x[..causal_bins].iter().map(|c| (c.norm() as f64).powi(2)).collect(); let p_sum: f64 = power.iter().sum(); let rms_delay_spread_s = if p_sum > 1e-24 { let mean_tau: f64 = power diff --git a/v2/crates/wifi-densepose-signal/tests/cir_pipeline.rs b/v2/crates/wifi-densepose-signal/tests/cir_pipeline.rs index d8a017b9..a6f18f0c 100644 --- a/v2/crates/wifi-densepose-signal/tests/cir_pipeline.rs +++ b/v2/crates/wifi-densepose-signal/tests/cir_pipeline.rs @@ -260,7 +260,6 @@ fn should_detect_unsanitized_phase_when_variance_exceeds_threshold() { /// Verifies the full pipeline: generate CSI → sanitize → estimate → dominant tap /// is at or near the expected delay bin. This is the success-path integration test. #[test] -#[ignore = "ADR-134 P2: end-to-end dominant_tap_ratio gated on ISTA hyperparameter tuning."] fn should_produce_clean_estimate_after_correct_pipeline_order() { let cfg = CirConfig::for_bandwidth_mhz(20); let k_active = cfg.delay_bins / 3; diff --git a/v2/crates/wifi-densepose-signal/tests/cir_synthetic.rs b/v2/crates/wifi-densepose-signal/tests/cir_synthetic.rs index ae5acd9c..a4ce8d1b 100644 --- a/v2/crates/wifi-densepose-signal/tests/cir_synthetic.rs +++ b/v2/crates/wifi-densepose-signal/tests/cir_synthetic.rs @@ -154,6 +154,8 @@ fn save_fixture(path: &str, k_active: usize, csi: &[Complex64], expected_dominan } // --------------------------------------------------------------------------- + + // Shared test logic: inject 3-tap channel, run estimator, assert // --------------------------------------------------------------------------- @@ -253,7 +255,6 @@ fn run_3tap_test(label: &str, cfg: CirConfig, bandwidth_mhz: u16, dominant_ratio // --------------------------------------------------------------------------- #[test] -#[ignore = "ADR-134 P2: ISTA hyperparameter tuning needed for 3-tap@SNR=20dB. dominant_tap_ratio currently below floor."] fn should_recover_3tap_channel_ht20() { // HT20: K_active=52, G=168 (3×), lambda=0.05, max_iter=30 // ADR-134 Table §2.3: dominant_tap_ratio floor = 0.30 for HT20 @@ -266,7 +267,6 @@ fn should_recover_3tap_channel_ht20() { } #[test] -#[ignore = "ADR-134 P2: ISTA hyperparameter tuning needed for 3-tap@SNR=20dB. dominant_tap_ratio currently below floor."] fn should_recover_3tap_channel_ht40() { // HT40: K_active=108, G=342 (3×), lambda=0.03, max_iter=35 let cfg = CirConfig::for_bandwidth_mhz(40); @@ -278,7 +278,6 @@ fn should_recover_3tap_channel_ht40() { } #[test] -#[ignore = "ADR-134 P2: ISTA hyperparameter tuning needed for 3-tap@SNR=20dB. dominant_tap_ratio currently below floor."] fn should_recover_3tap_channel_he20() { // HE20: K_active=242, G=726 (3×), lambda=0.03, max_iter=32 // ADR-134: better conditioning → higher dominant_tap_ratio floor @@ -317,7 +316,6 @@ fn should_return_none_for_dominant_tof_at_20mhz() { } #[test] -#[ignore = "ADR-134 P2: ranging_valid gated on dominant_tap_ratio >= 0.3 which requires further ISTA tuning."] fn should_return_tof_at_40mhz() { // Ranging is enabled at 40 MHz (Tier B) per ADR-134 §2.3 let cfg = CirConfig::for_bandwidth_mhz(40); @@ -344,7 +342,6 @@ fn should_return_tof_at_40mhz() { // --------------------------------------------------------------------------- #[test] -#[ignore = "ADR-134 P2: RMS delay spread sensitive to ISTA convergence quality; gated on tuning pass."] fn should_produce_positive_rms_delay_spread() { let cfg = CirConfig::for_bandwidth_mhz(20); let k_active = cfg.delay_bins / 3; diff --git a/v2/crates/wifi-densepose-train/Cargo.toml b/v2/crates/wifi-densepose-train/Cargo.toml index 43f1c584..6c677add 100644 --- a/v2/crates/wifi-densepose-train/Cargo.toml +++ b/v2/crates/wifi-densepose-train/Cargo.toml @@ -20,6 +20,13 @@ name = "verify-training" path = "src/bin/verify_training.rs" required-features = ["tch-backend"] +# AetherArena (ADR-149) deterministic score runner — the CI harness-gate entry +# point. Pure ruview_metrics (ndarray + sha2), no torch, so it builds and runs +# under --no-default-features for a fast, GPU-free PR gate. +[[bin]] +name = "aa_score_runner" +path = "src/bin/aa_score_runner.rs" + [features] default = [] tch-backend = ["tch"] diff --git a/v2/crates/wifi-densepose-train/src/bin/aa_score_runner.rs b/v2/crates/wifi-densepose-train/src/bin/aa_score_runner.rs new file mode 100644 index 00000000..30b6893b --- /dev/null +++ b/v2/crates/wifi-densepose-train/src/bin/aa_score_runner.rs @@ -0,0 +1,307 @@ +//! AetherArena ("AA") Score Runner + Witness Chain (ADR-149). +//! +//! Benchmark-first scorer for the official Spatial-Intelligence Benchmark. It runs +//! the **real** `wifi-densepose-train::ruview_metrics` pose-acceptance harness and +//! emits a **witness record** for proof + repeatability analysis: +//! +//! witness = { inputs_sha256, harness_version, metrics, tier, proof_sha256 } +//! +//! The `proof_sha256` is a cross-platform-stable hash of the quantised score; the +//! `inputs_sha256` binds the witness to the exact inputs it scored. Together with +//! the append-only hash-chained ledger (`aether-arena/ledger`), every published +//! rank traces back to a reproducible witness — the witness chain. +//! +//! Modes: +//! # 1. Determinism self-test on the committed fixture (CI gate default): +//! cargo run -p wifi-densepose-train --bin aa_score_runner --no-default-features +//! +//! # 2. Repeatability analysis — run K times, confirm identical proof hash: +//! cargo run ... --bin aa_score_runner --no-default-features -- --repeat 8 +//! +//! # 3. Real model scoring — score predictions against an eval split: +//! cargo run ... --bin aa_score_runner --no-default-features -- \ +//! --split eval.json --pred predictions.json --json +//! +//! # 4. Regenerate the fixture's expected hash (after an intentional change): +//! cargo run ... --bin aa_score_runner --no-default-features -- --generate-hash \ +//! > ../aether-arena/fixtures/expected_score.sha256 +//! +//! Input JSON (split = private ground truth; pred = the submitted model's output): +//! split.json : {"frames":[{"gt":[[x,y]*17],"vis":[v*17],"scale":1.0}, ...]} +//! pred.json : {"frames":[{"pred":[[x,y]*17]}, ...]} (index-aligned with split) +//! +//! Determinism discipline (lesson from calibration_proof_runner.rs): PCK/OKS use +//! libm `sqrt` which differs ~1e-7 across glibc/MSVC/Apple — so we hash only the +//! quantised metrics (1e-3 / 1e-4), never raw f32. No sort, no truncation. + +use std::env; +use std::process::ExitCode; + +use ndarray::{Array1, Array2}; +use serde::Deserialize; +use sha2::{Digest, Sha256}; +use wifi_densepose_train::ruview_metrics::{ + evaluate_joint_error, JointErrorResult, JointErrorThresholds, +}; + +/// Bump on a purposeful fixture/canonical-form change. Pinned into every witness +/// so a `harness_version` change forces a re-score (ADR-149 §2.4). +const AA_HARNESS_VERSION: u32 = 2; + +const N_FRAMES: usize = 120; +const N_KPTS: usize = 17; + +// ── input schema ──────────────────────────────────────────────────────────── +#[derive(Deserialize)] +struct SplitFile { + frames: Vec, +} +#[derive(Deserialize)] +struct SplitFrame { + gt: Vec<[f32; 2]>, + vis: Vec, + #[serde(default = "one")] + scale: f32, +} +#[derive(Deserialize)] +struct PredFile { + frames: Vec, +} +#[derive(Deserialize)] +struct PredFrame { + pred: Vec<[f32; 2]>, +} +fn one() -> f32 { + 1.0 +} + +// ── deterministic fixture (libm-free LCG) ───────────────────────────────────── +struct Lcg(u64); +impl Lcg { + fn next_u32(&mut self) -> u32 { + self.0 = self.0.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407); + (self.0 >> 32) as u32 + } + fn unit(&mut self) -> f32 { + (self.next_u32() % 1_000_000) as f32 / 1_000_000.0 + } +} + +fn build_fixture() -> (Vec>, Vec>, Vec>, Vec) { + let mut rng = Lcg(42); + let (mut pred, mut gt, mut vis, mut scale) = (vec![], vec![], vec![], vec![]); + for _ in 0..N_FRAMES { + let mut g = Array2::::zeros((N_KPTS, 2)); + let mut p = Array2::::zeros((N_KPTS, 2)); + let mut v = Array1::::ones(N_KPTS); + for k in 0..N_KPTS { + let gx = 0.2 + 0.6 * rng.unit(); + let gy = 0.2 + 0.6 * rng.unit(); + let ox = (rng.unit() - 0.5) * 0.06; + let oy = (rng.unit() - 0.5) * 0.06; + g[[k, 0]] = gx; + g[[k, 1]] = gy; + p[[k, 0]] = (gx + ox).clamp(0.0, 1.0); + p[[k, 1]] = (gy + oy).clamp(0.0, 1.0); + if rng.next_u32() % 10 == 0 { + v[k] = 0.0; + } + } + gt.push(g); + pred.push(p); + vis.push(v); + scale.push(1.0); + } + (pred, gt, vis, scale) +} + +/// Load (pred, gt, vis, scale) from index-aligned split + prediction files. +fn load_inputs( + split_path: &str, + pred_path: &str, +) -> Result<(Vec>, Vec>, Vec>, Vec), String> { + let split: SplitFile = serde_json::from_str( + &std::fs::read_to_string(split_path).map_err(|e| format!("read split: {e}"))?, + ) + .map_err(|e| format!("parse split: {e}"))?; + let pred: PredFile = serde_json::from_str( + &std::fs::read_to_string(pred_path).map_err(|e| format!("read pred: {e}"))?, + ) + .map_err(|e| format!("parse pred: {e}"))?; + if split.frames.len() != pred.frames.len() { + return Err(format!( + "frame count mismatch: split={} pred={}", + split.frames.len(), + pred.frames.len() + )); + } + let (mut gt, mut pr, mut vis, mut scale) = (vec![], vec![], vec![], vec![]); + for (i, (s, p)) in split.frames.iter().zip(pred.frames.iter()).enumerate() { + let to_arr = |kps: &[[f32; 2]]| -> Result, String> { + if kps.len() != N_KPTS { + return Err(format!("frame {i}: expected {N_KPTS} keypoints, got {}", kps.len())); + } + let mut a = Array2::::zeros((N_KPTS, 2)); + for (k, xy) in kps.iter().enumerate() { + a[[k, 0]] = xy[0]; + a[[k, 1]] = xy[1]; + } + Ok(a) + }; + gt.push(to_arr(&s.gt)?); + pr.push(to_arr(&p.pred)?); + vis.push(Array1::from(s.vis.clone())); + scale.push(s.scale); + } + Ok((pr, gt, vis, scale)) +} + +/// Canonical, libm-stable byte form of the score for the proof hash. +fn canonical_bytes(r: &JointErrorResult) -> Vec { + let mut b = Vec::new(); + b.extend_from_slice(b"AA-SCORE-v0"); + b.extend_from_slice(&AA_HARNESS_VERSION.to_le_bytes()); + let q = |x: f32, s: f32| -> u32 { (x.max(0.0) * s).round() as u32 }; + b.extend_from_slice(&q(r.pck_all, 1e3).to_le_bytes()); + b.extend_from_slice(&q(r.pck_torso, 1e3).to_le_bytes()); + b.extend_from_slice(&q(r.oks, 1e3).to_le_bytes()); + b.extend_from_slice(&q(r.jitter_rms_m, 1e4).to_le_bytes()); + b.extend_from_slice(&q(r.max_error_p95_m, 1e4).to_le_bytes()); + b.push(r.passes as u8); + b +} + +fn sha256_hex(bytes: &[u8]) -> String { + let mut h = Sha256::new(); + h.update(bytes); + h.finalize().iter().map(|x| format!("{x:02x}")).collect() +} + +/// Bind the witness to its exact inputs: hash the quantised gt+pred+vis bytes. +fn inputs_hash( + pred: &[Array2], + gt: &[Array2], + vis: &[Array1], +) -> String { + let mut h = Sha256::new(); + h.update(b"AA-INPUTS-v0"); + h.update((pred.len() as u32).to_le_bytes()); + let q = |x: f32| -> i32 { (x * 1e4).round() as i32 }; + for f in 0..gt.len() { + for k in 0..N_KPTS { + h.update(q(gt[f][[k, 0]]).to_le_bytes()); + h.update(q(gt[f][[k, 1]]).to_le_bytes()); + h.update(q(pred[f][[k, 0]]).to_le_bytes()); + h.update(q(pred[f][[k, 1]]).to_le_bytes()); + h.update([(vis[f][k] >= 0.5) as u8]); + } + } + h.finalize().iter().map(|x| format!("{x:02x}")).collect() +} + +struct Witness { + inputs_sha256: String, + proof_sha256: String, + result: JointErrorResult, +} + +fn score( + pred: &[Array2], + gt: &[Array2], + vis: &[Array1], + scale: &[f32], +) -> Witness { + let result = evaluate_joint_error(pred, gt, vis, scale, &JointErrorThresholds::default()); + Witness { + inputs_sha256: inputs_hash(pred, gt, vis), + proof_sha256: sha256_hex(&canonical_bytes(&result)), + result, + } +} + +fn witness_json(w: &Witness) -> String { + format!( + "{{\"category\":\"pose\",\"harness_version\":{},\"inputs_sha256\":\"{}\",\"proof_sha256\":\"{}\",\"pck_all\":{:.4},\"pck_torso\":{:.4},\"oks\":{:.4},\"jitter_rms_m\":{:.5},\"max_error_p95_m\":{:.5},\"pose_passes\":{}}}", + AA_HARNESS_VERSION, w.inputs_sha256, w.proof_sha256, + w.result.pck_all, w.result.pck_torso, w.result.oks, + w.result.jitter_rms_m, w.result.max_error_p95_m, w.result.passes + ) +} + +fn arg_val<'a>(args: &'a [String], key: &str) -> Option<&'a str> { + args.iter().position(|a| a == key).and_then(|i| args.get(i + 1)).map(|s| s.as_str()) +} + +fn main() -> ExitCode { + let args: Vec = env::args().collect(); + let mode_json = args.iter().any(|a| a == "--json"); + let mode_gen = args.iter().any(|a| a == "--generate-hash"); + let repeat: usize = arg_val(&args, "--repeat").and_then(|v| v.parse().ok()).unwrap_or(0); + + // Inputs: real split+pred if provided, else the deterministic fixture. + let (pred, gt, vis, scale) = match (arg_val(&args, "--split"), arg_val(&args, "--pred")) { + (Some(s), Some(p)) => match load_inputs(s, p) { + Ok(v) => v, + Err(e) => { + eprintln!("input error: {e}"); + return ExitCode::FAILURE; + } + }, + _ => build_fixture(), + }; + + let w = score(&pred, >, &vis, &scale); + + // ── Repeatability analysis: run K times, confirm an identical proof hash ── + if repeat > 0 { + let mut hashes = std::collections::BTreeSet::new(); + for _ in 0..repeat { + let wi = score(&pred, >, &vis, &scale); + hashes.insert(wi.proof_sha256); + } + let repeatable = hashes.len() == 1; + println!( + "{{\"repeatability\":{{\"runs\":{},\"unique_proof_hashes\":{},\"repeatable\":{},\"proof_sha256\":\"{}\"}}}}", + repeat, hashes.len(), repeatable, w.proof_sha256 + ); + return if repeatable { ExitCode::SUCCESS } else { + eprintln!("REPEATABILITY FAIL: {} distinct hashes across {} runs (nondeterminism)", hashes.len(), repeat); + ExitCode::FAILURE + }; + } + + if mode_gen { + println!("{}", w.proof_sha256); + return ExitCode::SUCCESS; + } + if mode_json { + println!("{}", witness_json(&w)); + return ExitCode::SUCCESS; + } + + // Default: determinism gate against the committed expected hash (CI). + println!( + "AA pose witness: PCK_all={:.4} PCK_torso={:.4} OKS={:.4} jitter={:.5}m p95={:.5}m passes={}", + w.result.pck_all, w.result.pck_torso, w.result.oks, + w.result.jitter_rms_m, w.result.max_error_p95_m, w.result.passes + ); + println!("AA inputs_sha256: {}", w.inputs_sha256); + println!("AA proof_sha256: {}", w.proof_sha256); + + let expected_path = concat!(env!("CARGO_MANIFEST_DIR"), "/../../../aether-arena/fixtures/expected_score.sha256"); + match std::fs::read_to_string(expected_path).ok().map(|s| s.trim().to_string()) { + Some(exp) if exp == w.proof_sha256 => { + println!("VERDICT: PASS (determinism hash matches expected)"); + ExitCode::SUCCESS + } + Some(exp) => { + eprintln!("VERDICT: FAIL — scorer drift.\n expected: {exp}\n actual: {}", w.proof_sha256); + eprintln!("If intentional, regenerate with --generate-hash and review the diff."); + ExitCode::FAILURE + } + None => { + eprintln!("VERDICT: NO-EXPECTED-HASH — {expected_path} missing. Generate with --generate-hash."); + ExitCode::FAILURE + } + } +} diff --git a/v2/crates/wifi-densepose-worldmodel/src/bridge.rs b/v2/crates/wifi-densepose-worldmodel/src/bridge.rs index dc8075b7..d9e84e40 100644 --- a/v2/crates/wifi-densepose-worldmodel/src/bridge.rs +++ b/v2/crates/wifi-densepose-worldmodel/src/bridge.rs @@ -13,7 +13,9 @@ use std::path::PathBuf; use std::time::Duration; +#[cfg(unix)] use tokio::io::{AsyncBufReadExt, AsyncWriteExt, BufReader}; +#[cfg(unix)] use tokio::net::UnixStream; use tokio::time::timeout; @@ -27,7 +29,8 @@ const TIMEOUT_S: u64 = 30; /// /// 200×200×16 future frames × 15 steps × ~1 byte/voxel = ~9.6 MB in the /// worst case; set a generous 64 MB ceiling to stay safe without allocating -/// it up front. +/// it up front. (Only used by the unix socket reader.) +#[cfg(unix)] const MAX_RESPONSE_BYTES: usize = 64 * 1024 * 1024; /// Thin async client for the OccWorld Unix-socket inference server. @@ -65,8 +68,23 @@ impl OccWorldBridge { .map_err(|_| WorldModelError::Timeout { timeout_s: TIMEOUT_S })? } + /// Non-unix platforms have no Unix-domain sockets. The OccWorld bridge is a + /// Linux-appliance feature (the Python inference server runs on the GPU host), + /// so on Windows/other targets the crate still compiles but `predict` fails + /// fast with a clear error instead of silently degrading. + #[cfg(not(unix))] + async fn send_recv( + &self, + _request: OccupancyWorldModelRequest, + ) -> Result { + Err(WorldModelError::Protocol( + "OccWorld Unix-socket bridge is only supported on unix targets".into(), + )) + } + /// Internal: connect, write request, read response — no timeout here; /// the outer [`timeout`] in [`predict`] handles that. + #[cfg(unix)] async fn send_recv( &self, request: OccupancyWorldModelRequest, @@ -129,6 +147,7 @@ impl OccWorldBridge { } /// Establishes a [`UnixStream`] connection to `self.socket_path`. + #[cfg(unix)] async fn connect(&self) -> Result { UnixStream::connect(&self.socket_path) .await @@ -161,6 +180,8 @@ mod tests { } /// Verify that a missing socket returns `SocketConnect` and not a panic. + /// Unix-only: non-unix targets return a `Protocol` "unsupported" error instead. + #[cfg(unix)] #[tokio::test] async fn connect_to_missing_socket_returns_error() { let bridge = OccWorldBridge::new("/tmp/__occworld_nonexistent_test__.sock");