mirror of
https://github.com/ruvnet/RuView.git
synced 2026-06-02 00:58:56 +02:00
feat(aether-arena): ADR-149 spatial-intelligence benchmark — scorer + CI harness gate (M1-M4)
AetherArena ("AA") — the official, project-agnostic Spatial-Intelligence Benchmark
(ADR-149, Accepted). Iteration 1 of the long-horizon build:
- ADR-149 accepted: name locked (ruvnet/aether-arena), v0 metrics locked
(pose/presence/latency/determinism), dataset legality resolved (MM-Fi CC BY-NC
only; Wi-Pose excluded). Adds four-part framing, threat model, arena_score
formula, submission state machine, neutrality/governance, and the §7 acceptance test.
- aa_score_runner: deterministic scorer bin reusing the real ruview_metrics pose
harness on a fixed seed=42 fixture → RuViewTier-style verdict + cross-platform
SHA-256 proof hash. Builds --no-default-features (no torch/GPU). VERDICT: PASS.
- CI harness gate: .github/workflows/aether-arena-harness.yml runs the scorer on
every PR — the "PR that runs the harness as part of the build" requirement.
- Scaffold: aether-arena/{README,VERIFY,STATUS}.md + schema/aa-submission.toml.
- Horizon record persisted (.claude-flow/horizons/aether-arena-aa.json).
Infra = the deliverable; model SOTA (MM-Fi PCK@20) is a separate effort blocked on
ADR-079 data collection, tracked as a stretch goal, not an infra exit.
Co-Authored-By: claude-flow <ruv@ruv.net>
This commit is contained in:
@@ -0,0 +1,119 @@
|
|||||||
|
{
|
||||||
|
"id": "aether-arena-aa",
|
||||||
|
"name": "AetherArena (AA) — Official Spatial-Intelligence Benchmark",
|
||||||
|
"adr": "ADR-149",
|
||||||
|
"adrPath": "docs/adr/ADR-149-public-community-leaderboard-huggingface.md",
|
||||||
|
"status": "Accepted",
|
||||||
|
"initializedDate": "2026-05-30",
|
||||||
|
"targetDate": "2026-08-31",
|
||||||
|
"exitCriteria": "Benchmark INFRASTRUCTURE done, tested, CI-gated, deploy-ready: aa_score_runner.rs passes deterministic fixture test; CI harness-gate green on every PR; aether-arena repo scaffold committed (README four-part framing + aa-submission.toml schema + VERIFY.md); public smoke split committed; HF Space lifecycle skeleton deployed; signed Parquet ledger functional; RuView baseline PCK@20 ~2.5% entered; ADR-149 §7 acceptance test (five-step stranger test) passes. NOTE: ML SOTA (MM-Fi PCK@20 ~72%) is a separate long-running stretch goal blocked on ADR-079 camera-ground-truth — it is NOT an infra exit criterion.",
|
||||||
|
"baselineState": {
|
||||||
|
"adrStatus": "Accepted, committed 2026-05-30",
|
||||||
|
"scorerCode": "ruview_metrics.rs + ablation.rs + proof.rs exist in wifi-densepose-train; aa_score_runner.rs not yet created",
|
||||||
|
"aetherArenaRepo": "does not exist yet — needs user authorization to create ruvnet/aether-arena public repo",
|
||||||
|
"hfSpace": "does not exist yet — needs HF_TOKEN and user authorization to deploy ruvnet/aether-arena HF Space",
|
||||||
|
"smokeDataset": "not committed",
|
||||||
|
"resultsLedger": "not created",
|
||||||
|
"ruviewBaseline": "PCK@20 ~2.5% self-reported, not formally entered",
|
||||||
|
"ciGate": "not added to workflow"
|
||||||
|
},
|
||||||
|
"milestones": {
|
||||||
|
"m1": {
|
||||||
|
"name": "ADR-149 Accepted + committed",
|
||||||
|
"status": "DONE",
|
||||||
|
"completedDate": "2026-05-30",
|
||||||
|
"completionCriteria": "ADR-149 file committed to docs/adr/ with status Accepted",
|
||||||
|
"notes": "Done this session. File at docs/adr/ADR-149-public-community-leaderboard-huggingface.md"
|
||||||
|
},
|
||||||
|
"m2": {
|
||||||
|
"name": "Deterministic scorer runner bin (aa_score_runner.rs)",
|
||||||
|
"status": "NOT_STARTED",
|
||||||
|
"completionCriteria": "aa_score_runner.rs compiles, runs ruview_metrics on a committed fixture, emits RuViewTier + SHA-256 proof hash, mirrors existing *_proof_runner.rs pattern; cargo test passes",
|
||||||
|
"estimatedEffort": "3-5 days",
|
||||||
|
"owner": "wifi-densepose-train crate or new aa-scorer crate"
|
||||||
|
},
|
||||||
|
"m3": {
|
||||||
|
"name": "CI harness-gate: GitHub Actions workflow",
|
||||||
|
"status": "NOT_STARTED",
|
||||||
|
"completionCriteria": "A GitHub Actions workflow runs aa_score_runner on every PR as a build gate; PR fails if scorer fails determinism check; workflow committed and green",
|
||||||
|
"estimatedEffort": "2-3 days",
|
||||||
|
"dependency": "M2 must be done first"
|
||||||
|
},
|
||||||
|
"m4": {
|
||||||
|
"name": "aether-arena repo scaffold",
|
||||||
|
"status": "NOT_STARTED",
|
||||||
|
"completionCriteria": "ruvnet/aether-arena repo created with: README (four-part framing: Public leaderboard / Private eval split / Open scorer / Signed results); aa-submission.toml manifest schema; VERIFY.md (ADR-149 §7 stranger acceptance test); neutrality/governance section (§2.8); contribution guide",
|
||||||
|
"estimatedEffort": "3-5 days",
|
||||||
|
"blockers": ["Needs user authorization to create public ruvnet/aether-arena repo on GitHub"]
|
||||||
|
},
|
||||||
|
"m5": {
|
||||||
|
"name": "Public smoke split committed + private MM-Fi held-out split prep",
|
||||||
|
"status": "NOT_STARTED",
|
||||||
|
"completionCriteria": "Public smoke split committed to aether-arena repo (stranger can score locally); private MM-Fi held-out split prepared under non-public path with CC BY-NC 4.0 attribution; Wi-Pose explicitly excluded from v0",
|
||||||
|
"estimatedEffort": "5-7 days",
|
||||||
|
"riskNotes": "MM-Fi CC BY-NC 4.0: AA must remain non-commercial and carry MM-Fi attribution; raw frames stay in private split; only derived CSI features + scores may be exposed"
|
||||||
|
},
|
||||||
|
"m6": {
|
||||||
|
"name": "HF Space (Gradio) skeleton",
|
||||||
|
"status": "BLOCKED",
|
||||||
|
"completionCriteria": "HF Space deployed at ruvnet/aether-arena with submission lifecycle (submitted->validated->quarantined->smoke_scored->full_scored->published/rejected); sandboxed scorer container wired; basic leaderboard table rendered",
|
||||||
|
"estimatedEffort": "7-10 days",
|
||||||
|
"blockers": [
|
||||||
|
"Needs HF_TOKEN — check .env for HF_TOKEN or HUGGINGFACE_TOKEN",
|
||||||
|
"Needs user authorization to create/deploy ruvnet/aether-arena HF Space (outward-facing public deployment)"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"m7": {
|
||||||
|
"name": "Signed append-only Parquet results ledger",
|
||||||
|
"status": "NOT_STARTED",
|
||||||
|
"completionCriteria": "HF dataset ruvnet/aether-arena-results created; append-only Parquet ledger with signed rows; determinism_gate enforced; no row can be silently edited",
|
||||||
|
"estimatedEffort": "3-5 days",
|
||||||
|
"ledgerSchema": "submitter, model_ref, category, feature_set, tier, pck20, oks, mota, vitals_bpm_err, latency_p50, latency_p95, privacy_leakage, cross_room_deg, proof_sha256, scored_at, harness_version",
|
||||||
|
"dependency": "M6 must be scaffolded first"
|
||||||
|
},
|
||||||
|
"m8": {
|
||||||
|
"name": "RuView baseline entry + public launch",
|
||||||
|
"status": "NOT_STARTED",
|
||||||
|
"completionCriteria": "RuView wifi-densepose-pretrained baseline entered (honest PCK@20 ~2.5%); ADR-149 §7 five-step stranger acceptance test passes; v0 live with Presence + Pose + Edge-latency + Determinism categories active; Privacy and Cross-room shown as gated/coming-soon",
|
||||||
|
"estimatedEffort": "3-5 days",
|
||||||
|
"dependency": "M4+M5+M6+M7 complete",
|
||||||
|
"notes": "ML SOTA improvement (PCK@20 ~72%) is a SEPARATE stretch goal blocked on ADR-079 P7-P9 camera ground truth. NOT a blocker for infra launch."
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"activeMilestone": "m2",
|
||||||
|
"completedMilestones": ["m1"],
|
||||||
|
"knownRisks": [
|
||||||
|
"HF_TOKEN not confirmed present in .env — check before M6 work begins",
|
||||||
|
"ruvnet/aether-arena public repo creation is outward-facing — needs explicit user authorization",
|
||||||
|
"MM-Fi CC BY-NC 4.0: AA must stay legally non-commercial and brand-distinct from commercial RuView product; or seek MM-Fi commercial grant before any paid tier",
|
||||||
|
"Wi-Pose has research-use-only terms (no redistribution grant) — excluded from v0; revisit only if terms are clarified with authors",
|
||||||
|
"HF Space free CPU tier may be too slow for Candle/tch inference pipeline — may need ZeroGPU or self-hosted scorer on cognitum-20260110 GCloud A100/L4",
|
||||||
|
"ADR-079 camera-ground-truth (PCK@20 SOTA) is P7-P9 pending — NOT an infra blocker; must not be conflated with AA infra completion",
|
||||||
|
"Neutrality/governance risk: RuView seeded the scorer — must be demonstrably scored through the same public pipeline as any other entrant (§2.8 controls)"
|
||||||
|
],
|
||||||
|
"driftSignals": {
|
||||||
|
"timeline": "GREEN — just initialized, no timeline pressure yet",
|
||||||
|
"scope": "GREEN — scope locked at four-part structure per ADR-149 §2 decision",
|
||||||
|
"approach": "GREEN — reuse pattern (existing ruview_metrics + proof.rs) confirmed in ADR-149",
|
||||||
|
"dependency": "YELLOW — HF_TOKEN and ruvnet/aether-arena repo authorization are external blockers with unknown ETA",
|
||||||
|
"priority": "GREEN — active feature branch feat/adr-136-146-streaming-engine in progress; AA infra can proceed in parallel on its own branch"
|
||||||
|
},
|
||||||
|
"stretchGoals": {
|
||||||
|
"sotaML": "MM-Fi PCK@20 SOTA ~72% — separate ML effort blocked on ADR-079 P7-P9 camera-ground-truth data collection; NOT an infra exit criterion",
|
||||||
|
"privacyAxis": "ADR-145 §10 membership-inference attacker — activate Privacy leaderboard axis once attacker is implemented and published",
|
||||||
|
"crossRoom": "Multi-room held-out split — activate Cross-room generalization axis",
|
||||||
|
"multiOrgSteering": "Invite co-maintainers from other projects once >=N external entries land"
|
||||||
|
},
|
||||||
|
"sessionHistory": [
|
||||||
|
{
|
||||||
|
"date": "2026-05-30",
|
||||||
|
"type": "initialization",
|
||||||
|
"accomplished": [
|
||||||
|
"ADR-149 Accepted and committed to docs/adr/",
|
||||||
|
"Horizon record initialized in .claude-flow/horizons/aether-arena-aa.json",
|
||||||
|
"Memory stored in horizons namespace under key horizon-aether-arena-aa",
|
||||||
|
"Session check-in record stored in horizon-sessions namespace"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@@ -0,0 +1,73 @@
|
|||||||
|
name: AetherArena harness gate (ADR-149)
|
||||||
|
|
||||||
|
# Runs the AetherArena scoring harness as a PR build gate. Every PR that touches
|
||||||
|
# the scorer, the metrics, or the benchmark scaffold must keep the deterministic
|
||||||
|
# score hash stable (ADR-149 §2.5 determinism_gate). If the scoring maths changes,
|
||||||
|
# the hash moves and this gate fails until `expected_score.sha256` is regenerated
|
||||||
|
# and reviewed — so scorer drift can never land silently.
|
||||||
|
#
|
||||||
|
# This is the "a PR that runs the harness as part of the build process" requirement.
|
||||||
|
|
||||||
|
on:
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- 'v2/crates/wifi-densepose-train/src/ruview_metrics.rs'
|
||||||
|
- 'v2/crates/wifi-densepose-train/src/ablation.rs'
|
||||||
|
- 'v2/crates/wifi-densepose-train/src/bin/aa_score_runner.rs'
|
||||||
|
- 'aether-arena/**'
|
||||||
|
- '.github/workflows/aether-arena-harness.yml'
|
||||||
|
push:
|
||||||
|
branches: ['feat/adr-149-aether-arena']
|
||||||
|
workflow_dispatch:
|
||||||
|
|
||||||
|
permissions:
|
||||||
|
contents: read
|
||||||
|
pull-requests: write
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
harness-gate:
|
||||||
|
name: Run AA scorer harness (determinism gate)
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
defaults:
|
||||||
|
run:
|
||||||
|
working-directory: v2
|
||||||
|
steps:
|
||||||
|
- uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Install Rust toolchain
|
||||||
|
run: rustup show && rustc --version
|
||||||
|
|
||||||
|
- name: Cache cargo
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cargo/registry
|
||||||
|
~/.cargo/git
|
||||||
|
v2/target
|
||||||
|
key: aa-harness-${{ runner.os }}-${{ hashFiles('v2/Cargo.lock') }}
|
||||||
|
|
||||||
|
# 1. Build the pure-Rust scorer (no torch / no GPU → fast PR gate).
|
||||||
|
- name: Build AA score runner
|
||||||
|
run: cargo build -p wifi-densepose-train --bin aa_score_runner --no-default-features
|
||||||
|
|
||||||
|
# 2. Determinism gate: the committed expected hash must still match. A
|
||||||
|
# non-zero exit here fails the PR.
|
||||||
|
- name: Run determinism gate
|
||||||
|
run: cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features
|
||||||
|
|
||||||
|
# 3. Emit the score row into the PR run summary (leaderboard-ledger shape).
|
||||||
|
- name: Score row → job summary
|
||||||
|
if: always()
|
||||||
|
run: |
|
||||||
|
ROW=$(cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --json)
|
||||||
|
{
|
||||||
|
echo "## AetherArena harness gate"
|
||||||
|
echo ""
|
||||||
|
echo "Deterministic score row (ADR-149 §2.2):"
|
||||||
|
echo '```json'
|
||||||
|
echo "$ROW"
|
||||||
|
echo '```'
|
||||||
|
echo ""
|
||||||
|
echo "If the determinism gate failed, the scoring maths changed: regenerate with"
|
||||||
|
echo '`cargo run -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --generate-hash > aether-arena/fixtures/expected_score.sha256` and review the diff.'
|
||||||
|
} >> "$GITHUB_STEP_SUMMARY"
|
||||||
@@ -0,0 +1,50 @@
|
|||||||
|
# AetherArena ("AA") — The Official Spatial-Intelligence Benchmark
|
||||||
|
|
||||||
|
> **Public leaderboard. Private evaluation split. Open scorer. Signed results.**
|
||||||
|
|
||||||
|
AetherArena is a **standalone, project-agnostic benchmark** for camera-free **spatial intelligence** — pose, presence, occupancy, tracking, and vitals from RF/WiFi (and, over time, mmWave / UWB / radar / lidar / multimodal). It is **not** a single-vendor leaderboard: any team, framework, or sensing modality can enter, and every entrant — including the RuView baseline that donated the seed scorer — is scored by the identical, open, pinned harness.
|
||||||
|
|
||||||
|
Specified in [ADR-149](../docs/adr/ADR-149-public-community-leaderboard-huggingface.md) (Accepted).
|
||||||
|
|
||||||
|
Canonical home: **`ruvnet/aether-arena`** + a Hugging Face Space (deploy pending — see [`STATUS.md`](STATUS.md)).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Why
|
||||||
|
|
||||||
|
WiFi/RF spatial sensing has no shared yardstick — papers self-report against inconsistent splits and metrics, with **no accounting for latency, reproducibility, or privacy leakage**. AA fixes the *measurement*, not just the models: a single deterministic scorer, a private held-out split nobody can train on, and a signed result ledger that can't be silently edited.
|
||||||
|
|
||||||
|
## What gets measured (v0)
|
||||||
|
|
||||||
|
| Category | Metric | Status |
|
||||||
|
|----------|--------|--------|
|
||||||
|
| **Pose** | PCK@0.2 (all / torso), OKS | Ranked |
|
||||||
|
| **Presence** | accuracy, FP/FN | Ranked |
|
||||||
|
| **Edge latency** | p50 / p95 / p99 ms | Ranked |
|
||||||
|
| **Determinism** | proof-hash pass/fail | Ranked (gate) |
|
||||||
|
| Tracking (MOTA) | — | activates when multi-person clips land |
|
||||||
|
| Vitals (BPM err) | — | activates when paired vitals ground truth lands |
|
||||||
|
| **Privacy leakage** | membership-inference ∈ [0,1] | **gated — not ranked** until the attacker ships |
|
||||||
|
| Cross-room | degradation ratio | coming soon |
|
||||||
|
|
||||||
|
The headline rank is the **category metric**; an optional `arena_score = quality × latency_factor × privacy_factor × determinism_gate` is exposed alongside (never instead) so accuracy can't win at any cost. See ADR-149 §2.5.
|
||||||
|
|
||||||
|
## How scoring works
|
||||||
|
|
||||||
|
The scorer is RuView's **already-published** `wifi-densepose-train` acceptance harness (`ruview_metrics` + ADR-145 `ablation`), run in a pinned sandbox. **You submit a model, not predictions** — predictions on data you hold prove nothing. Your model is scored against a **private** MM-Fi held-out split (CC BY-NC 4.0; Wi-Pose excluded for redistribution reasons), and one **signed, append-only** row is written to the results ledger with a determinism proof hash.
|
||||||
|
|
||||||
|
Submission lifecycle: `submitted → validated → quarantined → smoke_scored → full_scored → published` (or `rejected` with a reason). The model only ever runs inside a no-network, read-only-FS sandbox.
|
||||||
|
|
||||||
|
## Submit (when the Space is live)
|
||||||
|
|
||||||
|
1. Write a manifest: [`schema/aa-submission.toml`](schema/aa-submission.toml).
|
||||||
|
2. Push your model artifact (`.safetensors` / `.rvf` / LoRA adapter) + manifest to the Space.
|
||||||
|
3. Watch it move through the lifecycle; your signed row appears on the board.
|
||||||
|
|
||||||
|
## Verify it's fair (you don't have to trust us)
|
||||||
|
|
||||||
|
See [`VERIFY.md`](VERIFY.md) — run the **open scorer** locally on the **public smoke split**, reproduce the determinism hash, and confirm RuView's own entries were scored by the identical path. That five-step check is the launch gate (ADR-149 §7).
|
||||||
|
|
||||||
|
## Neutrality
|
||||||
|
|
||||||
|
AA is a neutral commons. The scorer is open and versioned; any metric change is a public `harness_version` bump that **re-scores all entries**. RuView donated the seed harness and enters as one baseline — it gets no special treatment (ADR-149 §2.8).
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
# AetherArena — Build Status
|
||||||
|
|
||||||
|
Tracks ADR-149 implementation milestones. "Complete" = benchmark **infrastructure** done,
|
||||||
|
tested, CI-gated, deploy-ready, RuView baseline entered, §7 acceptance test passing.
|
||||||
|
Model **SOTA** (e.g. MM-Fi PCK@20 ~72%) is a separate long-running ML effort, blocked on
|
||||||
|
ADR-079 camera-ground-truth collection — *not* an infra-completion blocker.
|
||||||
|
|
||||||
|
| # | Milestone | Status |
|
||||||
|
|---|-----------|--------|
|
||||||
|
| M1 | ADR-149 Accepted + committed | ✅ done |
|
||||||
|
| M2 | Deterministic scorer runner (`aa_score_runner`) → tier + proof hash | ✅ done — builds `--no-default-features`, hash stable, VERDICT: PASS |
|
||||||
|
| M3 | CI harness-gate workflow (PR runs the scorer) | ✅ done — `.github/workflows/aether-arena-harness.yml` |
|
||||||
|
| M4 | Scaffold: README + submission schema + VERIFY (acceptance test) | ✅ done |
|
||||||
|
| M5 | Public smoke split (committed) + private MM-Fi held-out split prep | ⏳ next |
|
||||||
|
| M6 | HF Space (Gradio) submission flow + sandboxed scorer container | ⛔ blocked — needs HF token / maintainer authorization to deploy |
|
||||||
|
| M7 | Signed append-only Parquet results ledger | ⏳ |
|
||||||
|
| M8 | RuView baseline entry (honest PCK@20) + public launch | ⏳ |
|
||||||
|
|
||||||
|
## Blockers / decisions needed
|
||||||
|
- **HF deploy (M6)** needs an HF token and authorization to create the public `ruvnet/aether-arena` Space.
|
||||||
|
- **MM-Fi is CC BY-NC** → AA must stay non-commercial / legally distinct from the commercial RuView product.
|
||||||
|
- **Realism of M2 fixture**: current fixture is a *determinism* fixture (stable hash), not a realistic baseline; M5 swaps in real MM-Fi held-out scoring.
|
||||||
@@ -0,0 +1,50 @@
|
|||||||
|
# Verifying AetherArena (you don't have to trust us)
|
||||||
|
|
||||||
|
AA's credibility rests on a stranger being able to reproduce a score and see that the rules are fair. This is the **launch gate** (ADR-149 §7): v0 does not ship until all five checks below pass for someone with no insider access.
|
||||||
|
|
||||||
|
## The open scorer
|
||||||
|
|
||||||
|
The scoring engine is a pure-Rust, GPU-free binary: `aa_score_runner` in `wifi-densepose-train`. It runs the real `ruview_metrics` pose-acceptance harness on a fixed fixture and emits a cross-platform-stable SHA-256 **determinism proof**.
|
||||||
|
|
||||||
|
### Reproduce the determinism hash locally
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cd v2
|
||||||
|
# Verify the committed expected hash still matches (this is the CI gate):
|
||||||
|
cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features
|
||||||
|
# → prints the score, the proof sha256, and "VERDICT: PASS"
|
||||||
|
|
||||||
|
# See the leaderboard-ledger row as JSON:
|
||||||
|
cargo run -q -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --json
|
||||||
|
```
|
||||||
|
|
||||||
|
The expected hash is committed at [`fixtures/expected_score.sha256`](fixtures/expected_score.sha256). Same harness version + same fixture → same hash on glibc / MSVC / Apple. If your local run prints `VERDICT: PASS`, you have reproduced the scorer.
|
||||||
|
|
||||||
|
### What happens if the scoring maths changes
|
||||||
|
|
||||||
|
Any edit to `ruview_metrics.rs`, `ablation.rs`, or `aa_score_runner.rs` that changes the scoring output moves the hash and **fails the CI gate** (`.github/workflows/aether-arena-harness.yml`) until the maintainer regenerates the expected hash and reviews the diff:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
cargo run -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --generate-hash \
|
||||||
|
> ../aether-arena/fixtures/expected_score.sha256   # run from v2/ (as above); the fixture lives at the repo root
|
||||||
|
```
|
||||||
|
|
||||||
|
So a scorer change is always a reviewed, public diff — never silent. That's `harness_version` pinning + `determinism_gate` in action (ADR-149 §2.4–§2.5).
|
||||||
|
|
||||||
|
## The five-step acceptance test (v0 launch gate)
|
||||||
|
|
||||||
|
A stranger must be able to:
|
||||||
|
|
||||||
|
1. **Submit** a model (artifact + `schema/aa-submission.toml`) with no insider help.
|
||||||
|
2. **Get a deterministic score** — same model + same `harness_version` → same numbers.
|
||||||
|
3. **See the signed row** appended to the public results ledger.
|
||||||
|
4. **Rerun the scorer locally** on the public smoke split and reproduce the determinism hash (the commands under "Reproduce the determinism hash locally" above).
|
||||||
|
5. **Understand why the rank is fair** — private split, open scorer, pinned version, proof hash — from these docs alone.
|
||||||
|
|
||||||
|
If any step fails, v0 is not ready.
|
||||||
|
|
||||||
|
## Current status
|
||||||
|
|
||||||
|
- ✅ Step 4 (rerun the open scorer locally, reproduce the hash) — **works today** via `aa_score_runner`.
|
||||||
|
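- ✅ CI harness gate runs the scorer on every PR that touches the scorer, the metrics, or the benchmark scaffold (see the `paths` filter in `.github/workflows/aether-arena-harness.yml`).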
|
||||||
|
- ⏳ Steps 1–3, 5 (HF Space submission flow + signed ledger) — in progress; require the HF Space deploy (needs an HF token / maintainer authorization).
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
dee374bb4ba22bc4583e8280f9d03567d0e174c7e7aac8664a804bb34a428b0e
|
||||||
@@ -0,0 +1,41 @@
|
|||||||
|
# AetherArena submission manifest (ADR-149 §2.2).
|
||||||
|
# Accompanies a model artifact pushed to the AA Hugging Face Space.
|
||||||
|
# This file is the contract the Space validates before quarantine + scoring.
|
||||||
|
|
||||||
|
[submission]
|
||||||
|
# Free-form display name shown on the leaderboard.
|
||||||
|
name = "my-spatial-model"
|
||||||
|
# Hugging Face repo or URL of the model artifact (.safetensors / .rvf / LoRA adapter).
|
||||||
|
model_ref = "hf://your-org/your-model"
|
||||||
|
# Submitter handle (HF username / org). Used to sign the ledger row.
|
||||||
|
submitter = "your-hf-username"
|
||||||
|
# SPDX license of the submitted model.
|
||||||
|
license = "Apache-2.0"
|
||||||
|
|
||||||
|
[category]
|
||||||
|
# One of: pose | presence | tracking | vitals | multi-task
|
||||||
|
# v0 ranks: pose, presence (tracking/vitals activate when ground truth lands).
|
||||||
|
primary = "pose"
|
||||||
|
|
||||||
|
[input]
|
||||||
|
# Which ADR-145 FeatureSet the model consumes. v0 input is RF/WiFi CSI.
|
||||||
|
# F0 = CSI amplitude/phase F1 = +CIR F2 = +Doppler F3 = +BFLD
|
||||||
|
feature_set = "F0"
|
||||||
|
# Tensor I/O contract so the scorer can feed the model correctly.
|
||||||
|
input_shape = [114, 2] # subcarriers × {amp, phase} (example)
|
||||||
|
output_shape = [17, 2] # 17 keypoints × {x, y} normalised [0,1]
|
||||||
|
# Normalisation expected on the input ("none" | "zscore" | "minmax").
|
||||||
|
normalization = "zscore"
|
||||||
|
|
||||||
|
[runtime]
|
||||||
|
# Inference framework the artifact targets; the scorer uses it to pick the matching loader.
|
||||||
|
framework = "candle" # candle | onnx | torch
|
||||||
|
# Optional: target the edge-latency category with a declared device class.
|
||||||
|
device_class = "cpu" # cpu | pi5 | gpu
|
||||||
|
|
||||||
|
# Notes:
|
||||||
|
# - You submit a MODEL, never predictions on data you hold.
|
||||||
|
# - Scoring runs against a PRIVATE MM-Fi held-out split in a no-network,
|
||||||
|
# read-only sandbox. You cannot see the eval data.
|
||||||
|
# - The resulting score is a signed, append-only ledger row carrying a
|
||||||
|
# determinism proof hash and the pinned harness_version.
|
||||||
@@ -0,0 +1,288 @@
|
|||||||
|
# ADR-149: AetherArena ("AA") — The Official Spatial-Intelligence Benchmark (Hugging Face)
|
||||||
|
|
||||||
|
> **Scope note:** AetherArena is a **standalone, project-agnostic benchmark** for spatial intelligence — open to *any* project, team, or modality, not a RuView-branded board. RuView contributes the initial scoring harness and enters as one baseline among others; it gets no special treatment. This ADR lives in the RuView repo only because RuView is donating the seed harness — the benchmark itself is independent.
|
||||||
|
|
||||||
|
| Field | Value |
|
||||||
|
|-------|-------|
|
||||||
|
| **Status** | Accepted |
|
||||||
|
| **Date** | 2026-05-30 |
|
||||||
|
| **Deciders** | ruv |
|
||||||
|
| **Gate decisions** | Name **locked**: `ruvnet/aether-arena` ("AA"), positioned as the official cross-project Spatial-Intelligence Benchmark. v0 ranked metrics **locked**: pose, presence, edge-latency, determinism. Dataset legality **resolved**: MM-Fi (CC BY-NC 4.0) only for v0; Wi-Pose dropped (research-use, no redistribution). |
|
||||||
|
| **Codebase target** | New repo `ruvnet/aether-arena` (leaderboard + HF Space); reuses `wifi-densepose-train` (`src/ruview_metrics.rs`, `src/ablation.rs`, `src/eval.rs`, `src/proof.rs`) and `wifi-densepose-cli` as the scoring engine |
|
||||||
|
| **Relates to** | ADR-011 (Deterministic Proof Harness), ADR-015 (Public Dataset Training Strategy — MM-Fi / Wi-Pose), ADR-024 (Contrastive CSI Embedding / HF model release), ADR-027 (Cross-Environment Domain Generalization / MERIDIAN), ADR-031 (RuView Sensing-First RF Mode — `RuViewTier` acceptance), ADR-079 (Camera-Supervised Pose Fine-tune — PCK@20), ADR-120 / ADR-141 (BFLD Privacy), ADR-145 (Ablation Eval Harness — the scoring substrate) |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. Context
|
||||||
|
|
||||||
|
### 1.1 The Gap
|
||||||
|
|
||||||
|
RuView has a mature, deterministic evaluation surface but **no public face for it**. Two assets already exist:
|
||||||
|
|
||||||
|
1. **A grading harness.** `wifi-densepose-train/src/ruview_metrics.rs` rolls pose (PCK@0.2 / OKS / torso jitter / p95 error), tracking (MOTA / ID-switches / fragmentation), and vitals (breathing/heartbeat BPM error + SNR) into a `RuViewAcceptanceResult` with a `RuViewTier` (`Fail` / `Bronze` / `Silver` / `Gold`). ADR-145's `src/ablation.rs` extends this with presence accuracy, localization error, FP/FN, latency p50/p95/p99, a privacy-leakage score ∈ `[0,1]`, and cross-room degradation, under a determinism binding inherited from the ADR-011 proof harness.
|
||||||
|
|
||||||
|
2. **A determinism substrate.** `proof.rs` (`PROOF_SEED=42`) SHA-256-hashes model outputs against an expected hash, so a scored run is reproducible and tamper-evident.
|
||||||
|
|
||||||
|
What is missing is a **public, multi-entrant ranking**. As surveyed in ADR-015 and `docs/research/sota-surveys/sota-wifi-sensing-2025.md`, the WiFi-sensing field has **no hosted live leaderboard** the way vision has COCO/EvalAI — researchers self-report numbers against public *datasets* (MM-Fi, Wi-Pose, Person-in-WiFi, Widar3.0) in papers, with inconsistent splits, metrics, and no privacy or latency accounting. RuView's own pose number (PCK@20 ≈ 2.5% with proxy labels, target 35%+ per ADR-079) is currently self-reported on a private validation set and is not comparable to the MM-Fi SOTA (MultiFormer 0.7225).
|
||||||
|
|
||||||
|
### 1.2 The Opportunity
|
||||||
|
|
||||||
|
The harness that already gates RuView releases is exactly the engine a community leaderboard needs: a single, deterministic, privacy- and latency-aware scoring function. Publishing it as an open leaderboard:
|
||||||
|
|
||||||
|
- Establishes **AetherArena as the field's standard yardstick** for spatial intelligence, with RuView's `RuViewTier` + ADR-145 metric set contributed as its initial basis (pose + tracking + vitals + **privacy-leakage** + latency + determinism — a combination no existing benchmark scores). The standard is AA's; RuView donates the seed.
|
||||||
|
- Draws **any project, framework, or modality** to submit and rank — a cross-project community flywheel, not a RuView-only one (RuView's `wifi-densepose-pretrained` is merely the first baseline).
|
||||||
|
- Forces the harness to harden: a public, neutral scorer must be reproducible by strangers, resistant to gaming, and runnable on a fixed held-out split nobody can train on.
|
||||||
|
|
||||||
|
### 1.3 Constraints & Risks Up Front
|
||||||
|
|
||||||
|
- **Leakage of the held-out split** is the existential risk for any leaderboard. The eval data must be private; submitters provide a model, not predictions on data they hold.
|
||||||
|
- **Compute cost.** Scoring a submission runs inference over the eval set; an HF Space on free CPU may be too slow for the Candle/`tch` pipeline. Tiering of compute (CPU smoke vs GPU full score) is required.
|
||||||
|
- **Privacy / consent of the eval data.** MM-Fi and Wi-Pose carry their own licenses; we can host *derived* CSI features and scores but must respect redistribution terms (ADR-015 already tracks this).
|
||||||
|
- **Trust.** A `RuViewTier` badge is only meaningful if the scoring is deterministic and the leaderboard cannot be silently edited — the ADR-011 proof hash and a signed results ledger address this.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 2. Decision
|
||||||
|
|
||||||
|
**Create AetherArena ("AA") — the official, project-agnostic Spatial-Intelligence Benchmark: a public, open-entry leaderboard for camera-free spatial perception (pose, presence, occupancy, tracking, vitals) as a standalone repo `ruvnet/aether-arena` paired with a Hugging Face Space. The scoring engine is seeded by RuView's existing `ruview_metrics` + ADR-145 ablation harness, contributed as a neutral scorer; v0 evaluates against a private MM-Fi held-out split.**
|
||||||
|
|
||||||
|
AA is **not a RuView leaderboard**. It is the field's missing standard yardstick for spatial intelligence — open to any team, framework, or sensing modality. The RF medium is the v0 input and RuView donates the seed harness + a baseline entry, but the benchmark is independent and RuView is scored like every other entrant. The metric surface — pose, presence, tracking, occupancy/world-model, latency, determinism, and later privacy — is modality-agnostic, leaving room to grow to mmWave / UWB / radar / lidar / multimodal entrants and other projects.
|
||||||
|
|
||||||
|
The leaderboard does **not** fork or re-implement the scoring logic. It is a thin orchestration + presentation layer over the published `wifi-densepose-cli` scorer, so the public number a model earns is identical to the number RuView uses internally to gate releases. **This makes the leaderboard governance, not marketing.**
|
||||||
|
|
||||||
|
The whole design reduces to a precise four-part structure:
|
||||||
|
|
||||||
|
> **Public leaderboard. Private evaluation split. Open scorer. Signed results.**
|
||||||
|
|
||||||
|
- **Public leaderboard** — anyone can see the ranking and submit.
|
||||||
|
- **Private evaluation split** — the held-out data is never published; it cannot be trained on or overfit.
|
||||||
|
- **Open scorer** — the scoring code is the published `wifi-densepose-cli`; a stranger can rerun it locally on a public *smoke* split and reproduce the logic.
|
||||||
|
- **Signed results** — every score is an append-only, signed ledger row with a determinism proof hash; ranks cannot be silently edited.
|
||||||
|
|
||||||
|
### 2.1 Name — DECIDED: `ruvnet/aether-arena` ("AA")
|
||||||
|
|
||||||
|
**Locked.** Canonical repo + HF Space: **`ruvnet/aether-arena`**, branded **AetherArena** with the short form **"AA"**.
|
||||||
|
|
||||||
|
- **"Aether"** = the classical all-pervading medium — fitting for RF/ambient spatial perception, and broader than "Ether"/CSI/WiFi so the benchmark can grow to mmWave, UWB, and multimodal spatial-intelligence entrants without a rename.
|
||||||
|
- **"Arena"** = open competitive entry.
|
||||||
|
- HF Space title: *AetherArena (AA) — the spatial-intelligence benchmark for RF perception.*
|
||||||
|
- `ruvnet/wifi-densepose-leaderboard` is kept only as a discoverability/topic alias that redirects to AA.
|
||||||
|
|
||||||
|
(Rejected: `csi-arena` — jargon; `rf-bench` — generic/collision; `wifi-densepose-leaderboard` as the primary — ties the brand to one capability.)
|
||||||
|
|
||||||
|
### 2.2 Architecture
|
||||||
|
|
||||||
|
```
|
||||||
|
Submitter ruvnet/aether-arena RuView harness
|
||||||
|
───────── ────────────────── ──────────────
|
||||||
|
push model.safetensors ──► HF Space (Gradio): submit form ┌─ wifi-densepose-cli score
|
||||||
|
+ model card (adapter, │ • validates manifest │ ├─ load model snapshot
|
||||||
|
input contract, license) │ • queues job ──► │ ├─ replay private MM-Fi/
|
||||||
|
│ • runs scorer in container │ │ Wi-Pose split (PROOF_SEED)
|
||||||
|
│ • appends signed result │ ├─ ruview_metrics → RuViewTier
|
||||||
|
▼ │ ├─ ablation.rs → p50/p95,
|
||||||
|
leaderboard.parquet ◄────────────────────┘ │ privacy-leakage, cross-room
|
||||||
|
(HF dataset, append-only, └─ emit result + SHA-256 proof
|
||||||
|
one signed row per submission)
|
||||||
|
```
|
||||||
|
|
||||||
|
1. **Submission contract.** A submitter pushes a model artifact (`model.safetensors` / `.rvf` / LoRA adapter) plus an `aa-submission.toml` manifest declaring: input feature set (which ADR-145 `FeatureSet` it consumes — F0 CSI / F1 CIR / F2 Doppler / F3 BFLD), tensor I/O contract, license, and optional category (pose / presence / tracking / vitals / multi-task).
|
||||||
|
2. **Scoring.** The Space runs the **published `wifi-densepose-cli`** in a pinned container against a **private held-out split** of MM-Fi (the only v0 eval source per §2.6; Wi-Pose and RuView's own paired-capture set per ADR-079 are deferred to later versions). Output is the existing `RuViewAcceptanceResult` + the ADR-145 scalar set, plus the ADR-011 SHA-256 reproducibility hash.
|
||||||
|
3. **Ledger.** Each scored submission appends **one signed row** to an append-only HF dataset (`ruvnet/aether-arena-results`, Parquet): `{submitter, model_ref, category, feature_set, tier, pck20, oks, mota, vitals_bpm_err, latency_p50, latency_p95, privacy_leakage, cross_room_deg, proof_sha256, scored_at, harness_version}`. Append-only + signed = no silent edits.
|
||||||
|
4. **Presentation.** Gradio leaderboard with category tabs (Pose / Presence / Tracking / Vitals / Edge-latency / **Privacy**), `RuViewTier` badges, and a "privacy-respecting" filter (leakage ≤ threshold) — the differentiator no other WiFi benchmark has.
|
||||||
|
|
||||||
|
### 2.2.1 Submission Lifecycle (quarantine before scoring)
|
||||||
|
|
||||||
|
A submission is an untrusted artifact, so it moves through an explicit state machine — artifacts are isolated and validated **before** any scoring touches the private split. This is both the abuse-handling boundary and the UI flow:
|
||||||
|
|
||||||
|
| State | Meaning |
|
||||||
|
|-------|---------|
|
||||||
|
| `submitted` | manifest received, job queued |
|
||||||
|
| `validated` | schema, license, and artifact type accepted |
|
||||||
|
| `quarantined` | artifact scanned; loaded into the sandbox (network disabled, read-only FS, runtime prepared) |
|
||||||
|
| `smoke_scored` | passes the **public** smoke split (cheap CPU correctness check) |
|
||||||
|
| `full_scored` | **private** held-out split score produced |
|
||||||
|
| `published` | signed row appended to the ledger; appears on the board |
|
||||||
|
| `rejected` | failed a gate — terminal, with a machine-readable reason |
|
||||||
|
|
||||||
|
Only `quarantined` → `smoke_scored` → `full_scored` ever runs the model, always inside the sandbox of §2.4. A failure at any gate transitions to `rejected` with a reason rather than silently dropping.
|
||||||
|
|
||||||
|
### 2.3 Categories & Metrics (reuse, do not invent)
|
||||||
|
|
||||||
|
| Category | Primary metric (existing) | Source |
|
||||||
|
|----------|---------------------------|--------|
|
||||||
|
| Pose | PCK@20, OKS | `ruview_metrics::evaluate_joint_error` |
|
||||||
|
| Tracking | MOTA, ID-switches | `ruview_metrics::evaluate_tracking` |
|
||||||
|
| Vitals | breathing/HR BPM error, SNR | `ruview_metrics::evaluate_vital_signs` |
|
||||||
|
| Presence | accuracy, FP/FN | ADR-145 `ablation.rs` |
|
||||||
|
| Edge latency | p50 / p95 / p99 ms | ADR-145 `LatencyProfile` |
|
||||||
|
| **Privacy** | leakage score ∈ `[0,1]` (membership-inference) | ADR-145 §10 |
|
||||||
|
| Cross-room | degradation ratio | ADR-027 / ADR-145 |
|
||||||
|
| Overall | `RuViewTier` Bronze/Silver/Gold + `arena_score` (§2.5) | `determine_tier()` |
|
||||||
|
|
||||||
|
### 2.3.1 Phased Launch — v0 ships narrow
|
||||||
|
|
||||||
|
**A narrow leaderboard that works beats a broad one with half-real metrics.** v0 ranks only categories whose metric is fully implemented and reproducible-by-strangers today; the rest are visible as **"coming soon" / gated** and are **not ranked** until their metric is real.
|
||||||
|
|
||||||
|
| Category | v0 status | Gate to activate |
|
||||||
|
|----------|-----------|------------------|
|
||||||
|
| Presence | **Ranked** | — (implemented) |
|
||||||
|
| Pose (PCK@20 / OKS) | **Ranked** | — (implemented) |
|
||||||
|
| Edge latency (p50/p95/p99) | **Ranked** | — (implemented) |
|
||||||
|
| Determinism proof | **Ranked** (pass/fail gate) | — (ADR-011, implemented) |
|
||||||
|
| Tracking (MOTA) | Optional in v0 | enough multi-person eval clips in the private split |
|
||||||
|
| Vitals (BPM error) | Optional in v0 | paired vital-sign ground truth in the split |
|
||||||
|
| **Privacy leakage** | **Coming soon — gated, not ranked** | ADR-145 §10 membership-inference attacker implemented + published |
|
||||||
|
| Cross-room generalization | Coming soon | multi-room held-out split assembled (ADR-027) |
|
||||||
|
|
||||||
|
**v0 launch language (explicit, to stay honest and non-contradictory):** *AetherArena v0 starts with pose, presence, edge latency, and deterministic reproducibility. Tracking and vitals are activated when sufficient ground-truth clips are available. Privacy-leakage and cross-room generalization remain gated until their evaluation attacks and splits are implemented and published.* Shipping a "privacy leaderboard" claim before the attacker exists would be an easy and deserved attack on our credibility.
|
||||||
|
|
||||||
|
### 2.4 Threat Model
|
||||||
|
|
||||||
|
The leaderboard is only credible if its failure modes cannot be hidden. Explicit threats and the control that neutralizes each:
|
||||||
|
|
||||||
|
| Threat | Control |
|
||||||
|
|--------|---------|
|
||||||
|
| Model exfiltrates / phones home the eval data | Scorer container runs with **no network, read-only eval FS, resource caps** (sandboxed) |
|
||||||
|
| Submitter overfits the public split | **Private held-out split** — never published; scoring runs on data the submitter has never seen |
|
||||||
|
| Model fingerprints / detects the eval set | **Seasonal rotation** of a fraction of the held-out split (mirrors ADR-120 hash rotation) |
|
||||||
|
| Maintainer silently edits a score / rank | **Signed, append-only** Parquet results ledger — rows are immutable and verifiable |
|
||||||
|
| Scorer version drift changes ranks invisibly | **`harness_version` pinned per row**; a scorer change forces a re-eval, not a silent re-rank |
|
||||||
|
| Slow model brute-forces accuracy | **Latency is a ranked axis** (p50/p95/p99) with hard caps + the `latency_factor` in `arena_score` |
|
||||||
|
| "Gold accuracy, leaks identity" win | **Privacy is a (gated) axis**; once active, `privacy_factor` penalizes leakage in `arena_score` |
|
||||||
|
| Malicious model artifact (RCE in the scorer) | Untrusted artifact loaded in the sandboxed container only; pinned, minimal runtime; no host mounts |
|
||||||
|
|
||||||
|
### 2.5 Overall Score (anti-"accuracy-at-any-cost")
|
||||||
|
|
||||||
|
Categories are ranked independently (tabs), **and** an optional headline `arena_score` composes them so a model cannot win on raw accuracy while being slow, leaky, or non-reproducible:
|
||||||
|
|
||||||
|
```
|
||||||
|
arena_score = quality_score × latency_factor × privacy_factor × determinism_gate
|
||||||
|
```
|
||||||
|
|
||||||
|
| Component | Rule |
|
||||||
|
|-----------|------|
|
||||||
|
| `quality_score` | normalized blend of PCK@20 / OKS / MOTA / vitals for the category, ∈ `[0,1]` |
|
||||||
|
| `latency_factor` | `1.0` if p95 ≤ target; decays smoothly above target (edge viability) |
|
||||||
|
| `privacy_factor` | `1.0 − privacy_leakage` once the Privacy axis is active; **fixed at `1.0` in v0** (privacy gated/unranked) |
|
||||||
|
| `determinism_gate` | `1.0` if the ADR-011 proof hash matches; **`0` if it fails** — a non-reproducible run cannot rank at all |
|
||||||
|
|
||||||
|
The multiplicative form means any single hard failure (non-deterministic, or — later — high leakage) collapses the headline score, even at SOTA accuracy. In v0, `privacy_factor` is pinned to `1.0` so the headline number is honest about what is actually measured.
|
||||||
|
|
||||||
|
**`arena_score` is a gate, not the only headline.** Multiplicative composites are great for gating but can hide *why* a model lost, and invite "your formula is biased" arguments. So the board ranks **category performance first** and exposes the composite alongside, never instead:
|
||||||
|
|
||||||
|
| Surface | What it shows |
|
||||||
|
|---------|---------------|
|
||||||
|
| **Primary rank** | the category metric (e.g. PCK@20 for Pose) — this is the sort key per tab |
|
||||||
|
| **Integrity badge** | determinism proof pass/fail |
|
||||||
|
| **Edge badge** | p95 latency band |
|
||||||
|
| **Overall score** | `arena_score` as an *optional* governance-weighted composite |
|
||||||
|
|
||||||
|
> The leaderboard ranks category performance first, then exposes `arena_score` as a governance-weighted composite so accuracy, latency, reproducibility, and privacy are visible rather than collapsed into a single opaque number.
|
||||||
|
|
||||||
|
### 2.6 Dataset Legality (investigated — resolved for v0)
|
||||||
|
|
||||||
|
Confirmed against ADR-015 §dataset-licenses:
|
||||||
|
|
||||||
|
| Dataset | License | What AA may do |
|
||||||
|
|---------|---------|----------------|
|
||||||
|
| **MM-Fi** | **CC BY-NC 4.0** | ✅ v0 eval source. Non-commercial use + derivatives **permitted with attribution**. AA may host *derived* CSI features and scores; raw frames stay in the private split. AA must be operated **non-commercially** and carry MM-Fi attribution. |
|
||||||
|
| **Wi-Pose** | **"Research use"** (no clean redistribution grant) | ⚠️ **Not hosted.** Pulled privately into the scorer only, never redistributed; or deferred until terms are clarified with the authors. **Dropped from v0.** |
|
||||||
|
| Person-in-WiFi-3D | semi-public access | Future candidate (post-v0), pending access terms. |
|
||||||
|
|
||||||
|
**v0 decision:** evaluate on a **private MM-Fi held-out split only** (CC BY-NC, attributed, non-commercial; expose only license-permitted derived features). Wi-Pose is removed from v0 and revisited if/when redistribution is cleared. This keeps the existential "can we even host this" risk at zero for launch.
|
||||||
|
|
||||||
|
> **Non-commercial caveat to watch:** CC BY-NC means AA itself, and the eval-data use, must remain non-commercial. Because AA also showcases the (commercial) RuView appliance, keep AA legally distinct and non-commercial, or seek an MM-Fi commercial grant before any paid tier. Flagged for the maintainer.
|
||||||
|
|
||||||
|
### 2.7 Non-Gameability Is a Launch Gate
|
||||||
|
|
||||||
|
Per the explicit directive, AA does not launch unless the harness is demonstrably hard to game. The controls (private split §2.4, seasonal rotation §2.4, model-not-prediction submission §2.2, sandbox §2.4, pinned `harness_version` §2.4, signed append-only ledger §2.2/§2.4, multiplicative `arena_score` §2.5, `determinism_gate=0` on proof-hash failure §2.5) are **not optional hardening — they are acceptance criteria** (see §7). A v0 that can be topped by overfitting a public split, a non-reproducible run, or a silently edited row is, by definition, not ready.
|
||||||
|
|
||||||
|
### 2.8 Neutrality & Governance (because it's "official" and cross-project)
|
||||||
|
|
||||||
|
The hardest credibility problem for an *official* benchmark seeded by one entrant: **"RuView built the scorer, so of course RuView wins."** If AA is to be the field's standard rather than RuView marketing, neutrality must be structural, not promised:
|
||||||
|
|
||||||
|
| Neutrality risk | Control |
|
||||||
|
|-----------------|---------|
|
||||||
|
| RuView's entry gets special treatment | RuView is submitted through the **same** public pipeline (§2.2.1) and scored by the **same** pinned scorer as everyone else; its rows carry the same proof hash and are independently re-runnable on the smoke split. |
|
||||||
|
| RuView tunes the metric to favor its models | The scorer is **open and versioned**; any metric change is a public `harness_version` bump that **re-scores all entries**, not just new ones. Metric changes go through a public changelog. |
|
||||||
|
| "Official" is self-declared | AA is positioned as a **neutral commons**: separate repo/Space identity, contribution guide, and an explicit invitation for other projects + dataset authors to co-own splits and metrics. RuView is the *donor of the seed harness*, not the owner of the standard. |
|
||||||
|
| Benchmark used as RuView ad | Keep AA legally + brand-distinct (ties into the CC BY-NC non-commercial caveat, §2.6); the README leads with the standard, not the product. |
|
||||||
|
| Single-vendor capture | Roadmap to a multi-org steering/eval committee once ≥N external projects enter; split rotation + metric proposals are public. |
|
||||||
|
|
||||||
|
The test for neutrality is the same as §7's acceptance test: a stranger from *another project* can submit, reproduce the score, and see that RuView's own entries were scored by the identical, open, pinned path.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 3. Consequences
|
||||||
|
|
||||||
|
### 3.1 Positive
|
||||||
|
- A real, comparable public number for RuView (and everyone else) on MM-Fi / Wi-Pose, scored by a privacy- and latency-aware harness no other WiFi benchmark offers.
|
||||||
|
- Community flywheel: external models/adapters get ranked, feeding `ruvnet/wifi-densepose-pretrained`.
|
||||||
|
- Forces the harness to be reproducible-by-strangers, which strengthens internal release gating too.
|
||||||
|
|
||||||
|
### 3.2 Negative / Costs
|
||||||
|
- **New repo + HF Space to maintain**, incl. a scoring container and queue. Ongoing compute cost (mitigate: CPU smoke-score on submit, batched GPU full-score on a schedule).
|
||||||
|
- **Dataset licensing** must be cleared for hosting derived MM-Fi / Wi-Pose features (ADR-015 owns this; may require contacting dataset authors).
|
||||||
|
- **Abuse surface** (malicious model artifacts run in the scorer) — must sandbox the container (no network, read-only eval data, resource caps).
|
||||||
|
|
||||||
|
### 3.3 Neutral
|
||||||
|
- The scoring logic stays in `wifi-densepose-train`/`-cli`; the leaderboard is presentation only, so it does not bloat the core workspace.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 4. Alternatives Considered
|
||||||
|
|
||||||
|
1. **Submit RuView to existing venues only (MM-Fi GitHub, Papers-with-Code).** Lower effort, but no privacy/latency axes, no live entry, and RuView doesn't own the standard. *Complementary, not exclusive — we should still post MM-Fi numbers.*
|
||||||
|
2. **A static numbers page in the RuView README.** Zero infra, but not multi-entrant and not a leaderboard.
|
||||||
|
3. **EvalAI / Kaggle competition.** Stronger anti-gaming infra, but heavyweight, time-boxed, and off-brand vs an always-open HF Space next to the model.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 5. Open Questions
|
||||||
|
|
||||||
|
1. **Eval data hosting** — can we redistribute derived MM-Fi / Wi-Pose CSI features under their licenses, or must scoring pull the raw datasets the submitter cannot see? (Owner: ADR-015 follow-up.)
|
||||||
|
2. **Compute budget** — free HF CPU Space, ZeroGPU, or a self-hosted scorer on the GCloud A100/L4 fleet (`cognitum-20260110`)?
|
||||||
|
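3. **Name lock** — confirm `aether-arena` vs `wifi-densepose-leaderboard`. **Resolved (§2.1):** `ruvnet/aether-arena` is locked; `wifi-densepose-leaderboard` is kept only as a redirecting alias.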
|
||||||
|
4. **Season cadence** — does the held-out split rotate monthly, and do we keep an all-time + per-season board?
|
||||||
|
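5. **Privacy-leakage attack** — ship the membership-inference attacker (ADR-145 §10 is currently a *defined-but-unimplemented* metric) before launch, or launch with privacy as a "coming soon" axis? **Resolved (§2.3.1):** v0 launches with Privacy gated and unranked; the attacker is implemented post-launch (§6, P4).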
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 6. Implementation Sketch (if accepted)
|
||||||
|
|
||||||
|
- **P1** — Stand up `ruvnet/aether-arena` repo + skeleton Gradio HF Space; define the `aa-submission.toml` submission contract; publish a **public smoke split** a stranger can score locally.
|
||||||
|
- **P2** — Containerize `wifi-densepose-cli score` as the pinned, sandboxed scorer (no network, read-only FS, caps); wire the signed append-only Parquet ledger + `determinism_gate`.
|
||||||
|
- **P3 — v0 LAUNCH (narrow).** Clear + load the private MM-Fi held-out split (Wi-Pose is excluded from v0, §2.6); activate **Presence, Pose, Edge-latency, Determinism** categories; seed the board with RuView's own `wifi-densepose-pretrained` baseline (honest current PCK@20). Tracking/Vitals optional. Privacy + Cross-room shown as **gated / coming soon**.
|
||||||
|
- **P4** — *(post-launch, gated)* Implement the ADR-145 §10 privacy-leakage membership-inference attacker; only then activate + rank the **Privacy** category and switch `privacy_factor` on in `arena_score`.
|
||||||
|
- **P5** — Assemble the multi-room split → activate **Cross-room**. Submit RuView's MM-Fi number to Papers-with-Code in parallel (alternative #1).
|
||||||
|
|
||||||
|
## 7. Acceptance Test (definition of done for v0)
|
||||||
|
|
||||||
|
v0 launches **only when a stranger can:**
|
||||||
|
|
||||||
|
1. **Submit** a model (artifact + `aa-submission.toml`) through the Space with no insider help,
|
||||||
|
2. **Get a deterministic score** back (same model + same harness version → same numbers),
|
||||||
|
3. **See the signed row** appended to the public results ledger,
|
||||||
|
4. **Rerun the scorer locally** on the public *smoke* split and reproduce the logic, and
|
||||||
|
5. **Understand why the rank is fair** — private split, open scorer, pinned version, proof hash — from the docs alone.
|
||||||
|
|
||||||
|
If any of these five fails, v0 is not ready.
|
||||||
|
|
||||||
|
## 8. Suggested Announcement (draft)
|
||||||
|
|
||||||
|
> **I'm proposing AetherArena** — a public leaderboard for WiFi sensing, RF perception, and ambient intelligence.
|
||||||
|
>
|
||||||
|
> The problem with this field is not just model quality. It is *measurement* quality. Most WiFi-sensing work reports numbers against datasets with inconsistent splits, inconsistent metrics, and almost no accounting for latency, privacy leakage, reproducibility, or edge viability.
|
||||||
|
>
|
||||||
|
> AetherArena fixes that. Models are submitted, scored in a pinned sandboxed container against a **private** held-out MM-Fi split, and written to a **signed append-only** results ledger. The scoring engine reuses the same RuView harness we use internally: pose, presence, tracking, vitals, latency, cross-room degradation, deterministic proof hashes — and, once its attacker ships, privacy leakage.
|
||||||
|
>
|
||||||
|
> The goal is not to make RuView look good. The goal is to make the *category* measurable. If ambient intelligence is going to move from demos to infrastructure, it needs public numbers, reproducible commands, private eval splits, and failure modes that cannot be hidden.
|
||||||
|
|
||||||
|
### Strategic note — three layers of the credibility story
|
||||||
|
|
||||||
|
| Layer | Asset |
|
||||||
|
|-------|-------|
|
||||||
|
| Retrieval credibility | ruflo BEIR harness |
|
||||||
|
| Sensing credibility | **AetherArena (this ADR)** |
|
||||||
|
| Product credibility | RuView appliance + Arista-style deployments |
|
||||||
@@ -20,6 +20,13 @@ name = "verify-training"
|
|||||||
path = "src/bin/verify_training.rs"
|
path = "src/bin/verify_training.rs"
|
||||||
required-features = ["tch-backend"]
|
required-features = ["tch-backend"]
|
||||||
|
|
||||||
|
# AetherArena (ADR-149) deterministic score runner — the CI harness-gate entry
|
||||||
|
# point. Pure ruview_metrics (ndarray + sha2), no torch, so it builds and runs
|
||||||
|
# under --no-default-features for a fast, GPU-free PR gate.
|
||||||
|
[[bin]]
|
||||||
|
name = "aa_score_runner"
|
||||||
|
path = "src/bin/aa_score_runner.rs"
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = []
|
||||||
tch-backend = ["tch"]
|
tch-backend = ["tch"]
|
||||||
|
|||||||
@@ -0,0 +1,173 @@
|
|||||||
|
//! AetherArena ("AA") Deterministic Score Runner (ADR-149).
|
||||||
|
//!
|
||||||
|
//! The CI-runnable entry point behind the AA harness gate: it runs the **real**
|
||||||
|
//! `wifi-densepose-train::ruview_metrics` pose-acceptance harness against a
|
||||||
|
//! fixed, committed synthetic fixture (seed = 42) and emits:
|
||||||
|
//! 1. the pose metrics (PCK@0.2 all/torso, OKS, jitter, p95 error),
|
||||||
|
//! 2. the v0 `RuViewTier`-style pose verdict, and
|
||||||
|
//! 3. a cross-platform-stable SHA-256 **proof hash** of the quantised result.
|
||||||
|
//!
|
||||||
|
//! This is the `determinism_gate` substrate from ADR-149 §2.5: the same fixture
|
||||||
|
//! + same harness version must always produce the same hash. A PR that changes
|
||||||
|
//! the scoring maths moves the hash and fails the gate (the `expected_score.sha256`
|
||||||
|
//! must be regenerated and reviewed), so scorer drift can never land silently.
|
||||||
|
//!
|
||||||
|
//! Cross-platform portability (lesson from `calibration_proof_runner.rs`):
|
||||||
|
//! PCK/OKS use `sqrt` (libm-sensitive: glibc/MSVC/Apple differ by ~1e-7). We
|
||||||
|
//! never hash raw f32 — we quantise each metric to coarse fixed-point (1e-3 /
|
||||||
|
//! 1e-4) so a 1e-7 libm wobble is invisible while a real algorithm change
|
||||||
|
//! (>1e-3) breaks the hash. No sort, no truncation.
|
||||||
|
//!
|
||||||
|
//! Usage:
|
||||||
|
//! # verify against the committed expected hash (CI gate default):
|
||||||
|
//! cargo run -p wifi-densepose-train --bin aa_score_runner --no-default-features
|
||||||
|
//!
|
||||||
|
//! # emit the score as JSON (for the leaderboard ledger row):
|
||||||
|
//! cargo run -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --json
|
||||||
|
//!
|
||||||
|
//! # regenerate the expected hash (after an intentional scorer change):
|
||||||
|
//! cargo run -p wifi-densepose-train --bin aa_score_runner --no-default-features -- --generate-hash \
|
||||||
|
//! > ../aether-arena/fixtures/expected_score.sha256
|
||||||
|
|
||||||
|
use std::env;
|
||||||
|
use std::process::ExitCode;
|
||||||
|
|
||||||
|
use ndarray::{Array1, Array2};
|
||||||
|
use sha2::{Digest, Sha256};
|
||||||
|
use wifi_densepose_train::ruview_metrics::{
|
||||||
|
evaluate_joint_error, JointErrorResult, JointErrorThresholds,
|
||||||
|
};
|
||||||
|
|
||||||
|
/// Bump when the fixture or canonical hash form changes on purpose. Pinned into
|
||||||
|
/// the proof so a `harness_version` change forces a re-score (ADR-149 §2.4).
|
||||||
|
const AA_HARNESS_VERSION: u32 = 1;
|
||||||
|
|
||||||
|
/// Number of frames in the synthetic fixture — fixed so the hash is stable.
|
||||||
|
const N_FRAMES: usize = 120;
|
||||||
|
/// Keypoints per frame: `build_fixture` allocates each frame as an
/// `(N_KPTS, 2)` array. Fixed for the same hash-stability reason as `N_FRAMES`.
const N_KPTS: usize = 17;
|
||||||
|
|
||||||
|
/// Deterministic, libm-free 64-bit linear congruential generator.
///
/// The multiplier/increment pair is the one Knuth published for MMIX. The state
/// update is pure wrapping integer arithmetic, so the sequence is bit-identical
/// on every platform.
struct Lcg(u64);

impl Lcg {
    /// LCG multiplier (Knuth, MMIX).
    const MULTIPLIER: u64 = 6364136223846793005;
    /// LCG increment (Knuth, MMIX).
    const INCREMENT: u64 = 1442695040888963407;

    /// Advances the state by one step and returns its upper 32 bits.
    fn next_u32(&mut self) -> u32 {
        self.0 = self
            .0
            .wrapping_mul(Self::MULTIPLIER)
            .wrapping_add(Self::INCREMENT);
        // A u64 shifted right by 32 has at most 32 significant bits, so this
        // cast never discards information.
        (self.0 >> 32) as u32
    }

    /// Uniform `f32` in [0, 1) on a 1e-6 grid.
    ///
    /// The numerator is an integer below 1_000_000, which `f32` represents
    /// exactly; the only float operation is a single correctly-rounded
    /// division, so no libm routine is involved.
    fn unit(&mut self) -> f32 {
        let micro = self.next_u32() % 1_000_000;
        micro as f32 / 1_000_000.0
    }
}
|
||||||
|
|
||||||
|
/// Build the canonical fixture: ground-truth keypoints in [0.2,0.8] and
|
||||||
|
/// predictions = GT + a small, deterministic offset, so PCK/OKS land in a
|
||||||
|
/// stable mid-high band (not trivially 0 or 1). Identical on every platform.
|
||||||
|
fn build_fixture() -> (Vec<Array2<f32>>, Vec<Array2<f32>>, Vec<Array1<f32>>, Vec<f32>) {
|
||||||
|
let mut rng = Lcg(42);
|
||||||
|
let mut gt = Vec::with_capacity(N_FRAMES);
|
||||||
|
let mut pred = Vec::with_capacity(N_FRAMES);
|
||||||
|
let mut vis = Vec::with_capacity(N_FRAMES);
|
||||||
|
let mut scale = Vec::with_capacity(N_FRAMES);
|
||||||
|
|
||||||
|
for _ in 0..N_FRAMES {
|
||||||
|
let mut g = Array2::<f32>::zeros((N_KPTS, 2));
|
||||||
|
let mut p = Array2::<f32>::zeros((N_KPTS, 2));
|
||||||
|
let mut v = Array1::<f32>::ones(N_KPTS);
|
||||||
|
for k in 0..N_KPTS {
|
||||||
|
let gx = 0.2 + 0.6 * rng.unit();
|
||||||
|
let gy = 0.2 + 0.6 * rng.unit();
|
||||||
|
// Deterministic prediction offset: small for most kpts, larger for a
|
||||||
|
// few, so PCK is a believable fraction (~0.6-0.8) rather than 1.0.
|
||||||
|
let ox = (rng.unit() - 0.5) * 0.06;
|
||||||
|
let oy = (rng.unit() - 0.5) * 0.06;
|
||||||
|
g[[k, 0]] = gx;
|
||||||
|
g[[k, 1]] = gy;
|
||||||
|
p[[k, 0]] = (gx + ox).clamp(0.0, 1.0);
|
||||||
|
p[[k, 1]] = (gy + oy).clamp(0.0, 1.0);
|
||||||
|
// Occlude ~10% deterministically.
|
||||||
|
if rng.next_u32() % 10 == 0 {
|
||||||
|
v[k] = 0.0;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
gt.push(g);
|
||||||
|
pred.push(p);
|
||||||
|
vis.push(v);
|
||||||
|
scale.push(1.0);
|
||||||
|
}
|
||||||
|
(pred, gt, vis, scale)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Canonical, libm-stable byte form of the result for hashing.
|
||||||
|
/// Each metric → coarse fixed-point so ~1e-7 platform noise can't flip the hash.
|
||||||
|
fn canonical_bytes(r: &JointErrorResult) -> Vec<u8> {
|
||||||
|
let mut b = Vec::new();
|
||||||
|
b.extend_from_slice(b"AA-SCORE-v0");
|
||||||
|
b.extend_from_slice(&AA_HARNESS_VERSION.to_le_bytes());
|
||||||
|
let q = |x: f32, scale: f32| -> u32 { (x.max(0.0) * scale).round() as u32 };
|
||||||
|
b.extend_from_slice(&q(r.pck_all, 1e3).to_le_bytes());
|
||||||
|
b.extend_from_slice(&q(r.pck_torso, 1e3).to_le_bytes());
|
||||||
|
b.extend_from_slice(&q(r.oks, 1e3).to_le_bytes());
|
||||||
|
b.extend_from_slice(&q(r.jitter_rms_m, 1e4).to_le_bytes());
|
||||||
|
b.extend_from_slice(&q(r.max_error_p95_m, 1e4).to_le_bytes());
|
||||||
|
b.push(r.passes as u8);
|
||||||
|
b
|
||||||
|
}
|
||||||
|
|
||||||
|
fn hash_hex(bytes: &[u8]) -> String {
|
||||||
|
let mut h = Sha256::new();
|
||||||
|
h.update(bytes);
|
||||||
|
h.finalize().iter().map(|x| format!("{x:02x}")).collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn main() -> ExitCode {
|
||||||
|
let args: Vec<String> = env::args().collect();
|
||||||
|
let mode_json = args.iter().any(|a| a == "--json");
|
||||||
|
let mode_gen = args.iter().any(|a| a == "--generate-hash");
|
||||||
|
|
||||||
|
let (pred, gt, vis, scale) = build_fixture();
|
||||||
|
let result = evaluate_joint_error(&pred, >, &vis, &scale, &JointErrorThresholds::default());
|
||||||
|
let proof = hash_hex(&canonical_bytes(&result));
|
||||||
|
|
||||||
|
if mode_gen {
|
||||||
|
// Emit just the hash (stdout) for redirection into expected_score.sha256.
|
||||||
|
println!("{proof}");
|
||||||
|
return ExitCode::SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
if mode_json {
|
||||||
|
// One leaderboard-ledger-shaped row (ADR-149 §2.2).
|
||||||
|
println!(
|
||||||
|
"{{\"category\":\"pose\",\"harness_version\":{},\"pck_all\":{:.4},\"pck_torso\":{:.4},\"oks\":{:.4},\"jitter_rms_m\":{:.5},\"max_error_p95_m\":{:.5},\"pose_passes\":{},\"proof_sha256\":\"{}\"}}",
|
||||||
|
AA_HARNESS_VERSION,
|
||||||
|
result.pck_all, result.pck_torso, result.oks,
|
||||||
|
result.jitter_rms_m, result.max_error_p95_m, result.passes, proof
|
||||||
|
);
|
||||||
|
return ExitCode::SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Default: verify against the committed expected hash (CI gate).
|
||||||
|
let expected_path = concat!(env!("CARGO_MANIFEST_DIR"), "/../../../aether-arena/fixtures/expected_score.sha256");
|
||||||
|
let expected = std::fs::read_to_string(expected_path)
|
||||||
|
.ok()
|
||||||
|
.map(|s| s.trim().to_string());
|
||||||
|
|
||||||
|
println!("AA pose score: PCK_all={:.4} PCK_torso={:.4} OKS={:.4} jitter={:.5}m p95={:.5}m passes={}",
|
||||||
|
result.pck_all, result.pck_torso, result.oks, result.jitter_rms_m, result.max_error_p95_m, result.passes);
|
||||||
|
println!("AA proof sha256: {proof}");
|
||||||
|
|
||||||
|
match expected {
|
||||||
|
Some(exp) if exp == proof => {
|
||||||
|
println!("VERDICT: PASS (determinism hash matches expected)");
|
||||||
|
ExitCode::SUCCESS
|
||||||
|
}
|
||||||
|
Some(exp) => {
|
||||||
|
eprintln!("VERDICT: FAIL — scorer drift detected.\n expected: {exp}\n actual: {proof}");
|
||||||
|
eprintln!("If this change to the scoring maths is intentional, regenerate with --generate-hash and review the diff.");
|
||||||
|
ExitCode::FAILURE
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
eprintln!("VERDICT: NO-EXPECTED-HASH — {expected_path} missing. Generate with --generate-hash.");
|
||||||
|
ExitCode::FAILURE
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user