# wifi-ruview/aether-arena/calibration/test_calibration.py

"""Self-contained regression test for the RuView calibration service.

Exercises the committed CLI end-to-end on synthetic data (CPU, no GPU, no real checkpoint):
  build a base -> calibrate.py fits an adapter -> infer.py runs base+adapter -> assert the
  adapter is small, inference is shape-correct and finite, and the adapter actually changes output.

Run:  python test_calibration.py    (or via pytest)
"""
import json
import subprocess
import sys
import tempfile
from pathlib import Path

import numpy as np
import torch

# Directory that contains this test file; _run() resolves script names against it.
HERE = Path(__file__).parent
# Put that directory at the front of sys.path before importing `model`
# (presumably model.py sits next to this file — confirm), so the import below
# does not depend on the current working directory.
sys.path.insert(0, str(HERE))
from model import PoseNet, standardize  # noqa: E402


def _make_base(path: Path):
    """Save a newly constructed PoseNet's state dict to *path* as the base checkpoint.

    torch's global RNG is seeded with 0 immediately before the network is built.
    The ``gr.A`` entry (the deterministic buffer) is dropped from the saved dict,
    mirroring the published checkpoint; calibrate.py/infer.py load with strict=False.
    """
    torch.manual_seed(0)
    full_state = PoseNet().state_dict()
    trimmed_state = {}
    for name, tensor in full_state.items():
        if name == "gr.A":
            continue
        trimmed_state[name] = tensor
    torch.save(trimmed_state, path)


def _make_data(path: Path, n: int, seed: int):
    """Write a synthetic dataset of *n* samples to *path* with ``np.savez``.

    Arrays stored in the archive:
      X -- float32, shape (n, 3, 114, 10), standard-normal draws
      Y -- float32, shape (n, 17, 2), uniform draws (keypoints in [0,1])

    Both arrays come from a single ``default_rng(seed)`` generator, with X drawn
    before Y, so a given (n, seed) pair always produces the same file contents.
    """
    gen = np.random.default_rng(seed)
    features = gen.standard_normal(size=(n, 3, 114, 10))
    keypoints = gen.random(size=(n, 17, 2))
    np.savez(
        path,
        X=features.astype(np.float32),
        Y=keypoints.astype(np.float32),
    )


def _run(*args):
    """Run a script from this file's directory and return its captured stdout.

    ``args[0]`` is the script file name, resolved against ``HERE`` and launched
    with the current interpreter (``sys.executable``). Every remaining argument
    is converted with ``str`` and passed on the command line. stdout and stderr
    are captured as text; a non-zero exit status fails an assertion whose
    message includes both streams.
    """
    cmd = [sys.executable, str(HERE / args[0])]
    cmd.extend(str(extra) for extra in args[1:])
    r = subprocess.run(cmd, capture_output=True, text=True)
    assert r.returncode == 0, f"{args[0]} failed:\n{r.stdout}\n{r.stderr}"
    return r.stdout


def test_calibration_end_to_end():
    """Drive calibrate.py and infer.py on synthetic data and check their outputs.

    Everything happens inside a temporary directory:
      * a base checkpoint, a 40-sample calibration set and a 16-sample frame set
        are written;
      * calibrate.py (50 iters, CPU) must write an adapter file, mention "saved"
        on stdout, stay under 200,000 bytes and contain at least one array whose
        name ends in ``.A`` or ``.B``;
      * infer.py with that adapter must write keypoints of shape (16, 17, 2) that
        are finite and lie in [0, 1];
      * PoseNet evaluated in-process before and after loading the adapter arrays
        must give different outputs.
    """
    n_frames = 16

    with tempfile.TemporaryDirectory() as tmp:
        workdir = Path(tmp)
        base = workdir / "base.pt"
        calib = workdir / "calib.npz"
        frames = workdir / "frames.npz"
        adapter = workdir / "room.adapter.npz"
        kp = workdir / "kp.npy"

        _make_base(base)
        _make_data(calib, n=40, seed=1)  # ≥20 → no underfit warning
        _make_data(frames, n=n_frames, seed=2)

        # 1) calibrate -> adapter
        calibrate_stdout = _run(
            "calibrate.py",
            "--base", base,
            "--data", calib,
            "--out", adapter,
            "--iters", "50",
            "--device", "cpu",
        )
        assert adapter.exists(), "adapter not written"
        assert "saved" in calibrate_stdout.lower()
        sz = adapter.stat().st_size
        assert sz < 200_000, f"adapter unexpectedly large ({sz} bytes)"

        # Copy the LoRA arrays out of the archive and close it right away:
        # np.load keeps a lazy file handle, which would block tempdir cleanup
        # on Windows.
        with np.load(adapter) as z:
            keys = [k for k in z.files if k.endswith((".A", ".B"))]
            assert keys, f"adapter has no LoRA tensors: {z.files}"
            lora = {}
            for k in keys:
                lora[k] = z[k].astype(np.float32)

        # 2) infer with adapter -> keypoints
        _run(
            "infer.py",
            "--base", base,
            "--adapter", adapter,
            "--data", frames,
            "--out", kp,
            "--device", "cpu",
        )
        out_kp = np.load(kp)
        assert out_kp.shape == (n_frames, 17, 2), f"bad keypoint shape {out_kp.shape}"
        assert np.isfinite(out_kp).all(), "non-finite keypoints"
        assert ((out_kp >= 0) & (out_kp <= 1)).all(), "keypoints out of [0,1]"

        # 3) adapter must actually change the output vs the zero-shot base
        with np.load(frames) as fz:
            frames_x = fz["X"][:]

        net = PoseNet()
        base_state = torch.load(base, map_location="cpu")
        net.load_state_dict(base_state, strict=False)
        net.eval()
        x = standardize(torch.tensor(frames_x))

        def _predict():
            # Forward pass without gradient tracking, reshaped to per-frame keypoints.
            with torch.no_grad():
                return net(x).reshape(n_frames, 17, 2).numpy()

        base_kp = _predict()

        net.add_lora()
        net.load_lora(lora)
        net.eval()
        cal_kp = _predict()

        total_shift = np.abs(base_kp - cal_kp).sum()
        assert total_shift > 1e-4, "adapter did not change output"


if __name__ == "__main__":
    # Script mode (python test_calibration.py): run the one test directly,
    # without pytest. A failed assertion raises before the PASS line is printed.
    test_calibration_end_to_end()
    print("PASS: calibration service end-to-end (calibrate -> adapter -> infer)")