proper tests

2026-04-10 18:29:58 -03:00
parent e906b0a963
commit ea9dbf8772
16 changed files with 1077 additions and 15 deletions
--- a/ctrl/bench.py
+++ b/ctrl/bench.py
@@ -0,0 +1,372 @@
+#!/usr/bin/env python3
+"""Post-hoc session benchmark — extract timing metrics from session logs.
+
+Usage:
+    python ctrl/bench.py --session data/sessions/20260410_160441
+    python ctrl/bench.py --latest              # pick most recent session
+    python ctrl/bench.py --latest --json       # machine-readable output
+
+Parses telemetry.jsonl, session.log, frames/index.json, and (if present)
+media/logs/server.log. No live session required — works on finished sessions.
+
+Metrics:
+    M1  Full startup      Connect → first transcript
+    M1a   → first duration update
+    M1b   → first scene frame
+    M1c   → first audio chunk
+    M1d   → first transcript
+    M5  Audio lag          How far audio extraction trails real-time
+    M6  Transcript lag     Time from audio ready to transcript done
+    M7  Frame throughput   Scene frames per minute
+    M9  Recorder health    Unexpected restarts / segment rotations
+"""
+
+import argparse
+import json
+import logging
+import re
+import sys
+from datetime import datetime
+from pathlib import Path
+
+# Module logger; handlers are configured in main().
+log = logging.getLogger("bench")
+
+# This file lives in <project>/ctrl/, so the project root is two levels up.
+PROJECT_DIR = Path(__file__).resolve().parents[1]
+DATA_DIR = PROJECT_DIR.joinpath("data")
+SESSIONS_DIR = DATA_DIR.joinpath("sessions")
+
+
+def parse_log_time(line: str) -> float | None:
+    """Return the leading ``HH:MM:SS`` stamp of *line* as seconds since midnight.
+
+    Only the start of the line is inspected; ``None`` is returned when it does
+    not begin with three colon-separated two-digit groups.
+    """
+    stamp = re.match(r"(\d{2}):(\d{2}):(\d{2})", line)
+    if stamp is None:
+        return None
+    hours, minutes, seconds = (int(part) for part in stamp.groups())
+    return hours * 3600 + minutes * 60 + seconds
+
+
+def load_telemetry(session_dir: Path) -> list[dict]:
+    """Load ``telemetry.jsonl`` from *session_dir* as a list of JSON objects.
+
+    Blank lines, lines that are not valid JSON, and JSON values that are not
+    objects are skipped, so callers can safely use ``entry.get(...)`` on every
+    element. Returns an empty list when the file does not exist.
+    """
+    path = session_dir / "telemetry.jsonl"
+    if not path.exists():
+        return []
+    entries = []
+    for line in path.read_text(encoding="utf-8").splitlines():
+        if not line.strip():
+            continue
+        try:
+            entry = json.loads(line)
+        except json.JSONDecodeError:
+            # Best effort: tolerate malformed lines (e.g. a partially written one).
+            continue
+        # A bare number/list/string is valid JSON but not a telemetry event.
+        if isinstance(entry, dict):
+            entries.append(entry)
+    return entries
+
+
+def load_session_log(session_dir: Path) -> list[str]:
+    """Return the lines of ``session.log`` in *session_dir*, or ``[]`` if absent."""
+    log_path = session_dir / "session.log"
+    if not log_path.exists():
+        return []
+    return log_path.read_text().splitlines()
+
+
+def load_frames_index(session_dir: Path) -> list[dict]:
+    """Read ``frames/index.json`` under *session_dir*.
+
+    Returns the decoded JSON value, or an empty list when the file is missing
+    or cannot be decoded.
+    """
+    index_path = session_dir.joinpath("frames", "index.json")
+    if index_path.exists():
+        try:
+            return json.loads(index_path.read_text())
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            pass
+    return []
+
+
+def find_first_line(lines: list[str], pattern: str) -> tuple[float | None, str | None]:
+    """Locate the first entry of *lines* that contains *pattern* as a substring.
+
+    Returns ``(seconds_since_midnight, line)`` for the hit, where the time is
+    ``None`` if the line has no leading timestamp, or ``(None, None)`` when
+    nothing matches.
+    """
+    hit = next((candidate for candidate in lines if pattern in candidate), None)
+    if hit is None:
+        return None, None
+    return parse_log_time(hit), hit
+
+
+_SECONDS_PER_DAY = 24 * 3600
+
+
+def _elapsed_s(t: float, t_start: float) -> float:
+    """Return seconds from *t_start* to *t*, both in seconds since midnight.
+
+    session.log timestamps carry no date, so a later line can have a smaller
+    value than an earlier one when the session crosses midnight. Wrapping
+    modulo one day keeps such deltas positive.
+    """
+    return (t - t_start) % _SECONDS_PER_DAY
+
+
+def extract_metrics(session_dir: Path) -> dict:
+    """Compute benchmark metrics for the session stored in *session_dir*.
+
+    Inputs are telemetry.jsonl, session.log and frames/index.json; any of them
+    may be missing, in which case the metrics that depend on it are omitted.
+
+    Log-derived timings (M1*, going_live_s, M5*) are measured from the
+    timestamp of the first session.log line and are only produced when
+    telemetry contains a ``session_start`` event. Deltas are wrapped at
+    midnight because log lines only carry a time of day.
+
+    Returns a flat dict. ``session_duration_s``, ``M9_*``, ``scene_mode`` and
+    ``transport`` are always present; all other keys appear only when their
+    source data was found.
+    """
+    tel = load_telemetry(session_dir)
+    log_lines = load_session_log(session_dir)
+    frames = load_frames_index(session_dir)
+
+    metrics = {}
+
+    # Session start: time of the first log line, used only when telemetry
+    # recorded a session_start event.
+    t_start = None
+    if log_lines and any(entry.get("name") == "session_start" for entry in tel):
+        t_start = parse_log_time(log_lines[0])
+
+    # Session end: the last session_end event wins if there are several.
+    session_duration = None
+    for entry in tel:
+        if entry.get("name") == "session_end":
+            session_duration = entry.get("t")
+    metrics["session_duration_s"] = session_duration
+
+    def since_start(marker: str) -> float | None:
+        """Seconds from session start to the first log line containing *marker*."""
+        t_marker, _ = find_first_line(log_lines, marker)
+        if t_start is None or t_marker is None:
+            return None
+        return _elapsed_s(t_marker, t_start)
+
+    # M1a-M1d: start → first occurrence of each startup milestone.
+    startup_markers = (
+        ("M1a_first_duration_s", "Duration:"),
+        ("M1b_first_scene_frame_s", "Scene frame:"),
+        ("M1c_first_audio_chunk_s", "Audio chunk:"),
+        ("M1d_first_transcript_s", "faster_whisper: Processing audio"),
+    )
+    for key, marker in startup_markers:
+        elapsed = since_start(marker)
+        if elapsed is not None:
+            metrics[key] = elapsed
+
+    # M1: full startup = start → first transcript
+    if "M1d_first_transcript_s" in metrics:
+        metrics["M1_full_startup_s"] = metrics["M1d_first_transcript_s"]
+
+    # Going LIVE time
+    going_live = since_start("Going LIVE")
+    if going_live is not None:
+        metrics["going_live_s"] = going_live
+
+    # M5: Audio extraction lag
+    # Parse "Audio chunk: chunk_N (Xs → Ys, global Zs)" lines; lag is how far
+    # the wall clock is ahead of the chunk's end position (Y).
+    audio_chunk_re = re.compile(
+        r"Audio chunk: \S+ \(([\d.]+)s → ([\d.]+)s, global ([\d.]+)s\)"
+    )
+    audio_lags = []
+    for line in log_lines:
+        m = audio_chunk_re.search(line)
+        if not m:
+            continue
+        end_time = float(m[2])
+        log_t = parse_log_time(line)
+        if log_t is None or t_start is None:
+            continue
+        lag = _elapsed_s(log_t, t_start) - end_time
+        # Negative lag would mean audio ahead of real time; such samples are dropped.
+        if lag >= 0:
+            audio_lags.append(lag)
+    if audio_lags:
+        metrics["M5_audio_lag_avg_s"] = round(sum(audio_lags) / len(audio_lags), 1)
+        metrics["M5_audio_lag_max_s"] = round(max(audio_lags), 1)
+        metrics["M5_audio_lag_min_s"] = round(min(audio_lags), 1)
+        metrics["M5_audio_chunk_count"] = len(audio_lags)
+
+    # M6: Transcription lag
+    # Parse faster_whisper "Processing audio with duration MM:SS.mmm" or "HH:MM:SS.mmm".
+    # NOTE(review): the value captured is the duration of the audio handed to
+    # whisper, not how long whisper took — confirm the metric name is intended.
+    whisper_mmss_re = re.compile(
+        r"faster_whisper: Processing audio with duration (\d+):([\d.]+)$"
+    )
+    whisper_hhmmss_re = re.compile(
+        r"faster_whisper: Processing audio with duration (\d+):(\d+):([\d.]+)"
+    )
+    transcript_durations = []
+    for line in log_lines:
+        # MM:SS.mmm format (e.g., 00:06.145)
+        m = whisper_mmss_re.search(line)
+        if m:
+            transcript_durations.append(int(m[1]) * 60 + float(m[2]))
+            continue
+        # HH:MM:SS.mmm format
+        m = whisper_hhmmss_re.search(line)
+        if m:
+            transcript_durations.append(int(m[1]) * 3600 + int(m[2]) * 60 + float(m[3]))
+    if transcript_durations:
+        metrics["M6_whisper_processing_avg_s"] = round(
+            sum(transcript_durations) / len(transcript_durations), 1
+        )
+        metrics["M6_transcript_count"] = len(transcript_durations)
+
+    # M7: Frame throughput
+    if frames and session_duration and session_duration > 0:
+        minutes = session_duration / 60
+        metrics["M7_frame_throughput_per_min"] = round(len(frames) / minutes, 1)
+        metrics["M7_total_frames"] = len(frames)
+
+    # M9: Recorder health
+    metrics["M9_recorder_restarts"] = sum(1 for line in log_lines if "Recorder died" in line)
+    metrics["M9_segment_rotations"] = sum(1 for line in log_lines if "Restarting recorder" in line)
+
+    # Log markers shared by the scene-mode and transport heuristics below.
+    has_recorder_scene = any("Recorder+scene: pid=" in line for line in log_lines)
+    has_run_async = any("run_async:" in line for line in log_lines)
+
+    # Scene detection mode
+    if any("Scene detector: connecting" in line for line in log_lines):
+        metrics["scene_mode"] = "rust_relay"
+    elif has_recorder_scene or has_run_async:
+        metrics["scene_mode"] = "python_single_process"
+    else:
+        metrics["scene_mode"] = "unknown"
+
+    # Transport mode — check for Rust-specific markers
+    if any("Rust session dir" in line or "Attached to Rust session" in line for line in log_lines):
+        metrics["transport"] = "rust"
+    elif has_recorder_scene:
+        metrics["transport"] = "python"
+    elif (session_dir / "stream" / "audio.aac").exists():
+        # File signature: Rust writes audio.aac separately, Python muxes into fMP4
+        metrics["transport"] = "rust"
+    elif has_run_async:
+        metrics["transport"] = "python"
+    else:
+        metrics["transport"] = "unknown"
+
+    return metrics
+
+
+def print_report(session_dir: Path, metrics: dict):
+    """Log a human-readable table of *metrics* for the session in *session_dir*.
+
+    Metrics that are missing or ``None`` are shown as ``-``.
+    """
+    banner = "=" * 60
+    log.info(banner)
+    log.info("  CHT Benchmark Report")
+    log.info("  Session: %s", session_dir.name)
+    log.info("  Transport: %s", metrics.get("transport", "?"))
+    log.info("  Scene mode: %s", metrics.get("scene_mode", "?"))
+    log.info("  Duration: %ss", metrics.get("session_duration_s", "?"))
+    log.info(banner)
+
+    # (metric code, row label, key in `metrics`, unit suffix)
+    table = (
+        ("M1", "Full startup", "M1_full_startup_s", "s"),
+        ("M1a", "  → first duration", "M1a_first_duration_s", "s"),
+        ("M1b", "  → first scene frame", "M1b_first_scene_frame_s", "s"),
+        ("M1c", "  → first audio chunk", "M1c_first_audio_chunk_s", "s"),
+        ("M1d", "  → first transcript", "M1d_first_transcript_s", "s"),
+        ("", "  → going live", "going_live_s", "s"),
+        ("M5", "Audio lag (avg)", "M5_audio_lag_avg_s", "s"),
+        ("M5", "Audio lag (max)", "M5_audio_lag_max_s", "s"),
+        ("M5", "Audio chunks", "M5_audio_chunk_count", ""),
+        ("M6", "Whisper processing (avg)", "M6_whisper_processing_avg_s", "s"),
+        ("M6", "Transcripts produced", "M6_transcript_count", ""),
+        ("M7", "Frame throughput", "M7_frame_throughput_per_min", "/min"),
+        ("M7", "Total frames", "M7_total_frames", ""),
+        ("M9", "Recorder restarts", "M9_recorder_restarts", ""),
+        ("M9", "Segment rotations", "M9_segment_rotations", ""),
+    )
+
+    for code, label, key, unit in table:
+        value = metrics.get(key)
+        if value is None:
+            log.info("  %4s %28s -", code, label)
+            continue
+        log.info("  %4s %28s %s%s", code, label, value, unit)
+
+
+def compare_ground_truth(session_dir: Path, gt: dict) -> dict:
+    """Match every ground-truth scene change to its nearest detected frame.
+
+    Reads frame timestamps from the session's frames index and expected
+    timestamps from ``gt["scenes"]``. Returns a summary dict with per-scene
+    matches, the mean signed delta, the largest absolute delta, and the number
+    of scenes whose nearest frame is more than 10 s away. When either side is
+    empty, returns ``{"error": ...}`` instead.
+    """
+    frames = load_frames_index(session_dir)
+    gt_scenes = gt.get("scenes", [])
+    if not (frames and gt_scenes):
+        return {"error": "no frames or no ground truth scenes"}
+
+    detected_ts = sorted(frame["timestamp"] for frame in frames)
+    expected_ts = sorted(scene["timestamp_s"] for scene in gt_scenes)
+
+    def nearest(target):
+        """Return (detected_ts, signed_delta) of the frame closest to *target*."""
+        winner, winner_delta = None, float("inf")
+        for candidate in detected_ts:
+            offset = candidate - target
+            if abs(offset) < abs(winner_delta):
+                winner, winner_delta = candidate, offset
+        return winner, winner_delta
+
+    matches = []
+    for target in expected_ts:
+        found, offset = nearest(target)
+        rounded = None if found is None else round(offset, 3)
+        matches.append({"expected_s": target, "detected_s": found, "delta_s": rounded})
+
+    deltas = [match["delta_s"] for match in matches if match["delta_s"] is not None]
+    if deltas:
+        avg_delta = round(sum(deltas) / len(deltas), 3)
+        max_delta = round(max(abs(d) for d in deltas), 3)
+    else:
+        avg_delta = max_delta = None
+
+    missed = 0
+    for match in matches:
+        if match["delta_s"] is None or abs(match["delta_s"]) > 10:
+            missed += 1
+
+    return {
+        "expected_scenes": len(expected_ts),
+        "detected_frames": len(detected_ts),
+        "matches": matches,
+        "avg_delta_s": avg_delta,
+        "max_delta_s": max_delta,
+        "missed": missed,
+    }
+
+
+def print_ground_truth_report(gt: dict):
+    """Log the scene-detection comparison produced by compare_ground_truth().
+
+    *gt* is either the summary dict (``expected_scenes``, ``detected_frames``,
+    ``avg_delta_s``, ``max_delta_s``, ``missed``, ``matches``) or an
+    ``{"error": ...}`` dict, in which case only the error is reported.
+    """
+    log.info("")
+    log.info("  Scene detection vs ground truth:")
+    if "error" in gt:
+        # No comparison was possible; avoid printing a misleading all-zero report.
+        log.warning("    Comparison unavailable: %s", gt["error"])
+        return
+    log.info("    Expected scenes: %d", gt.get("expected_scenes", 0))
+    log.info("    Detected frames: %d", gt.get("detected_frames", 0))
+    if gt.get("avg_delta_s") is not None:
+        log.info("    Avg detection delta: %ss", gt["avg_delta_s"])
+        log.info("    Max detection delta: %ss", gt["max_delta_s"])
+    if gt.get("missed", 0) > 0:
+        log.warning("    Missed scenes: %d", gt["missed"])
+    for m in gt.get("matches", []):
+        status = "OK" if m["delta_s"] is not None and abs(m["delta_s"]) < 5 else "MISS"
+        det = f"{m['detected_s']:.1f}s" if m["detected_s"] is not None else "---"
+        # "+" format flag emits the real sign (a hard-coded "+" gave "+-1.2s").
+        delta = f"{m['delta_s']:+.1f}s" if m["delta_s"] is not None else ""
+        log.info("    %4s  expected=%5.1fs  detected=%s  %s", status, m["expected_s"], det, delta)
+
+
+def find_latest_session() -> Path | None:
+    """Return the last-sorting session directory that contains telemetry.jsonl.
+
+    Entries of SESSIONS_DIR are ordered by path in descending order; the first
+    directory holding a ``telemetry.jsonl`` file is returned. ``None`` when the
+    sessions directory is missing or has no such entry.
+    """
+    if not SESSIONS_DIR.exists():
+        return None
+    candidates = (
+        entry
+        for entry in sorted(SESSIONS_DIR.iterdir(), reverse=True)
+        if entry.is_dir() and (entry / "telemetry.jsonl").exists()
+    )
+    return next(candidates, None)
+
+
+def main():
+    """CLI entry point: compute metrics for one session, print and save them.
+
+    Picks the session from ``--latest`` or ``--session``, optionally compares
+    detected frames against a ground-truth JSON file, prints either a table
+    (via logging) or JSON (on stdout), and always writes the metrics to
+    ``data/bench/<session>.json``. Exits with status 1 when no session can be
+    resolved.
+    """
+    parser = argparse.ArgumentParser(description="CHT session benchmark")
+    parser.add_argument("--session", type=Path, help="Path to session directory")
+    parser.add_argument("--latest", action="store_true", help="Use most recent session")
+    parser.add_argument("--json", action="store_true", help="Output JSON instead of table")
+    parser.add_argument("--ground-truth", type=Path, help="Ground truth JSON for scene comparison")
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)-7s %(name)s: %(message)s",
+        datefmt="%H:%M:%S",
+    )
+
+    # --latest takes precedence over --session when both are given.
+    if args.latest:
+        session_dir = find_latest_session()
+        if not session_dir:
+            log.error("No sessions found")
+            sys.exit(1)
+    elif args.session:
+        session_dir = args.session
+    else:
+        parser.print_help()
+        sys.exit(1)
+
+    if not session_dir.exists():
+        log.error("Session not found: %s", session_dir)
+        sys.exit(1)
+
+    metrics = extract_metrics(session_dir)
+    metrics["session_id"] = session_dir.name
+
+    # Ground truth comparison
+    if args.ground_truth:
+        if args.ground_truth.exists():
+            gt = json.loads(args.ground_truth.read_text())
+            metrics["ground_truth"] = compare_ground_truth(session_dir, gt)
+        else:
+            # Say so explicitly: a silently skipped comparison looks like a
+            # successful run with no scene results.
+            log.warning("Ground truth file not found, skipping comparison: %s",
+                        args.ground_truth)
+
+    if args.json:
+        sys.stdout.write(json.dumps(metrics, indent=2) + "\n")
+    else:
+        print_report(session_dir, metrics)
+        if "ground_truth" in metrics:
+            print_ground_truth_report(metrics["ground_truth"])
+
+    # Save report
+    bench_dir = DATA_DIR / "bench"
+    bench_dir.mkdir(parents=True, exist_ok=True)
+    report_path = bench_dir / f"{session_dir.name}.json"
+    report_path.write_text(json.dumps(metrics, indent=2))
+
+
+if __name__ == "__main__":
+    main()
--- a/ctrl/bench_delay.py
+++ b/ctrl/bench_delay.py
@@ -0,0 +1,164 @@
+#!/usr/bin/env python3
+"""Live scene detection latency benchmark (M4).
+
+Measures time from a triggered visual change on the sender to a new JPEG
+appearing in the receiver's frames/ directory.
+
+Usage (run on receiver, sender accessible via SSH):
+    python ctrl/bench_delay.py --session-dir data/sessions/CURRENT --sender mariano@sender
+    python ctrl/bench_delay.py --frames-dir data/sessions/CURRENT/frames --sender mariano@sender
+
+How it works:
+1. Records the current frame count in frames/index.json
+2. SSH to sender, triggers a visual change (xdotool workspace switch)
+3. Polls frames/index.json for a new entry (or watches via mtime)
+4. Measures wall-clock difference = scene detection latency
+
+For repeated measurements, use --repeat N with --interval S between triggers.
+"""
+
+import argparse
+import json
+import logging
+import os
+import subprocess
+import sys
+import time
+from pathlib import Path
+
+log = logging.getLogger("bench_delay")
+
+
+def get_frame_count(frames_dir: Path) -> int:
+    """Return the number of entries in ``index.json`` inside *frames_dir*.
+
+    A missing or undecodable index counts as zero frames.
+    """
+    index_path = frames_dir / "index.json"
+    if index_path.exists():
+        try:
+            entries = json.loads(index_path.read_text())
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            return 0
+        return len(entries)
+    return 0
+
+
+def get_latest_frame_mtime(frames_dir: Path) -> float:
+    """Return the modification time of ``index.json`` in *frames_dir*, or 0.0 if absent."""
+    index_path = frames_dir / "index.json"
+    return index_path.stat().st_mtime if index_path.exists() else 0.0
+
+
+def trigger_scene_change(sender: str, method: str = "workspace") -> float:
+    """Trigger a visual change on the sender over SSH.
+
+    Args:
+        sender: SSH target (``user@host``).
+        method: ``"workspace"`` sends a super+Right key press via xdotool;
+            ``"color"`` opens a fullscreen red xterm for 0.5 s.
+
+    Returns:
+        ``time.time()`` taken immediately before the SSH command is launched,
+        so the latency measured by callers includes the SSH round trip.
+
+    An unknown *method* logs an error and exits the process with status 1.
+    SSH timeouts and non-zero exit statuses are logged as warnings only.
+    """
+    if method == "workspace":
+        # xdotool switch workspace — causes a full-screen visual change
+        cmd = ["ssh", sender, "DISPLAY=:0 xdotool key super+Right"]
+    elif method == "color":
+        # Flash a fullscreen color using xterm (more dramatic change)
+        cmd = ["ssh", sender,
+               "DISPLAY=:0 bash -c 'xterm -fullscreen -bg red -e sleep 0.5 &'"]
+    else:
+        log.error("Unknown trigger method: %s", method)
+        sys.exit(1)
+
+    wall = time.time()
+    try:
+        result = subprocess.run(cmd, timeout=5, capture_output=True)
+    except subprocess.TimeoutExpired:
+        log.warning("SSH trigger timed out")
+    else:
+        if result.returncode != 0:
+            # Without this, a failed trigger only surfaces as an unexplained
+            # frame timeout later on.
+            log.warning("SSH trigger failed (exit %d): %s", result.returncode,
+                        result.stderr.decode(errors="replace").strip())
+    return wall
+
+
+def wait_for_new_frame(frames_dir: Path, initial_count: int,
+                       timeout: float = 15.0, poll_interval: float = 0.1) -> float | None:
+    """Poll the frame index until it holds more than *initial_count* entries.
+
+    Returns ``time.time()`` at the moment the extra frame is observed, or
+    ``None`` if *timeout* seconds elapse first. The index is re-read every
+    *poll_interval* seconds.
+    """
+    give_up_at = time.monotonic() + timeout
+    while True:
+        if time.monotonic() >= give_up_at:
+            return None
+        if get_frame_count(frames_dir) > initial_count:
+            return time.time()
+        time.sleep(poll_interval)
+
+
+def run_measurement(frames_dir: Path, sender: str, method: str) -> dict:
+    """Run one trigger → new-frame measurement and return it as a dict.
+
+    The result always has ``trigger_wall``, ``latency_s`` and ``timed_out``;
+    ``detected_wall`` is present only when a frame arrived in time.
+    """
+    frames_before = get_frame_count(frames_dir)
+    trigger_wall = trigger_scene_change(sender, method)
+    detected_wall = wait_for_new_frame(frames_dir, frames_before)
+
+    outcome = {"trigger_wall": trigger_wall}
+    if detected_wall is None:
+        outcome.update(latency_s=None, timed_out=True)
+        return outcome
+
+    outcome.update(
+        detected_wall=detected_wall,
+        latency_s=round(detected_wall - trigger_wall, 3),
+        timed_out=False,
+    )
+    return outcome
+
+
+def main():
+    """CLI entry point: run repeated trigger → frame measurements and report M4.
+
+    Output is a JSON document on stdout with ``--json``, otherwise a short
+    avg/min/max summary via logging.
+    """
+    parser = argparse.ArgumentParser(description="Scene detection latency benchmark")
+    parser.add_argument("--frames-dir", type=Path, help="Path to frames/ directory")
+    parser.add_argument("--session-dir", type=Path, help="Path to session directory")
+    parser.add_argument("--sender", required=True, help="SSH target for sender (user@host)")
+    parser.add_argument("--method", default="workspace", choices=["workspace", "color"],
+                        help="How to trigger visual change")
+    parser.add_argument("--repeat", type=int, default=3, help="Number of measurements")
+    parser.add_argument("--interval", type=float, default=5.0, help="Seconds between triggers")
+    parser.add_argument("--json", action="store_true", help="Output JSON")
+    args = parser.parse_args()
+
+    logging.basicConfig(
+        level=logging.INFO,
+        format="%(asctime)s %(levelname)-7s %(name)s: %(message)s",
+        datefmt="%H:%M:%S",
+    )
+
+    if not (args.session_dir or args.frames_dir):
+        parser.error("Provide --frames-dir or --session-dir")
+        return
+    # --session-dir wins when both options are supplied.
+    frames_dir = args.session_dir / "frames" if args.session_dir else args.frames_dir
+
+    if not frames_dir.exists():
+        log.error("Frames dir not found: %s", frames_dir)
+        sys.exit(1)
+
+    results = []
+    for attempt in range(1, args.repeat + 1):
+        # Pause between triggers, but not before the first one.
+        if attempt > 1:
+            time.sleep(args.interval)
+        log.info("Trigger %d/%d...", attempt, args.repeat)
+        outcome = run_measurement(frames_dir, args.sender, args.method)
+        if outcome["timed_out"]:
+            log.warning("TIMEOUT (no frame in 15s)")
+        else:
+            log.info("  latency: %ss", outcome["latency_s"])
+        results.append(outcome)
+
+    latencies = [r["latency_s"] for r in results if r["latency_s"] is not None]
+    timeouts = sum(1 for r in results if r["timed_out"])
+
+    if args.json:
+        if latencies:
+            avg_s = round(sum(latencies) / len(latencies), 3)
+            min_s = round(min(latencies), 3)
+            max_s = round(max(latencies), 3)
+        else:
+            avg_s = min_s = max_s = None
+        summary = {
+            "count": len(latencies),
+            "avg_s": avg_s,
+            "min_s": min_s,
+            "max_s": max_s,
+            "timeouts": timeouts,
+        }
+        print(json.dumps({"measurements": results, "summary": summary}, indent=2))
+        return
+
+    log.info("M4 Scene detection latency:")
+    if latencies:
+        log.info("  avg: %.1fs", sum(latencies) / len(latencies))
+        log.info("  min: %.1fs", min(latencies))
+        log.info("  max: %.1fs", max(latencies))
+    if timeouts:
+        log.warning("  timeouts: %d/%d", timeouts, len(results))
+
+if __name__ == "__main__":
+    main()
--- a/ctrl/client.sh
+++ b/ctrl/client.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+# Launch the client (sender) in Python or Rust mode.
+#
+# Usage:
+#   ctrl/client.sh --python [RECEIVER_IP] [PORT]    # kmsgrab + mpegts (default port 4444)
+#   ctrl/client.sh --rust   [server_addr]            # Rust framed protocol (default mcrndeb:4447)
+#
+# With no mode flag the Python client is used. All remaining arguments are
+# forwarded unchanged to the selected client.
+set -euo pipefail
+
+ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+MODE="python"
+
+# An optional leading mode flag is consumed here; anything else is left in "$@".
+case "${1:-}" in
+    --python) MODE="python"; shift ;;
+    --rust)   MODE="rust"; shift ;;
+esac
+
+case "$MODE" in
+    rust) exec "$ROOT_DIR/media/ctrl/client.sh" "$@" ;;
+    *)    exec sudo python3 "$ROOT_DIR/sender/stream_av.py" "$@" ;;
+esac
--- a/ctrl/e2e_test.sh
+++ b/ctrl/e2e_test.sh
@@ -0,0 +1,116 @@
+#!/bin/bash
+# E2E benchmark test — fully automated, run from the SENDER machine.
+#
+# Starts everything via SSH, captures test video, collects results.
+#
+# Usage:
+#   ctrl/e2e_test.sh --python [--duration 30]
+#   ctrl/e2e_test.sh --rust   [--duration 30]
+#
+# Other options: --interval S, --receiver HOST, --play-delay S
+set -euo pipefail
+
+PROJECT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+cd "$PROJECT_DIR"
+
+DURATION=30
+INTERVAL=5
+RECEIVER="mcrndeb"
+RDIR="wdir/cht"
+MODE="python"
+PLAY_DELAY=3
+
+while [[ $# -gt 0 ]]; do
+    case $1 in
+        --python) MODE="python"; shift ;;
+        --rust) MODE="rust"; shift ;;
+        --duration) DURATION="$2"; shift 2 ;;
+        --interval) INTERVAL="$2"; shift 2 ;;
+        --receiver) RECEIVER="$2"; shift 2 ;;
+        --play-delay) PLAY_DELAY="$2"; shift 2 ;;
+        *) echo "Unknown arg: $1"; exit 1 ;;
+    esac
+done
+
+VIDEO="tests/fixtures/test_scene_${DURATION}s.mp4"
+GROUND_TRUTH="tests/fixtures/test_scene_${DURATION}s_ground_truth.json"
+
+# Generate test video if needed
+if [ ! -f "$VIDEO" ]; then
+    echo "=== Generating test video ==="
+    python3 ctrl/gen_test_video.py --duration "$DURATION" --interval "$INTERVAL"
+fi
+
+# Background PIDs to clean up. mpv is tracked on its own so it can be stopped
+# by itself once capture is finished.
+PIDS=()
+MPV_PID=""
+cleanup() {
+    echo "=== Cleaning up ==="
+    # ${arr[@]+...} avoids an "unbound variable" error under `set -u` on older
+    # bash versions when the array is still empty.
+    for pid in ${PIDS[@]+"${PIDS[@]}"}; do
+        kill "$pid" 2>/dev/null || true
+    done
+    # Kill sudo'd client. pkill patterns are extended regexes, so alternation
+    # is a plain `|` (an escaped `\|` would match a literal pipe character).
+    sudo pkill -f "stream_av|cht-client" 2>/dev/null || true
+    # Stop remote processes
+    ssh "$RECEIVER" "pkill -f 'cht-server|cht.app' 2>/dev/null" || true
+    wait 2>/dev/null || true
+}
+# Run cleanup once, on exit. INT/TERM are converted into an exit so the script
+# does not continue with later steps after being interrupted.
+trap cleanup EXIT
+trap 'exit 130' INT
+trap 'exit 143' TERM
+
+echo "=== E2E test: $MODE mode, ${DURATION}s ==="
+
+# Step 1: Start receiver side (on $RECEIVER via SSH)
+if [ "$MODE" = "rust" ]; then
+    echo "--- Starting Rust server on $RECEIVER ---"
+    ssh -tt "$RECEIVER" "cd $RDIR && ctrl/server.sh" &
+    PIDS+=($!)
+    sleep 2
+
+    echo "--- Starting app on $RECEIVER (rust transport, auto-connect) ---"
+    ssh -tt "$RECEIVER" "cd $RDIR && CHT_AUTO_CONNECT=1 CHT_RUST_TRANSPORT=1 ctrl/app.sh" &
+    PIDS+=($!)
+    sleep 3
+else
+    echo "--- Starting app on $RECEIVER (python transport, auto-connect) ---"
+    ssh -tt "$RECEIVER" "cd $RDIR && CHT_AUTO_CONNECT=1 CHT_RUST_TRANSPORT=0 ctrl/app.sh" &
+    PIDS+=($!)
+    sleep 3
+fi
+
+# Step 2: Play test video fullscreen on sender
+echo "--- Playing test video fullscreen ---"
+mpv --fullscreen --loop-file=inf --no-terminal "$VIDEO" &
+MPV_PID=$!
+PIDS+=("$MPV_PID")
+sleep "$PLAY_DELAY"
+
+# Step 3: Start client (sender)
+echo "--- Starting $MODE client → $RECEIVER ---"
+if [ "$MODE" = "rust" ]; then
+    ctrl/client.sh --rust "${RECEIVER}:4447" &
+else
+    ctrl/client.sh --python "$RECEIVER" &
+fi
+PIDS+=($!)
+
+# Step 4: Wait for capture + processing
+WAIT=$(( DURATION + 15 ))
+echo "--- Waiting ${WAIT}s for capture + processing ---"
+sleep "$WAIT"
+
+# Step 5: Stop sender side
+echo "--- Stopping sender ---"
+sudo pkill -f "stream_av|cht-client" 2>/dev/null || true
+# Stop mpv by its recorded PID (the last PIDS entry is the client, not mpv).
+kill "$MPV_PID" 2>/dev/null || true
+sleep 2
+
+# Step 6: Stop receiver side
+echo "--- Stopping receiver ---"
+ssh "$RECEIVER" "pkill -f 'cht.app' 2>/dev/null" || true
+sleep 2
+ssh "$RECEIVER" "pkill -f 'cht-server' 2>/dev/null" || true
+sleep 1
+
+# Step 7: Benchmark
+echo ""
+echo "=== Benchmark results ($MODE) ==="
+ssh "$RECEIVER" "cd $RDIR && python3 ctrl/bench.py --latest --ground-truth $GROUND_TRUTH"
+
+echo ""
+echo "JSON: ssh $RECEIVER 'cd $RDIR && python3 ctrl/bench.py --latest --json --ground-truth $GROUND_TRUTH'"
--- a/ctrl/gen_test_video.py
+++ b/ctrl/gen_test_video.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+"""Generate a test video with known scene changes and audio markers.
+
+The video is played fullscreen on the sender while kmsgrab captures it —
+simulating a meeting with deterministic, reproducible content.
+
+- Scene changes: solid color blocks every INTERVAL seconds
+- Overlay: large elapsed-seconds counter for visual sync measurement
+- Audio: looped speech sample (Harvard sentences), so transcription runs on real speech
+
+Outputs:
+    tests/fixtures/test_scene_30s.mp4
+    tests/fixtures/test_scene_30s_ground_truth.json
+
+Usage:
+    python ctrl/gen_test_video.py [--duration 30] [--interval 5]
+"""
+
+import argparse
+import json
+import logging
+import subprocess
+import sys
+from pathlib import Path
+
+# Module logger; handlers are configured in main().
+log = logging.getLogger("gen_test_video")
+
+# This file lives in <project>/ctrl/, so the project root is two levels up.
+PROJECT_DIR = Path(__file__).resolve().parents[1]
+FIXTURES_DIR = PROJECT_DIR.joinpath("tests", "fixtures")
+
+# Scene colors (RGB hex), cycled through by successive scenes
+COLORS = ["FF0000", "0000FF", "00FF00", "FFFF00", "FF00FF", "00FFFF"]
+
+# Speech sample (Harvard sentences, public domain, Open Speech Repository)
+SPEECH_SAMPLE = FIXTURES_DIR.joinpath("test_speech_harvard.wav")
+SPEECH_URL = "http://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0010_8k.wav"
+
+
+def ensure_speech_sample():
+    """Download the speech sample into the fixtures directory if it is missing.
+
+    The download is written to a temporary ``.part`` file and renamed into
+    place only on success, so an interrupted transfer is never mistaken for a
+    complete sample on the next run. Download errors propagate to the caller.
+    """
+    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
+    if SPEECH_SAMPLE.exists():
+        return
+    log.info("Downloading speech sample from Open Speech Repository...")
+    import urllib.request
+    partial = SPEECH_SAMPLE.with_name(SPEECH_SAMPLE.name + ".part")
+    try:
+        urllib.request.urlretrieve(SPEECH_URL, partial)
+        partial.replace(SPEECH_SAMPLE)
+    finally:
+        # No-op after a successful rename; removes the leftover on failure.
+        partial.unlink(missing_ok=True)
+    log.info("Saved: %s", SPEECH_SAMPLE)
+
+
+def generate(duration: int, interval: int):
+    """Render the test video and its ground-truth JSON into the fixtures directory.
+
+    The video is a sequence of solid-colour 1920x1080 scenes, each *interval*
+    seconds long (the last one is shorter when *duration* is not a multiple of
+    *interval*), with an elapsed-seconds counter drawn on top. Audio is the
+    speech sample looped and trimmed to *duration*.
+
+    Args:
+        duration: Total video length in seconds (must be > 0).
+        interval: Seconds between scene changes (must be > 0).
+
+    Returns:
+        ``(video_path, truth_path)``.
+
+    Raises:
+        ValueError: If *duration* or *interval* is not positive.
+
+    Exits the process with status 1 if ffmpeg fails.
+    """
+    if duration <= 0 or interval <= 0:
+        raise ValueError("duration and interval must be positive")
+
+    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
+    ensure_speech_sample()
+
+    video_path = FIXTURES_DIR / f"test_scene_{duration}s.mp4"
+    truth_path = FIXTURES_DIR / f"test_scene_{duration}s_ground_truth.json"
+
+    # Ceiling division: a trailing partial interval still gets its own scene,
+    # so the video always covers the full requested duration.
+    num_scenes = -(-duration // interval)
+    nc = len(COLORS)
+
+    # Video: colored segments with timer overlay, concatenated.
+    # Audio: speech sample looped to fill duration (real speech for whisper testing).
+    filter_parts = []
+    for i in range(num_scenes):
+        color = COLORS[i % nc]
+        offset = i * interval
+        seg_dur = min(interval, duration - offset)
+        filter_parts.append(
+            f"color=c=0x{color}:s=1920x1080:d={seg_dur}:r=30,"
+            f"drawtext=text='%{{eif\\:t+{offset}\\:d}}s':"
+            f"fontsize=200:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:"
+            f"borderw=6:bordercolor=black"
+            f"[v{i}]"
+        )
+
+    v_inputs = "".join(f"[v{i}]" for i in range(num_scenes))
+    filter_parts.append(f"{v_inputs}concat=n={num_scenes}:v=1:a=0[vout]")
+    # Audio: speech sample is input 1 (input 0 is the lavfi dummy)
+    filter_parts.append(
+        f"[1:a]aresample=48000,aloop=loop=-1:size=48000*{duration},"
+        f"atrim=0:{duration},volume=0.8[aout]"
+    )
+
+    filter_complex = ";\n".join(filter_parts)
+
+    cmd = [
+        "ffmpeg", "-y",
+        # Global options go before the inputs; placed after the output file
+        # ffmpeg reports them as trailing options that may be ignored.
+        "-hide_banner", "-loglevel", "warning",
+        "-f", "lavfi", "-i", "anullsrc",  # dummy (video segments come from filter)
+        "-i", str(SPEECH_SAMPLE),  # speech audio input
+        "-filter_complex", filter_complex,
+        "-map", "[vout]", "-map", "[aout]",
+        "-c:v", "libx264", "-preset", "ultrafast", "-crf", "18",
+        "-g", "30", "-keyint_min", "30",
+        "-c:a", "aac", "-b:a", "128k",
+        "-t", str(duration),
+        str(video_path),
+    ]
+
+    log.info("Generating %ds test video (%d scenes, %ds interval, speech audio)", duration, num_scenes, interval)
+    log.info("Output: %s", video_path)
+
+    result = subprocess.run(cmd, capture_output=True, text=True)
+    if result.returncode != 0:
+        log.error("ffmpeg failed:\n%s", result.stderr)
+        sys.exit(1)
+
+    log.info("Video generated: %s", video_path)
+
+    # Write ground truth: one entry per scene, at the time the scene starts.
+    scenes = [
+        {
+            "scene_index": i,
+            "timestamp_s": i * interval,
+            "color_hex": COLORS[i % nc],
+        }
+        for i in range(num_scenes)
+    ]
+
+    truth = {
+        "duration_s": duration,
+        "interval_s": interval,
+        "num_scenes": num_scenes,
+        "video_path": str(video_path),
+        "scenes": scenes,
+    }
+    truth_path.write_text(json.dumps(truth, indent=2))
+    log.info("Ground truth: %s", truth_path)
+
+    return video_path, truth_path
+
+
+def main():
+    """CLI entry point: configure logging, parse options and generate the video."""
+    log_format = "%(asctime)s %(levelname)-7s %(name)s: %(message)s"
+    logging.basicConfig(level=logging.INFO, format=log_format, datefmt="%H:%M:%S")
+
+    cli = argparse.ArgumentParser(description="Generate CHT test video")
+    cli.add_argument("--duration", type=int, default=30, help="Video duration in seconds")
+    cli.add_argument("--interval", type=int, default=5, help="Seconds between scene changes")
+    options = cli.parse_args()
+
+    generate(options.duration, options.interval)
+
+
+if __name__ == "__main__":
+    main()
--- a/ctrl/sender.sh
+++ b/ctrl/sender.sh
@@ -1,8 +0,0 @@
-#!/bin/bash
-# Start the sender on this machine
-# Usage: ./sender.sh RECEIVER_IP [PORT]
-set -euo pipefail
-
-PROJECT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
-
-exec sudo "$PROJECT_DIR/sender/stream_av.sh" "$@"
--- a/ctrl/server.sh
+++ b/ctrl/server.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+# Launch the Rust media server (receiver).
+# Delegates to media/ctrl/server.sh, forwarding all arguments unchanged.
+# Usage: ctrl/server.sh [port]
+set -euo pipefail
+
+SCRIPT_DIR="$(dirname "$0")"
+ROOT_DIR="$(cd "$SCRIPT_DIR/.." && pwd)"
+exec "$ROOT_DIR/media/ctrl/server.sh" "$@"