proper tests
This commit is contained in:
148
ctrl/gen_test_video.py
Executable file
148
ctrl/gen_test_video.py
Executable file
@@ -0,0 +1,148 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Generate a test video with known scene changes and audio markers.
|
||||
|
||||
The video is played fullscreen on the sender while kmsgrab captures it —
|
||||
simulating a meeting with deterministic, reproducible content.
|
||||
|
||||
- Scene changes: solid color blocks every INTERVAL seconds
|
||||
- Overlay: large elapsed-seconds counter for visual sync measurement
|
||||
- Audio: sine tone (changes frequency each scene for transcription ground truth)
|
||||
|
||||
Outputs:
|
||||
tests/fixtures/test_scene_30s.mp4
|
||||
tests/fixtures/test_scene_30s_ground_truth.json
|
||||
|
||||
Usage:
|
||||
python ctrl/gen_test_video.py [--duration 30] [--interval 5]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Module-level logger; handlers/level are configured in main().
log = logging.getLogger("gen_test_video")

# Repo root: this file lives in ctrl/, so two parents up from here.
PROJECT_DIR = Path(__file__).resolve().parent.parent
FIXTURES_DIR = PROJECT_DIR / "tests" / "fixtures"

# Scene colors (RGB hex); cycled when there are more scenes than colors.
COLORS = ["FF0000", "0000FF", "00FF00", "FFFF00", "FF00FF", "00FFFF"]

# Speech sample (Harvard sentences, public domain, Open Speech Repository).
# Downloaded on demand by ensure_speech_sample().
SPEECH_SAMPLE = FIXTURES_DIR / "test_speech_harvard.wav"
SPEECH_URL = "http://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0010_8k.wav"
|
||||
|
||||
|
||||
def ensure_speech_sample():
    """Download the speech sample if it is not already present.

    The transfer is written to a temporary ``.part`` file and renamed into
    place only on success, so an interrupted download can never leave a
    truncated file that a later ``exists()`` check would mistake for a
    valid sample. A leftover partial file is removed on failure.
    """
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
    if SPEECH_SAMPLE.exists():
        return
    log.info("Downloading speech sample from Open Speech Repository...")
    import urllib.request

    tmp_path = SPEECH_SAMPLE.with_suffix(".wav.part")
    try:
        # urlretrieve has no timeout parameter; use urlopen with one so a
        # stalled server cannot hang the generator indefinitely.
        with urllib.request.urlopen(SPEECH_URL, timeout=60) as resp:
            tmp_path.write_bytes(resp.read())
        tmp_path.replace(SPEECH_SAMPLE)  # atomic rename on POSIX
    finally:
        # On failure, drop the partial file so the next run retries cleanly.
        if tmp_path.exists() and not SPEECH_SAMPLE.exists():
            tmp_path.unlink()
    log.info("Saved: %s", SPEECH_SAMPLE)
|
||||
|
||||
|
||||
def generate(duration: int, interval: int):
    """Generate the test video and its ground-truth JSON.

    Args:
        duration: Total video length in seconds.
        interval: Seconds between scene (color) changes.

    Returns:
        ``(video_path, truth_path)`` as :class:`pathlib.Path` objects.

    Exits the process with status 1 if ffmpeg fails.
    """
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
    ensure_speech_sample()

    video_path = FIXTURES_DIR / f"test_scene_{duration}s.mp4"
    truth_path = FIXTURES_DIR / f"test_scene_{duration}s_ground_truth.json"

    # Ceiling division: a trailing partial scene covers the remainder when
    # duration is not an exact multiple of interval. (Floor division left
    # the tail of the video ungenerated and made the seg_dur branch below
    # unreachable.)
    num_scenes = -(-duration // interval)
    nc = len(COLORS)

    # Video: colored segments with timer overlay, concatenated.
    # Audio: speech sample looped to fill duration (real speech for whisper testing).
    filter_parts = []
    for i in range(num_scenes):
        color = COLORS[i % nc]
        # Last scene may be shorter than a full interval.
        seg_dur = interval if (i + 1) * interval <= duration else duration - i * interval
        offset = i * interval
        filter_parts.append(
            f"color=c=0x{color}:s=1920x1080:d={seg_dur}:r=30,"
            f"drawtext=text='%{{eif\\:t+{offset}\\:d}}s':"
            f"fontsize=200:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:"
            f"borderw=6:bordercolor=black"
            f"[v{i}]"
        )

    v_inputs = "".join(f"[v{i}]" for i in range(num_scenes))
    filter_parts.append(f"{v_inputs}concat=n={num_scenes}:v=1:a=0[vout]")
    # Audio: the speech sample is input 0. The video segments come from
    # lavfi color sources created inside the filter graph, so no dummy
    # video input is needed.
    filter_parts.append(
        f"[0:a]aresample=48000,aloop=loop=-1:size=48000*{duration},"
        f"atrim=0:{duration},volume=0.8[aout]"
    )

    filter_complex = ";\n".join(filter_parts)

    cmd = [
        "ffmpeg", "-y",
        # Global options must precede the output file; ffmpeg ignores
        # trailing options placed after the last output.
        "-hide_banner", "-loglevel", "warning",
        "-i", str(SPEECH_SAMPLE),  # speech audio input
        "-filter_complex", filter_complex,
        "-map", "[vout]", "-map", "[aout]",
        "-c:v", "libx264", "-preset", "ultrafast", "-crf", "18",
        # Fixed GOP: one keyframe per second at 30 fps for easy seeking.
        "-g", "30", "-keyint_min", "30",
        "-c:a", "aac", "-b:a", "128k",
        "-t", str(duration),
        str(video_path),
    ]

    log.info(
        "Generating %ds test video (%d scenes, %ds interval, speech audio)",
        duration, num_scenes, interval,
    )
    log.info("Output: %s", video_path)

    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        log.error("ffmpeg failed:\n%s", result.stderr)
        sys.exit(1)

    log.info("Video generated: %s", video_path)

    # Ground truth: one entry per scene with its start time and color.
    scenes = [
        {
            "scene_index": i,
            "timestamp_s": i * interval,
            "color_hex": COLORS[i % nc],
        }
        for i in range(num_scenes)
    ]

    truth = {
        "duration_s": duration,
        "interval_s": interval,
        "num_scenes": num_scenes,
        "video_path": str(video_path),
        "scenes": scenes,
    }
    truth_path.write_text(json.dumps(truth, indent=2))
    log.info("Ground truth: %s", truth_path)

    return video_path, truth_path
|
||||
|
||||
|
||||
def main():
    """CLI entry point: configure logging, parse args, generate the video."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-7s %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )

    arg_parser = argparse.ArgumentParser(description="Generate CHT test video")
    arg_parser.add_argument(
        "--duration", type=int, default=30, help="Video duration in seconds"
    )
    arg_parser.add_argument(
        "--interval", type=int, default=5, help="Seconds between scene changes"
    )
    opts = arg_parser.parse_args()

    generate(opts.duration, opts.interval)
|
||||
|
||||
|
||||
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user