#!/usr/bin/env python3
"""Generate a test video with known scene changes and audio markers.

The video is played fullscreen on the sender while kmsgrab captures it —
simulating a meeting with deterministic, reproducible content.

- Scene changes: solid color blocks every INTERVAL seconds
- Overlay: large elapsed-seconds counter for visual sync measurement
- Audio: looped speech sample (real speech for transcription ground truth)

Outputs:
    tests/fixtures/test_scene_30s.mp4
    tests/fixtures/test_scene_30s_ground_truth.json

Usage:
    python ctrl/gen_test_video.py [--duration 30] [--interval 5]
"""

import argparse
import json
import logging
import subprocess
import sys
from pathlib import Path

log = logging.getLogger("gen_test_video")

PROJECT_DIR = Path(__file__).resolve().parent.parent
FIXTURES_DIR = PROJECT_DIR / "tests" / "fixtures"

# Scene colors (RGB hex)
COLORS = ["FF0000", "0000FF", "00FF00", "FFFF00", "FF00FF", "00FFFF"]

# Speech sample (Harvard sentences, public domain, Open Speech Repository)
SPEECH_SAMPLE = FIXTURES_DIR / "test_speech_harvard.wav"
SPEECH_URL = "http://www.voiptroubleshooter.com/open_speech/american/OSR_us_000_0010_8k.wav"


def ensure_speech_sample():
    """Download the speech sample into FIXTURES_DIR if not already present."""
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
    if SPEECH_SAMPLE.exists():
        return
    log.info("Downloading speech sample from Open Speech Repository...")
    import urllib.request  # lazy import: only needed on first run
    urllib.request.urlretrieve(SPEECH_URL, SPEECH_SAMPLE)
    log.info("Saved: %s", SPEECH_SAMPLE)


def build_filter_complex(duration: int, interval: int, num_scenes: int) -> str:
    """Build the ffmpeg filter_complex string for the test video.

    Produces one solid-color 1920x1080@30fps segment per scene, each with a
    large elapsed-seconds drawtext overlay, concatenated into [vout]; the
    speech input (stream [0:a]) is resampled and looped to fill the full
    duration as [aout].

    Args:
        duration: total video length in seconds.
        interval: seconds per scene.
        num_scenes: number of scenes (duration // interval).

    Returns:
        The complete filter_complex string (segments joined with ";\\n").
    """
    parts = []
    nc = len(COLORS)
    for i in range(num_scenes):
        color = COLORS[i % nc]
        # Last scene may be shorter if duration is not a multiple of interval.
        seg_dur = interval if (i + 1) * interval <= duration else duration - i * interval
        offset = i * interval
        parts.append(
            f"color=c=0x{color}:s=1920x1080:d={seg_dur}:r=30,"
            # Each segment's timestamp t restarts at 0, so add the scene's
            # absolute offset; eif\:...\:d truncates to an integer display.
            f"drawtext=text='%{{eif\\:t+{offset}\\:d}}s':"
            f"fontsize=200:fontcolor=white:x=(w-text_w)/2:y=(h-text_h)/2:"
            f"borderw=6:bordercolor=black"
            f"[v{i}]"
        )
    v_inputs = "".join(f"[v{i}]" for i in range(num_scenes))
    parts.append(f"{v_inputs}concat=n={num_scenes}:v=1:a=0[vout]")
    # Audio: resample 8 kHz source to 48 kHz, loop forever, trim to the exact
    # duration, and attenuate slightly. Size is computed here rather than
    # leaving "48000*N" for ffmpeg's option parser to evaluate.
    parts.append(
        f"[0:a]aresample=48000,aloop=loop=-1:size={48000 * duration},"
        f"atrim=0:{duration},volume=0.8[aout]"
    )
    return ";\n".join(parts)


def build_ground_truth(duration: int, interval: int, num_scenes: int,
                       video_path) -> dict:
    """Return the ground-truth metadata dict describing each scene."""
    scenes = [
        {
            "scene_index": i,
            "timestamp_s": i * interval,
            "color_hex": COLORS[i % len(COLORS)],
        }
        for i in range(num_scenes)
    ]
    return {
        "duration_s": duration,
        "interval_s": interval,
        "num_scenes": num_scenes,
        "video_path": str(video_path),
        "scenes": scenes,
    }


def generate(duration: int, interval: int):
    """Generate the test video and its ground-truth JSON.

    Args:
        duration: total video length in seconds.
        interval: seconds between scene changes.

    Returns:
        (video_path, truth_path) as Path objects.

    Raises:
        ValueError: if duration/interval cannot produce at least one scene.

    Exits the process with status 1 if ffmpeg fails.
    """
    if duration <= 0 or interval <= 0 or interval > duration:
        raise ValueError(
            f"need at least one scene: duration={duration}s interval={interval}s"
        )
    FIXTURES_DIR.mkdir(parents=True, exist_ok=True)
    ensure_speech_sample()
    video_path = FIXTURES_DIR / f"test_scene_{duration}s.mp4"
    truth_path = FIXTURES_DIR / f"test_scene_{duration}s_ground_truth.json"
    num_scenes = duration // interval

    filter_complex = build_filter_complex(duration, interval, num_scenes)

    # NOTE: global options (-hide_banner, -loglevel) must come before the
    # inputs — ffmpeg ignores options trailing the last output file.
    # The video comes entirely from lavfi sources inside the filtergraph,
    # so the speech file is the only real input (stream [0:a]).
    cmd = [
        "ffmpeg", "-hide_banner", "-loglevel", "warning", "-y",
        "-i", str(SPEECH_SAMPLE),           # input 0: speech audio
        "-filter_complex", filter_complex,
        "-map", "[vout]", "-map", "[aout]",
        "-c:v", "libx264", "-preset", "ultrafast", "-crf", "18",
        "-g", "30", "-keyint_min", "30",    # keyframe every second at 30 fps
        "-c:a", "aac", "-b:a", "128k",
        "-t", str(duration),
        str(video_path),
    ]
    log.info("Generating %ds test video (%d scenes, %ds interval, speech audio)",
             duration, num_scenes, interval)
    log.info("Output: %s", video_path)
    result = subprocess.run(cmd, capture_output=True, text=True)
    if result.returncode != 0:
        log.error("ffmpeg failed:\n%s", result.stderr)
        sys.exit(1)
    log.info("Video generated: %s", video_path)

    truth = build_ground_truth(duration, interval, num_scenes, video_path)
    truth_path.write_text(json.dumps(truth, indent=2))
    log.info("Ground truth: %s", truth_path)
    return video_path, truth_path


def main():
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)-7s %(name)s: %(message)s",
        datefmt="%H:%M:%S",
    )
    parser = argparse.ArgumentParser(description="Generate CHT test video")
    parser.add_argument("--duration", type=int, default=30,
                        help="Video duration in seconds")
    parser.add_argument("--interval", type=int, default=5,
                        help="Seconds between scene changes")
    args = parser.parse_args()
    generate(args.duration, args.interval)


if __name__ == "__main__":
    main()