audio and transcript

2026-04-02 22:57:21 -03:00
parent 0b5575f3b3
commit d61e2a5492
13 changed files with 556 additions and 11 deletions
--- a/cht/stream/ffmpeg.py
+++ b/cht/stream/ffmpeg.py
@@ -122,6 +122,35 @@ def extract_scene_frames(input_path, output_dir, scene_threshold=0.10,
    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")


+def extract_audio_chunk(input_path, output_path, start_time=0.0, duration=None):
+    """Dump a slice of a recording's audio as 16 kHz mono 16-bit PCM.
+
+    This is the WAV layout the pipeline feeds to Whisper. No container
+    format is forced here, so ``output_path`` is expected to end in
+    ``.wav``.
+
+    ``start_time`` (and ``duration``, when given) are passed as options on
+    the ffmpeg *input*, i.e. ``-ss``/``-t`` go before ``-i`` so the seek is
+    the fast keyframe-based one. With ``duration=None`` no ``-t`` is
+    emitted at all.
+
+    An ``ffmpeg.Error`` is not propagated: the last line of its stderr is
+    logged at debug level and whatever output the exception carries is
+    returned instead.
+
+    Returns a ``(stdout, stderr)`` tuple of UTF-8 decoded strings.
+    """
+    seek_opts = {"ss": start_time} if duration is None else {"ss": start_time, "t": duration}
+    source = ffmpeg.input(str(input_path), **seek_opts)
+
+    # vn=None renders as a bare "-vn" flag: drop any video stream.
+    job = ffmpeg.output(
+        source,
+        str(output_path),
+        acodec="pcm_s16le",
+        ac=1,
+        ar=16000,
+        vn=None,
+    )
+    job = job.overwrite_output().global_args(*QUIET_ARGS)
+    log.info("extract_audio_chunk: %s", " ".join(job.compile()))
+
+    try:
+        raw_out, raw_err = job.run(capture_stdout=True, capture_stderr=True)
+    except ffmpeg.Error as exc:
+        # Best effort: keep whatever ffmpeg produced and let the caller decide.
+        raw_out = exc.stdout or b""
+        raw_err = exc.stderr or b""
+        last_line = raw_err.decode("utf-8", errors="replace").strip().split("\n")[-1]
+        log.debug("ffmpeg audio error: %s", last_line)
+
+    return (
+        raw_out.decode("utf-8", errors="replace"),
+        raw_err.decode("utf-8", errors="replace"),
+    )
+
+
 def extract_frame_at(input_path, output_path, timestamp):
    """Extract a single frame at the given timestamp."""
    output = (