audio and transcript

2026-04-02 22:57:21 -03:00
parent 0b5575f3b3
commit d61e2a5492
13 changed files with 556 additions and 11 deletions
--- a/cht/stream/ffmpeg.py
+++ b/cht/stream/ffmpeg.py
@@ -122,6 +122,35 @@ def extract_scene_frames(input_path, output_dir, scene_threshold=0.10,
    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")


+def extract_audio_chunk(input_path, output_path, start_time=0.0, duration=None):
+    """Extract audio from recording as 16kHz mono WAV (optimal for Whisper).
+
+    Uses input-level seeking (-ss before -i) for fast keyframe-based seek.
+    Returns (stdout, stderr) as decoded strings.
+    """
+    kwargs = {"ss": start_time}
+    if duration is not None:
+        kwargs["t"] = duration
+    stream = ffmpeg.input(str(input_path), **kwargs)
+    output = (
+        ffmpeg.output(
+            stream, str(output_path),
+            acodec="pcm_s16le", ac=1, ar=16000,
+            vn=None,
+        )
+        .overwrite_output()
+        .global_args(*QUIET_ARGS)
+    )
+    log.info("extract_audio_chunk: %s", " ".join(output.compile()))
+    try:
+        stdout, stderr = output.run(capture_stdout=True, capture_stderr=True)
+    except ffmpeg.Error as e:
+        stderr = e.stderr or b""
+        log.debug("ffmpeg audio error: %s", stderr.decode("utf-8", errors="replace").strip().split("\n")[-1])
+        stdout = e.stdout or b""
+    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
+
+
 def extract_frame_at(input_path, output_path, timestamp):
    """Extract a single frame at the given timestamp."""
    output = (
--- a/cht/stream/manager.py
+++ b/cht/stream/manager.py
@@ -20,6 +20,8 @@ from cht.config import (
    RELAY_PORT,
    SCENE_THRESHOLD,
    SESSIONS_DIR,
+    AUDIO_EXTRACT_INTERVAL,
+    AUDIO_SAFETY_MARGIN,
 )
 from cht.stream import ffmpeg as ff

@@ -46,6 +48,7 @@ class StreamManager:
        self.stream_dir = self.session_dir / "stream"
        self.frames_dir = self.session_dir / "frames"
        self.transcript_dir = self.session_dir / "transcript"
+        self.audio_dir = self.session_dir / "audio"
        self.agent_dir = self.session_dir / "agent"

        self._procs = {}
@@ -103,7 +106,7 @@ class StreamManager:
        return total

    def setup_dirs(self):
+        """Create every per-session subdirectory (with parents) if missing."""
-        for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.agent_dir):
+        for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.audio_dir, self.agent_dir):
            d.mkdir(parents=True, exist_ok=True)

    @property
@@ -349,6 +352,77 @@ class StreamManager:

        Thread(target=_capture, daemon=True, name="capture_now").start()

+    # -- Audio Extraction --
+
+    def start_audio_extractor(self, on_new_audio=None):
+        """Periodically extract audio from the growing recording as WAV chunks.
+
+        Same incremental pattern as scene detector: polls recording, extracts
+        new time range, calls back with (wav_path, start_time, duration).
+
+        Args:
+            on_new_audio: callback(wav_path, start_time, duration)
+        """
+        self._on_new_audio = on_new_audio
+        self.audio_dir.mkdir(parents=True, exist_ok=True)
+
+        def _extract():
+            processed_time = 0.0
+            chunk_num = 0
+            current_segment = None
+
+            while "stop" not in self._stop_flags:
+                time.sleep(AUDIO_EXTRACT_INTERVAL)
+
+                seg = self.recording_path
+                if not seg.exists():
+                    continue
+
+                if seg != current_segment:
+                    current_segment = seg
+                    processed_time = 0.0
+                    chunk_num = 0
+                    log.info("Audio extractor: switched to %s", seg.name)
+
+                if seg.stat().st_size < 100_000:
+                    continue
+
+                safe_duration = self._estimate_safe_duration()
+                if safe_duration is None or safe_duration <= 0:
+                    continue
+
+                process_to = safe_duration - AUDIO_SAFETY_MARGIN
+                if process_to <= processed_time + 1.0:
+                    continue
+
+                chunk_duration = process_to - processed_time
+                wav_path = self.audio_dir / f"chunk_{chunk_num:04d}.wav"
+
+                try:
+                    ff.extract_audio_chunk(
+                        seg, wav_path,
+                        start_time=processed_time,
+                        duration=chunk_duration,
+                    )
+                except Exception as e:
+                    log.error("Audio extraction failed: %s", e)
+                    continue
+
+                if wav_path.exists() and wav_path.stat().st_size > 100:
+                    log.info("Audio chunk: %s (%.1fs → %.1fs)",
+                             wav_path.name, processed_time, process_to)
+                    if self._on_new_audio:
+                        self._on_new_audio(wav_path, processed_time, chunk_duration)
+                    chunk_num += 1
+
+                processed_time = process_to
+
+            log.info("Audio extractor stopped")
+
+        t = Thread(target=_extract, daemon=True, name="audio_extractor")
+        t.start()
+        self._threads["audio_extractor"] = t
+
    # -- Lifecycle --

    def stop_all(self):