audio and transcript
This commit is contained in:
@@ -18,12 +18,22 @@ class FrameRef:
|
||||
timestamp: float # seconds into recording
|
||||
|
||||
|
||||
@dataclass
class TranscriptRef:
    """One transcript segment, addressable in user messages via @-mention
    (e.g. "@T0001"); mirrors the entries persisted in transcript/index.json."""
    id: str  # "T0001"
    start: float  # seconds into recording
    end: float  # seconds into recording
    text: str  # transcribed text for this segment
|
||||
|
||||
|
||||
@dataclass
|
||||
class SessionContext:
|
||||
session_dir: Path
|
||||
frames: list[FrameRef] # all captured frames so far
|
||||
duration: float # current recording duration (seconds)
|
||||
mentioned_frames: list[FrameRef] = field(default_factory=list) # @-referenced in message
|
||||
mentioned_frames: list[FrameRef] = field(default_factory=list)
|
||||
transcript_segments: list[TranscriptRef] = field(default_factory=list)
|
||||
mentioned_transcripts: list[TranscriptRef] = field(default_factory=list)
|
||||
|
||||
|
||||
class AgentProvider(ABC):
|
||||
|
||||
@@ -47,6 +47,21 @@ def _build_prompt(message: str, context: SessionContext) -> str:
|
||||
fm, fs = divmod(int(f.timestamp), 60)
|
||||
lines.append(f" {f.id} at {fm:02d}:{fs:02d} — {f.path}")
|
||||
|
||||
# Transcript
|
||||
if context.transcript_segments:
|
||||
lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
|
||||
for t in context.transcript_segments:
|
||||
tm1, ts1 = divmod(int(t.start), 60)
|
||||
tm2, ts2 = divmod(int(t.end), 60)
|
||||
lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
|
||||
|
||||
if context.mentioned_transcripts:
|
||||
lines.append("\nTranscript segments referenced in this message:")
|
||||
for t in context.mentioned_transcripts:
|
||||
tm1, ts1 = divmod(int(t.start), 60)
|
||||
tm2, ts2 = divmod(int(t.end), 60)
|
||||
lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
|
||||
|
||||
lines.append(f"\nUser message: {message}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
@@ -95,10 +95,17 @@ class OpenAICompatProvider(AgentProvider):
|
||||
|
||||
# Build context header
|
||||
m, s = divmod(int(context.duration), 60)
|
||||
ctx_text = (
|
||||
f"Recording duration: {m:02d}:{s:02d}\n"
|
||||
f"Total frames: {len(context.frames)}\n"
|
||||
)
|
||||
ctx_lines = [
|
||||
f"Recording duration: {m:02d}:{s:02d}",
|
||||
f"Total frames: {len(context.frames)}",
|
||||
]
|
||||
if context.transcript_segments:
|
||||
ctx_lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
|
||||
for t in context.transcript_segments:
|
||||
tm1, ts1 = divmod(int(t.start), 60)
|
||||
tm2, ts2 = divmod(int(t.end), 60)
|
||||
ctx_lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
|
||||
ctx_text = "\n".join(ctx_lines) + "\n"
|
||||
|
||||
frames_to_send = context.mentioned_frames
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ from pathlib import Path
|
||||
from threading import Thread
|
||||
from typing import Callable
|
||||
|
||||
from cht.agent.base import AgentProvider, FrameRef, SessionContext
|
||||
from cht.agent.base import AgentProvider, FrameRef, TranscriptRef, SessionContext
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@@ -98,6 +98,33 @@ def _load_frames(frames_dir: Path) -> list[FrameRef]:
|
||||
return []
|
||||
|
||||
|
||||
def _load_transcript(transcript_dir: Path) -> list[TranscriptRef]:
    """Load transcript segments from <transcript_dir>/index.json.

    Returns an empty list when the index file is absent or cannot be
    parsed; a parse failure is logged but never raised to the caller.
    """
    index_file = transcript_dir / "index.json"
    if not index_file.exists():
        return []
    try:
        return [TranscriptRef(**entry) for entry in json.loads(index_file.read_text())]
    except Exception as err:
        log.warning("Could not load transcript index: %s", err)
        return []
|
||||
|
||||
|
||||
def _parse_transcript_mentions(message: str, segments: list[TranscriptRef]) -> list[TranscriptRef]:
    """Extract @T references from message. Accepts @T0001, @t1, @T1.

    Returns the matching segments in order of first mention, deduplicated.
    References that do not match any known segment ID are ignored.
    """
    # Build the lookup once instead of a linear scan per mention.
    by_id = {s.id: s for s in segments}
    mentioned: list[TranscriptRef] = []
    seen: set[str] = set()
    for match in re.finditer(r"@[Tt](\d+)", message):
        # Normalize "@t1" / "@T1" / "@T0001" to the canonical "T0001" form.
        tid = f"T{int(match.group(1)):04d}"
        if tid in seen:
            continue
        seen.add(tid)
        seg = by_id.get(tid)
        if seg is not None:
            mentioned.append(seg)
    return mentioned
|
||||
|
||||
|
||||
class AgentRunner:
|
||||
"""Runs agent queries in a background thread, streams chunks to a callback."""
|
||||
|
||||
@@ -152,12 +179,16 @@ class AgentRunner:
|
||||
try:
|
||||
provider = self._get_provider()
|
||||
frames = _load_frames(stream_mgr.frames_dir)
|
||||
mentioned = _parse_mentions(message, frames)
|
||||
mentioned_frames = _parse_mentions(message, frames)
|
||||
transcript = _load_transcript(stream_mgr.transcript_dir)
|
||||
mentioned_transcripts = _parse_transcript_mentions(message, transcript)
|
||||
context = SessionContext(
|
||||
session_dir=stream_mgr.session_dir,
|
||||
frames=frames,
|
||||
duration=tracker.duration if tracker else 0.0,
|
||||
mentioned_frames=mentioned,
|
||||
mentioned_frames=mentioned_frames,
|
||||
transcript_segments=transcript,
|
||||
mentioned_transcripts=mentioned_transcripts,
|
||||
)
|
||||
for chunk in provider.stream(message, context):
|
||||
on_chunk(chunk)
|
||||
|
||||
0
cht/audio/__init__.py
Normal file
0
cht/audio/__init__.py
Normal file
90
cht/audio/waveform.py
Normal file
90
cht/audio/waveform.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
Waveform peak computation from WAV files.

Reads 16kHz mono PCM WAV files (as produced by ffmpeg extract_audio_chunk),
computes RMS amplitude per time bucket, and stores peaks as a numpy array
that grows incrementally during live recording.
"""

import logging
import wave

import numpy as np

log = logging.getLogger(__name__)


class WaveformEngine:
    """Computes and accumulates waveform peak data from WAV chunks."""

    def __init__(self, bucket_ms=50):
        # Width of one peak bucket in milliseconds; 50ms = 20 peaks/second.
        self._bucket_ms = bucket_ms
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0

    @property
    def peaks(self):
        # float32 array of RMS values, one per bucket.
        return self._peaks

    @property
    def bucket_duration(self):
        # Seconds covered by a single peak bucket.
        return self._bucket_ms / 1000.0

    @property
    def total_duration(self):
        # Seconds of audio processed so far.
        return self._total_duration

    def append_chunk(self, wav_path, start_time):
        """Read a WAV chunk and append its peaks to the internal array.

        Args:
            wav_path: path to a 16-bit PCM WAV chunk.
            start_time: chunk offset (seconds) into the overall recording;
                total_duration is advanced to start_time + chunk length.
        """
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        new_peaks = self._compute_rms(samples, sample_rate)
        if len(new_peaks) > 0:
            self._peaks = np.concatenate([self._peaks, new_peaks])
            chunk_duration = len(samples) / sample_rate
            self._total_duration = start_time + chunk_duration
            log.info("Waveform: +%d peaks (total %d, %.1fs)",
                     len(new_peaks), len(self._peaks), self._total_duration)

    def compute_full(self, wav_path):
        """Compute all peaks from a complete WAV file (for loaded sessions)."""
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        self._peaks = self._compute_rms(samples, sample_rate)
        self._total_duration = len(samples) / sample_rate
        log.info("Waveform full: %d peaks, %.1fs", len(self._peaks), self._total_duration)

    def reset(self):
        """Discard all accumulated peaks and duration."""
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0

    def _read_wav(self, wav_path):
        """Read a 16-bit PCM WAV file into a float32 array in [-1.0, 1.0).

        Multi-channel audio is downmixed to mono by averaging channels.
        Returns (samples, sample_rate), or (None, 0) for empty, unsupported,
        or unreadable files.
        """
        try:
            with wave.open(str(wav_path), "rb") as wf:
                n_frames = wf.getnframes()
                if n_frames == 0:
                    return None, 0
                # Only 16-bit PCM is supported: the int16 view below would
                # silently misinterpret any other sample width.
                if wf.getsampwidth() != 2:
                    log.warning("Unsupported sample width in %s: %d bytes",
                                wav_path, wf.getsampwidth())
                    return None, 0
                sample_rate = wf.getframerate()
                n_channels = wf.getnchannels()
                raw = wf.readframes(n_frames)
                samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
                if n_channels > 1:
                    # Interleaved channels -> mono average, so bucket sizes
                    # stay correct for stereo input.
                    samples = samples.reshape(-1, n_channels).mean(axis=1)
                return samples, sample_rate
        except Exception as e:
            log.warning("Failed to read WAV %s: %s", wav_path, e)
            return None, 0

    def _compute_rms(self, samples, sample_rate):
        """Compute RMS amplitude per bucket; a trailing partial bucket is dropped."""
        bucket_size = int(sample_rate * self._bucket_ms / 1000)
        if bucket_size <= 0 or len(samples) < bucket_size:
            return np.empty(0, dtype=np.float32)

        # Trim to whole buckets, then vectorize the per-bucket RMS.
        n_buckets = len(samples) // bucket_size
        trimmed = samples[:n_buckets * bucket_size].reshape(n_buckets, bucket_size)
        rms = np.sqrt(np.mean(trimmed ** 2, axis=1)).astype(np.float32)
        return rms
|
||||
@@ -19,3 +19,12 @@ SCENE_THRESHOLD = 0.10 # 0-1, lower = more sensitive; 0.1 catches slide/window
|
||||
|
||||
# Segment recording
|
||||
SEGMENT_DURATION = 60 # seconds per .ts segment
|
||||
|
||||
# Audio extraction
|
||||
AUDIO_EXTRACT_INTERVAL = 3 # seconds between extraction cycles
|
||||
AUDIO_SAFETY_MARGIN = 2 # seconds safety margin (matches scene detector)
|
||||
WAVEFORM_BUCKET_MS = 50 # milliseconds per waveform peak bucket
|
||||
|
||||
# Transcription
|
||||
WHISPER_MODEL = "small" # "small" for speed, "medium" for accuracy
|
||||
WHISPER_DEVICE = "cuda" # "cuda" or "cpu"
|
||||
|
||||
@@ -122,6 +122,35 @@ def extract_scene_frames(input_path, output_dir, scene_threshold=0.10,
|
||||
return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def extract_audio_chunk(input_path, output_path, start_time=0.0, duration=None):
    """Extract audio from recording as 16kHz mono WAV (optimal for Whisper).

    Uses input-level seeking (-ss before -i) for fast keyframe-based seek.
    Returns (stdout, stderr) as decoded strings.
    """
    in_kwargs = {"ss": start_time}
    if duration is not None:
        in_kwargs["t"] = duration
    source = ffmpeg.input(str(input_path), **in_kwargs)
    output = (
        ffmpeg.output(
            source,
            str(output_path),
            acodec="pcm_s16le",
            ac=1,
            ar=16000,
            vn=None,
        )
        .overwrite_output()
        .global_args(*QUIET_ARGS)
    )
    log.info("extract_audio_chunk: %s", " ".join(output.compile()))
    try:
        stdout, stderr = output.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        stderr = e.stderr or b""
        log.debug("ffmpeg audio error: %s", stderr.decode("utf-8", errors="replace").strip().split("\n")[-1])
        stdout = e.stdout or b""
    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def extract_frame_at(input_path, output_path, timestamp):
|
||||
"""Extract a single frame at the given timestamp."""
|
||||
output = (
|
||||
|
||||
@@ -20,6 +20,8 @@ from cht.config import (
|
||||
RELAY_PORT,
|
||||
SCENE_THRESHOLD,
|
||||
SESSIONS_DIR,
|
||||
AUDIO_EXTRACT_INTERVAL,
|
||||
AUDIO_SAFETY_MARGIN,
|
||||
)
|
||||
from cht.stream import ffmpeg as ff
|
||||
|
||||
@@ -46,6 +48,7 @@ class StreamManager:
|
||||
self.stream_dir = self.session_dir / "stream"
|
||||
self.frames_dir = self.session_dir / "frames"
|
||||
self.transcript_dir = self.session_dir / "transcript"
|
||||
self.audio_dir = self.session_dir / "audio"
|
||||
self.agent_dir = self.session_dir / "agent"
|
||||
|
||||
self._procs = {}
|
||||
@@ -103,7 +106,7 @@ class StreamManager:
|
||||
return total
|
||||
|
||||
def setup_dirs(self):
|
||||
for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.agent_dir):
|
||||
for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.audio_dir, self.agent_dir):
|
||||
d.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@property
|
||||
@@ -349,6 +352,77 @@ class StreamManager:
|
||||
|
||||
Thread(target=_capture, daemon=True, name="capture_now").start()
|
||||
|
||||
# -- Audio Extraction --
|
||||
|
||||
def start_audio_extractor(self, on_new_audio=None):
    """Periodically extract audio from the growing recording as WAV chunks.

    Same incremental pattern as scene detector: polls recording, extracts
    new time range, calls back with (wav_path, start_time, duration).

    Args:
        on_new_audio: callback(wav_path, start_time, duration)
    """
    self._on_new_audio = on_new_audio
    self.audio_dir.mkdir(parents=True, exist_ok=True)

    def _extract():
        # Offset (seconds) up to which the current segment has been extracted.
        processed_time = 0.0
        chunk_num = 0
        current_segment = None

        while "stop" not in self._stop_flags:
            time.sleep(AUDIO_EXTRACT_INTERVAL)

            seg = self.recording_path
            if not seg.exists():
                continue

            # Recorder rolled over to a new segment file: restart the
            # extraction bookkeeping from the start of the new file.
            if seg != current_segment:
                current_segment = seg
                processed_time = 0.0
                chunk_num = 0
                log.info("Audio extractor: switched to %s", seg.name)

            # Skip files still too small to contain usable audio.
            if seg.stat().st_size < 100_000:
                continue

            safe_duration = self._estimate_safe_duration()
            if safe_duration is None or safe_duration <= 0:
                continue

            # Stay AUDIO_SAFETY_MARGIN behind the write head so ffmpeg never
            # reads a region the recorder is still writing.
            process_to = safe_duration - AUDIO_SAFETY_MARGIN
            if process_to <= processed_time + 1.0:
                # Less than ~1s of new audio — wait for the next cycle.
                continue

            chunk_duration = process_to - processed_time
            wav_path = self.audio_dir / f"chunk_{chunk_num:04d}.wav"

            try:
                ff.extract_audio_chunk(
                    seg, wav_path,
                    start_time=processed_time,
                    duration=chunk_duration,
                )
            except Exception as e:
                log.error("Audio extraction failed: %s", e)
                continue

            # chunk_num advances only when the chunk produced real data, so
            # an empty/tiny output's filename is reused by the next chunk.
            if wav_path.exists() and wav_path.stat().st_size > 100:
                log.info("Audio chunk: %s (%.1fs → %.1fs)",
                    wav_path.name, processed_time, process_to)
                if self._on_new_audio:
                    self._on_new_audio(wav_path, processed_time, chunk_duration)
                chunk_num += 1

            # Advance even on a bad chunk; that time range is not retried.
            processed_time = process_to

        log.info("Audio extractor stopped")

    t = Thread(target=_extract, daemon=True, name="audio_extractor")
    t.start()
    self._threads["audio_extractor"] = t
|
||||
|
||||
# -- Lifecycle --
|
||||
|
||||
def stop_all(self):
|
||||
|
||||
98
cht/transcriber/engine.py
Normal file
98
cht/transcriber/engine.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
Transcription engine using faster-whisper.

Processes WAV chunks incrementally, assigns sequential IDs (T0001, T0002, ...),
and persists to transcript/index.json in the session directory.
"""

import json
import logging
from dataclasses import dataclass, asdict
from pathlib import Path

log = logging.getLogger(__name__)


@dataclass
class TranscriptSegment:
    # One transcribed utterance with timestamps relative to the recording.
    id: str  # "T0001"
    start: float  # seconds into recording
    end: float  # seconds into recording
    text: str  # transcribed text


class TranscriberEngine:
    """Incremental transcription via faster-whisper with GPU acceleration."""

    def __init__(self, model_size="small", device="cuda"):
        self._model = None  # loaded lazily on first transcription
        self._model_size = model_size
        self._device = device
        self._segments: list[TranscriptSegment] = []
        self._next_id = 1  # numeric part of the next "Tnnnn" ID

    def _ensure_model(self):
        """Load the whisper model on first use (heavy dep, imported lazily)."""
        if self._model is not None:
            return
        log.info("Loading whisper model: %s (device=%s)", self._model_size, self._device)
        from faster_whisper import WhisperModel
        self._model = WhisperModel(
            self._model_size,
            # float16 requires CUDA; int8 keeps CPU inference tractable.
            device=self._device,
            compute_type="float16" if self._device == "cuda" else "int8",
        )
        log.info("Whisper model loaded")

    def transcribe_chunk(self, wav_path, time_offset=0.0) -> list[TranscriptSegment]:
        """Transcribe a WAV chunk. Returns new segments with absolute timestamps.

        Args:
            wav_path: audio chunk to transcribe.
            time_offset: chunk start relative to the recording; added to the
                model's chunk-relative timestamps. Returns [] on failure.
        """
        self._ensure_model()
        try:
            segments_iter, _info = self._model.transcribe(
                str(wav_path),
                beam_size=5,
                vad_filter=True,  # skip silence rather than transcribing it
            )
        except Exception as e:
            log.error("Whisper transcription failed: %s", e)
            return []

        new_segments = []
        for seg in segments_iter:
            text = seg.text.strip()
            if not text:
                continue
            tid = f"T{self._next_id:04d}"
            self._next_id += 1
            entry = TranscriptSegment(
                id=tid,
                start=time_offset + seg.start,
                end=time_offset + seg.end,
                text=text,
            )
            self._segments.append(entry)
            new_segments.append(entry)

        return new_segments

    def all_segments(self) -> list[TranscriptSegment]:
        """Return a copy of every segment transcribed or loaded so far."""
        return list(self._segments)

    def save_index(self, path: Path):
        """Persist all segments to *path* as a JSON array of objects."""
        data = [asdict(s) for s in self._segments]
        path.write_text(json.dumps(data, indent=2))

    def load_index(self, path: Path):
        """Restore segments from a JSON index written by save_index.

        On success, ID numbering resumes after the highest loaded ID so newly
        transcribed segments never collide with restored ones. On read/parse
        failure the engine is left unchanged.
        """
        try:
            data = json.loads(path.read_text())
        except Exception as e:
            log.warning("Failed to load transcript index: %s", e)
            return
        self._segments = [TranscriptSegment(**e) for e in data]
        if self._segments:
            # IDs are "Tnnnn": parse the digits after the fixed "T" prefix.
            last_num = max(int(s.id[1:]) for s in self._segments)
            self._next_id = last_num + 1
        log.info("Loaded %d transcript segments", len(self._segments))

    def reset(self):
        """Drop all segments and restart ID numbering at T0001."""
        self._segments.clear()
        self._next_id = 1
||||
107
cht/ui/waveform.py
Normal file
107
cht/ui/waveform.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""
WaveformWidget: GTK4 DrawingArea that renders waveform peaks with a playhead.

Driven by Timeline "changed" signal — redraws when cursor or duration changes.
Peak data is set externally via set_peaks() from GLib.idle_add.
"""

import logging
import math

import numpy as np

import gi
gi.require_version("Gtk", "4.0")
from gi.repository import Gtk, GLib

log = logging.getLogger(__name__)


class WaveformWidget(Gtk.Box):
    """Waveform display synced to Timeline state."""

    def __init__(self, timeline, **kwargs):
        super().__init__(orientation=Gtk.Orientation.VERTICAL, **kwargs)
        self._timeline = timeline
        self._peaks = None  # RMS peak array, or None until data arrives
        self._bucket_duration = 0.05  # seconds per peak bucket

        label = Gtk.Label(label="Waveform")
        label.add_css_class("heading")
        label.set_margin_top(4)
        label.set_margin_bottom(4)
        self.append(label)

        self._area = Gtk.DrawingArea()
        self._area.set_content_height(250)
        self._area.set_hexpand(True)
        self._area.set_vexpand(True)
        self._area.set_draw_func(self._draw)
        self.append(self._area)

        timeline.connect("changed", self._on_timeline_changed)

    def set_peaks(self, peaks, bucket_duration):
        """Update peak data. Call from GLib.idle_add."""
        self._peaks = peaks
        self._bucket_duration = bucket_duration
        self._area.queue_draw()

    def _on_timeline_changed(self, timeline):
        # Any cursor/duration/marker change invalidates the drawing.
        self._area.queue_draw()

    def _draw(self, area, cr, width, height):
        """Cairo draw callback: background, peaks, scene markers, playhead."""
        # Background
        cr.set_source_rgb(0.1, 0.1, 0.12)
        cr.rectangle(0, 0, width, height)
        cr.fill()

        state = self._timeline.state
        duration = state.duration
        mid_y = height / 2

        # Center line
        cr.set_source_rgba(0.3, 0.3, 0.35, 1.0)
        cr.set_line_width(1)
        cr.move_to(0, mid_y)
        cr.line_to(width, mid_y)
        cr.stroke()

        # Draw peaks, normalized so the loudest bucket fills the display
        if self._peaks is not None and len(self._peaks) > 0 and duration > 0:
            n_peaks = len(self._peaks)
            # Hoist the max out of the per-pixel loop (was evaluated twice
            # per draw before) and guard against an all-zero array.
            max_peak = float(np.max(self._peaks))
            if max_peak <= 0:
                max_peak = 1.0

            for x in range(width):
                # Time at this pixel
                t = (x / width) * duration
                # Corresponding peak index
                idx = int(t / self._bucket_duration)
                if 0 <= idx < n_peaks:
                    val = self._peaks[idx] / max_peak
                    bar_h = val * (height * 0.45)  # 90% of half-height
                    # Green gradient based on amplitude
                    cr.set_source_rgba(0.2, 0.6 + val * 0.4, 0.3, 0.85)
                    cr.rectangle(x, mid_y - bar_h, 1, bar_h * 2)
                    cr.fill()

        # Scene markers
        if duration > 0:
            cr.set_source_rgba(1.0, 1.0, 0.3, 0.3)
            cr.set_line_width(1)
            for marker in state.scene_markers:
                mx = (marker / duration) * width
                cr.move_to(mx, 0)
                cr.line_to(mx, height)
                cr.stroke()

        # Playhead
        if duration > 0:
            px = (state.cursor / duration) * width
            cr.set_source_rgba(1.0, 0.3, 0.3, 0.9)
            cr.set_line_width(2)
            cr.move_to(px, 0)
            cr.line_to(px, height)
            cr.stroke()
||||
@@ -10,9 +10,14 @@ gi.require_version("Adw", "1")
|
||||
gi.require_version("GdkPixbuf", "2.0")
|
||||
from gi.repository import Gtk, Gdk, Adw, GLib, Pango, GdkPixbuf
|
||||
|
||||
from threading import Thread
|
||||
|
||||
from cht.config import APP_NAME, SCENE_THRESHOLD
|
||||
from cht.ui.timeline import Timeline, TimelineControls
|
||||
from cht.ui.monitor import MonitorWidget
|
||||
from cht.ui.waveform import WaveformWidget
|
||||
from cht.audio.waveform import WaveformEngine
|
||||
from cht.transcriber.engine import TranscriberEngine
|
||||
from cht.stream.manager import StreamManager, list_sessions
|
||||
from cht.stream.tracker import RecordingTracker
|
||||
from cht.agent.runner import AgentRunner, ACTIONS, check_claude_cli
|
||||
@@ -37,6 +42,8 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
# Timeline is the central state machine
|
||||
self._timeline = Timeline()
|
||||
self._agent = AgentRunner()
|
||||
self._waveform_engine = WaveformEngine()
|
||||
self._transcriber = TranscriberEngine()
|
||||
|
||||
# Main layout
|
||||
self._main_paned = Gtk.Paned(orientation=Gtk.Orientation.HORIZONTAL)
|
||||
@@ -165,6 +172,34 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
# Load existing frames into the strip
|
||||
self._load_existing_frames()
|
||||
|
||||
# Load existing transcript
|
||||
transcript_index = self._stream_mgr.transcript_dir / "index.json"
|
||||
if transcript_index.exists():
|
||||
self._transcriber.load_index(transcript_index)
|
||||
segs = self._transcriber.all_segments()
|
||||
if segs:
|
||||
self._append_transcript_segments(segs)
|
||||
self._append_agent_output(f" Loaded {len(segs)} transcript segments.\n")
|
||||
|
||||
# Compute waveform from existing recordings (background thread)
|
||||
if segments:
|
||||
from cht.stream import ffmpeg as ff
|
||||
|
||||
def _compute_waveform():
|
||||
audio_dir = self._stream_mgr.audio_dir
|
||||
audio_dir.mkdir(parents=True, exist_ok=True)
|
||||
full_wav = audio_dir / "full.wav"
|
||||
try:
|
||||
ff.extract_audio_chunk(segments[0], full_wav)
|
||||
self._waveform_engine.compute_full(full_wav)
|
||||
peaks = self._waveform_engine.peaks
|
||||
bucket_dur = self._waveform_engine.bucket_duration
|
||||
GLib.idle_add(self._waveform_widget.set_peaks, peaks.copy(), bucket_dur)
|
||||
except Exception as e:
|
||||
log.error("Waveform computation failed: %s", e)
|
||||
|
||||
Thread(target=_compute_waveform, daemon=True, name="waveform_load").start()
|
||||
|
||||
# Set up agent auth/model if not already done
|
||||
self._populate_model_dropdown()
|
||||
|
||||
@@ -197,6 +232,9 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
# Start scene detection
|
||||
self._stream_mgr.start_scene_detector(on_new_frames=self._on_new_scene_frames)
|
||||
|
||||
# Start audio extraction (waveform + transcription)
|
||||
self._stream_mgr.start_audio_extractor(on_new_audio=self._on_new_audio)
|
||||
|
||||
# Start polling for frame thumbnails
|
||||
GLib.timeout_add(1000, self._poll_frames)
|
||||
|
||||
@@ -237,6 +275,26 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
for f in frames:
|
||||
GLib.idle_add(self._timeline.add_scene_marker, f["timestamp"])
|
||||
|
||||
def _on_new_audio(self, wav_path, start_time, duration):
    """Called from audio extractor thread with new WAV chunk."""
    # Compute waveform peaks (fast, ~1ms)
    self._waveform_engine.append_chunk(wav_path, start_time)
    peaks = self._waveform_engine.peaks
    bucket_dur = self._waveform_engine.bucket_duration
    # Copy hands the GTK main loop a stable snapshot of the growing array;
    # UI mutations must go through GLib.idle_add, not this worker thread.
    GLib.idle_add(self._waveform_widget.set_peaks, peaks.copy(), bucket_dur)

    # Transcribe in separate thread (GPU-bound, ~1-2s per chunk)
    def _transcribe():
        new_segs = self._transcriber.transcribe_chunk(wav_path, time_offset=start_time)
        # Persist the full index after every chunk so a crash loses at most
        # one chunk's worth of transcript.
        if self._stream_mgr:
            self._transcriber.save_index(
                self._stream_mgr.transcript_dir / "index.json"
            )
        if new_segs:
            # Transcript panel update also goes via the GTK main loop.
            GLib.idle_add(self._append_transcript_segments, new_segs)

    Thread(target=_transcribe, daemon=True, name="transcriber").start()
|
||||
|
||||
def _check_recorder(self):
|
||||
"""Watchdog: restart recorder if it died (sender disconnect, etc)."""
|
||||
if not self._streaming or not self._stream_mgr:
|
||||
@@ -257,6 +315,10 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
log.info("Stopping stream...")
|
||||
self._timeline.reset()
|
||||
self._monitor.stop()
|
||||
self._waveform_engine.reset()
|
||||
self._waveform_widget.set_peaks(None, 0.05)
|
||||
self._transcriber.reset()
|
||||
self._transcript_view.get_buffer().set_text("")
|
||||
if self._tracker:
|
||||
self._tracker.stop()
|
||||
self._tracker = None
|
||||
@@ -298,8 +360,10 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
stream_frame.set_child(self._monitor)
|
||||
top_paned.set_start_child(stream_frame)
|
||||
|
||||
self._waveform_area = self._build_placeholder("Waveform", height=250, width=200)
|
||||
top_paned.set_end_child(self._waveform_area)
|
||||
self._waveform_widget = WaveformWidget(self._timeline)
|
||||
waveform_frame = Gtk.Frame()
|
||||
waveform_frame.set_child(self._waveform_widget)
|
||||
top_paned.set_end_child(waveform_frame)
|
||||
top_paned.set_position(650)
|
||||
right_box.append(top_paned)
|
||||
|
||||
@@ -819,6 +883,16 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
# Auto-scroll to bottom
|
||||
self._agent_output_view.scroll_to_iter(buf.get_end_iter(), 0, False, 0, 0)
|
||||
|
||||
def _append_transcript_segments(self, segments):
    """Render newly transcribed segments into the transcript text panel."""
    buffer = self._transcript_view.get_buffer()
    for segment in segments:
        start_m, start_s = divmod(int(segment.start), 60)
        end_m, end_s = divmod(int(segment.end), 60)
        stamp = f"[{start_m:02d}:{start_s:02d}-{end_m:02d}:{end_s:02d}]"
        buffer.insert(buffer.get_end_iter(), f"{stamp} {segment.id} {segment.text}\n")
    # Keep the newest line visible.
    self._transcript_view.scroll_to_iter(buffer.get_end_iter(), 0, False, 0, 0)
|
||||
|
||||
# -- Frame thumbnails --
|
||||
|
||||
def _load_existing_frames(self):
|
||||
|
||||
@@ -15,6 +15,7 @@ dependencies = [
|
||||
"claude-agent-sdk",
|
||||
"openai",
|
||||
"pygments",
|
||||
"faster-whisper",
|
||||
]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
|
||||
Reference in New Issue
Block a user