Add audio extraction, waveform display, and incremental transcription

Commit d61e2a5492 (parent 0b5575f3b3), 2026-04-02 22:57:21 -03:00
13 changed files with 556 additions and 11 deletions

cht/agent/base.py

@@ -18,12 +18,22 @@ class FrameRef:
    timestamp: float  # seconds into recording
@dataclass
class TranscriptRef:
    id: str       # "T0001"
    start: float  # seconds into recording
    end: float    # seconds into recording
    text: str
@dataclass
class SessionContext:
    session_dir: Path
    frames: list[FrameRef]  # all captured frames so far
    duration: float         # current recording duration (seconds)
-    mentioned_frames: list[FrameRef] = field(default_factory=list)  # @-referenced in message
+    mentioned_frames: list[FrameRef] = field(default_factory=list)
    transcript_segments: list[TranscriptRef] = field(default_factory=list)
    mentioned_transcripts: list[TranscriptRef] = field(default_factory=list)

class AgentProvider(ABC):
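For orientation, a minimal sketch of constructing the extended context (all values hypothetical; only the two transcript fields are new in this commit):

from pathlib import Path

seg = TranscriptRef(id="T0001", start=12.4, end=15.9, text="let's look at the traceback")
ctx = SessionContext(
    session_dir=Path("sessions/2026-04-02"),
    frames=[],
    duration=73.0,
    transcript_segments=[seg],
    mentioned_transcripts=[seg],  # e.g. the user typed "@T1" in their message
)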


@@ -47,6 +47,21 @@ def _build_prompt(message: str, context: SessionContext) -> str:
        fm, fs = divmod(int(f.timestamp), 60)
        lines.append(f"  {f.id} at {fm:02d}:{fs:02d} {f.path}")
    # Transcript
    if context.transcript_segments:
        lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
        for t in context.transcript_segments:
            tm1, ts1 = divmod(int(t.start), 60)
            tm2, ts2 = divmod(int(t.end), 60)
            lines.append(f"  {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
    if context.mentioned_transcripts:
        lines.append("\nTranscript segments referenced in this message:")
        for t in context.mentioned_transcripts:
            tm1, ts1 = divmod(int(t.start), 60)
            tm2, ts2 = divmod(int(t.end), 60)
            lines.append(f"  {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
lines.append(f"\nUser message: {message}") lines.append(f"\nUser message: {message}")
return "\n".join(lines) return "\n".join(lines)


@@ -95,10 +95,17 @@ class OpenAICompatProvider(AgentProvider):
        # Build context header
        m, s = divmod(int(context.duration), 60)
-        ctx_text = (
-            f"Recording duration: {m:02d}:{s:02d}\n"
-            f"Total frames: {len(context.frames)}\n"
-        )
+        ctx_lines = [
+            f"Recording duration: {m:02d}:{s:02d}",
+            f"Total frames: {len(context.frames)}",
+        ]
        if context.transcript_segments:
            ctx_lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
            for t in context.transcript_segments:
                tm1, ts1 = divmod(int(t.start), 60)
                tm2, ts2 = divmod(int(t.end), 60)
                ctx_lines.append(f"  {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
        ctx_text = "\n".join(ctx_lines) + "\n"

        frames_to_send = context.mentioned_frames

cht/agent/runner.py

@@ -15,7 +15,7 @@ from pathlib import Path
from threading import Thread
from typing import Callable

-from cht.agent.base import AgentProvider, FrameRef, SessionContext
+from cht.agent.base import AgentProvider, FrameRef, TranscriptRef, SessionContext

log = logging.getLogger(__name__)
@@ -98,6 +98,33 @@ def _load_frames(frames_dir: Path) -> list[FrameRef]:
        return []
def _load_transcript(transcript_dir: Path) -> list[TranscriptRef]:
    index_path = transcript_dir / "index.json"
    if not index_path.exists():
        return []
    try:
        entries = json.loads(index_path.read_text())
        return [TranscriptRef(**e) for e in entries]
    except Exception as e:
        log.warning("Could not load transcript index: %s", e)
        return []
def _parse_transcript_mentions(message: str, segments: list[TranscriptRef]) -> list[TranscriptRef]:
    """Extract @T references from message. Accepts @T0001, @t1, @T1."""
    mentioned = []
    seen = set()
    for match in re.finditer(r"@[Tt](\d+)", message):
        num = int(match.group(1))
        tid = f"T{num:04d}"
        if tid not in seen:
            seg = next((s for s in segments if s.id == tid), None)
            if seg:
                mentioned.append(seg)
            seen.add(tid)
    return mentioned
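All accepted spellings normalize to the same zero-padded id, and duplicates collapse; a quick check against a hypothetical segment list:

segs = [TranscriptRef(id="T0003", start=30.0, end=33.5, text="run the tests")]
assert _parse_transcript_mentions("compare @t3 with @T0003", segs) == [segs[0]]  # one hit, deduped
assert _parse_transcript_mentions("@T0099 has no match", segs) == []             # unknown ids are dropped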
class AgentRunner:
    """Runs agent queries in a background thread, streams chunks to a callback."""
@@ -152,12 +179,16 @@ class AgentRunner:
        try:
            provider = self._get_provider()
            frames = _load_frames(stream_mgr.frames_dir)
-            mentioned = _parse_mentions(message, frames)
+            mentioned_frames = _parse_mentions(message, frames)
            transcript = _load_transcript(stream_mgr.transcript_dir)
            mentioned_transcripts = _parse_transcript_mentions(message, transcript)
            context = SessionContext(
                session_dir=stream_mgr.session_dir,
                frames=frames,
                duration=tracker.duration if tracker else 0.0,
-                mentioned_frames=mentioned,
+                mentioned_frames=mentioned_frames,
                transcript_segments=transcript,
                mentioned_transcripts=mentioned_transcripts,
            )
            for chunk in provider.stream(message, context):
                on_chunk(chunk)

cht/audio/__init__.py (new, empty file)

cht/audio/waveform.py (new file, 90 lines)

@@ -0,0 +1,90 @@
"""
Waveform peak computation from WAV files.
Reads 16kHz mono PCM WAV files (as produced by ffmpeg extract_audio_chunk),
computes RMS amplitude per time bucket, and stores peaks as a numpy array
that grows incrementally during live recording.
"""
import logging
import wave
import numpy as np
log = logging.getLogger(__name__)
class WaveformEngine:
    """Computes and accumulates waveform peak data from WAV chunks."""

    def __init__(self, bucket_ms=50):
        self._bucket_ms = bucket_ms
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0

    @property
    def peaks(self):
        return self._peaks

    @property
    def bucket_duration(self):
        return self._bucket_ms / 1000.0

    @property
    def total_duration(self):
        return self._total_duration

    def append_chunk(self, wav_path, start_time):
        """Read a WAV chunk and append its peaks to the internal array."""
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        new_peaks = self._compute_rms(samples, sample_rate)
        if len(new_peaks) > 0:
            self._peaks = np.concatenate([self._peaks, new_peaks])
        chunk_duration = len(samples) / sample_rate
        self._total_duration = start_time + chunk_duration
        log.info("Waveform: +%d peaks (total %d, %.1fs)",
                 len(new_peaks), len(self._peaks), self._total_duration)
    def compute_full(self, wav_path):
        """Compute all peaks from a complete WAV file (for loaded sessions)."""
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        self._peaks = self._compute_rms(samples, sample_rate)
        self._total_duration = len(samples) / sample_rate
        log.info("Waveform full: %d peaks, %.1fs", len(self._peaks), self._total_duration)

    def reset(self):
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0

    def _read_wav(self, wav_path):
        """Read a 16-bit PCM WAV file into a float32 numpy array."""
        try:
            with wave.open(str(wav_path), "rb") as wf:
                n_frames = wf.getnframes()
                if n_frames == 0:
                    return None, 0
                sample_rate = wf.getframerate()
                raw = wf.readframes(n_frames)
            samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
            return samples, sample_rate
        except Exception as e:
            log.warning("Failed to read WAV %s: %s", wav_path, e)
            return None, 0
    def _compute_rms(self, samples, sample_rate):
        """Compute RMS amplitude per bucket."""
        bucket_size = int(sample_rate * self._bucket_ms / 1000)
        if bucket_size <= 0 or len(samples) < bucket_size:
            return np.empty(0, dtype=np.float32)
        # Trim to whole buckets
        n_buckets = len(samples) // bucket_size
        trimmed = samples[:n_buckets * bucket_size].reshape(n_buckets, bucket_size)
        rms = np.sqrt(np.mean(trimmed ** 2, axis=1)).astype(np.float32)
        return rms
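A minimal usage sketch (paths and chunk lengths hypothetical): feed chunks as the extractor produces them, then hand the growing peak array to the UI.

engine = WaveformEngine(bucket_ms=50)
engine.append_chunk("audio/chunk_0000.wav", start_time=0.0)  # a 4s chunk -> 80 peaks
engine.append_chunk("audio/chunk_0001.wav", start_time=4.0)  # next chunk abuts at 4.0s
print(len(engine.peaks), engine.bucket_duration)             # e.g. 160 buckets of 0.05s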

cht/config.py

@@ -19,3 +19,12 @@ SCENE_THRESHOLD = 0.10 # 0-1, lower = more sensitive; 0.1 catches slide/window
# Segment recording
SEGMENT_DURATION = 60  # seconds per .ts segment
# Audio extraction
AUDIO_EXTRACT_INTERVAL = 3  # seconds between extraction cycles
AUDIO_SAFETY_MARGIN = 2     # seconds safety margin (matches scene detector)
WAVEFORM_BUCKET_MS = 50     # milliseconds per waveform peak bucket

# Transcription
WHISPER_MODEL = "small"     # "small" for speed, "medium" for accuracy
WHISPER_DEVICE = "cuda"     # "cuda" or "cpu"

cht/stream/ffmpeg.py

@@ -122,6 +122,35 @@ def extract_scene_frames(input_path, output_dir, scene_threshold=0.10,
    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
def extract_audio_chunk(input_path, output_path, start_time=0.0, duration=None):
    """Extract audio from recording as 16kHz mono WAV (optimal for Whisper).

    Uses input-level seeking (-ss before -i) for fast keyframe-based seek.
    Returns (stdout, stderr) as decoded strings.
    """
    kwargs = {"ss": start_time}
    if duration is not None:
        kwargs["t"] = duration
    stream = ffmpeg.input(str(input_path), **kwargs)
    output = (
        ffmpeg.output(
            stream, str(output_path),
            acodec="pcm_s16le", ac=1, ar=16000,
            vn=None,
        )
        .overwrite_output()
        .global_args(*QUIET_ARGS)
    )
    log.info("extract_audio_chunk: %s", " ".join(output.compile()))
    try:
        stdout, stderr = output.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        stdout = e.stdout or b""
        stderr = e.stderr or b""
        log.debug("ffmpeg audio error: %s",
                  stderr.decode("utf-8", errors="replace").strip().split("\n")[-1])
    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
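For reference, a call like the one below compiles to roughly this command line (a sketch: paths are hypothetical, and exact argument order plus QUIET_ARGS depend on the ffmpeg-python build):

# extract_audio_chunk("stream/rec_000.ts", "audio/chunk_0000.wav", start_time=12.0, duration=6.0)
# compiles to approximately:
#   ffmpeg -ss 12.0 -t 6.0 -i stream/rec_000.ts -acodec pcm_s16le -ac 1 -ar 16000 -vn audio/chunk_0000.wav -y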
def extract_frame_at(input_path, output_path, timestamp):
    """Extract a single frame at the given timestamp."""
    output = (

cht/stream/manager.py

@@ -20,6 +20,8 @@ from cht.config import (
    RELAY_PORT,
    SCENE_THRESHOLD,
    SESSIONS_DIR,
    AUDIO_EXTRACT_INTERVAL,
    AUDIO_SAFETY_MARGIN,
)

from cht.stream import ffmpeg as ff
@@ -46,6 +48,7 @@ class StreamManager:
        self.stream_dir = self.session_dir / "stream"
        self.frames_dir = self.session_dir / "frames"
        self.transcript_dir = self.session_dir / "transcript"
        self.audio_dir = self.session_dir / "audio"
        self.agent_dir = self.session_dir / "agent"

        self._procs = {}
@@ -103,7 +106,7 @@ class StreamManager:
        return total

    def setup_dirs(self):
-        for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.agent_dir):
+        for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.audio_dir, self.agent_dir):
            d.mkdir(parents=True, exist_ok=True)

    @property
@@ -349,6 +352,77 @@ class StreamManager:
        Thread(target=_capture, daemon=True, name="capture_now").start()
    # -- Audio Extraction --

    def start_audio_extractor(self, on_new_audio=None):
        """Periodically extract audio from the growing recording as WAV chunks.

        Same incremental pattern as the scene detector: poll the recording,
        extract the newly safe time range, and call back with each chunk.

        Args:
            on_new_audio: callback(wav_path, start_time, duration)
        """
        self._on_new_audio = on_new_audio
        self.audio_dir.mkdir(parents=True, exist_ok=True)

        def _extract():
            processed_time = 0.0
            chunk_num = 0
            current_segment = None
            while "stop" not in self._stop_flags:
                time.sleep(AUDIO_EXTRACT_INTERVAL)
                seg = self.recording_path
                if not seg.exists():
                    continue
                if seg != current_segment:
                    current_segment = seg
                    processed_time = 0.0
                    chunk_num = 0
                    log.info("Audio extractor: switched to %s", seg.name)
                if seg.stat().st_size < 100_000:
                    continue
                safe_duration = self._estimate_safe_duration()
                if safe_duration is None or safe_duration <= 0:
                    continue
                process_to = safe_duration - AUDIO_SAFETY_MARGIN
                if process_to <= processed_time + 1.0:
                    continue
                chunk_duration = process_to - processed_time
                wav_path = self.audio_dir / f"chunk_{chunk_num:04d}.wav"
                try:
                    ff.extract_audio_chunk(
                        seg, wav_path,
                        start_time=processed_time,
                        duration=chunk_duration,
                    )
                except Exception as e:
                    log.error("Audio extraction failed: %s", e)
                    continue
                if wav_path.exists() and wav_path.stat().st_size > 100:
                    log.info("Audio chunk: %s (%.1fs → %.1fs)",
                             wav_path.name, processed_time, process_to)
                    if self._on_new_audio:
                        self._on_new_audio(wav_path, processed_time, chunk_duration)
                    chunk_num += 1
                    processed_time = process_to
            log.info("Audio extractor stopped")

        t = Thread(target=_extract, daemon=True, name="audio_extractor")
        t.start()
        self._threads["audio_extractor"] = t
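To make the pacing concrete, a hypothetical trace with AUDIO_EXTRACT_INTERVAL=3 and AUDIO_SAFETY_MARGIN=2, assuming _estimate_safe_duration tracks wall-clock recording time:

# poll at t=3s: safe_duration=3, process_to=1, 1 <= 0+1.0  -> skip (not enough new audio)
# poll at t=6s: safe_duration=6, process_to=4              -> chunk_0000.wav covers 0.0-4.0s
# poll at t=9s: safe_duration=9, process_to=7              -> chunk_0001.wav covers 4.0-7.0s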
    # -- Lifecycle --

    def stop_all(self):

cht/transcriber/engine.py (new file, 98 lines)

@@ -0,0 +1,98 @@
"""
Transcription engine using faster-whisper.
Processes WAV chunks incrementally, assigns sequential IDs (T0001, T0002, ...),
and persists to transcript/index.json in the session directory.
"""
import json
import logging
from dataclasses import dataclass, asdict
from pathlib import Path
log = logging.getLogger(__name__)
@dataclass
class TranscriptSegment:
    id: str       # "T0001"
    start: float  # seconds into recording
    end: float    # seconds into recording
    text: str     # transcribed text
class TranscriberEngine:
    """Incremental transcription via faster-whisper with GPU acceleration."""

    def __init__(self, model_size="small", device="cuda"):
        self._model = None
        self._model_size = model_size
        self._device = device
        self._segments: list[TranscriptSegment] = []
        self._next_id = 1

    def _ensure_model(self):
        if self._model is not None:
            return
        log.info("Loading whisper model: %s (device=%s)", self._model_size, self._device)
        from faster_whisper import WhisperModel
        self._model = WhisperModel(
            self._model_size,
            device=self._device,
            compute_type="float16" if self._device == "cuda" else "int8",
        )
        log.info("Whisper model loaded")

    def transcribe_chunk(self, wav_path, time_offset=0.0) -> list[TranscriptSegment]:
        """Transcribe a WAV chunk. Returns new segments with absolute timestamps."""
        self._ensure_model()
        try:
            segments_iter, _info = self._model.transcribe(
                str(wav_path),
                beam_size=5,
                vad_filter=True,
            )
        except Exception as e:
            log.error("Whisper transcription failed: %s", e)
            return []
        new_segments = []
        for seg in segments_iter:
            text = seg.text.strip()
            if not text:
                continue
            tid = f"T{self._next_id:04d}"
            self._next_id += 1
            entry = TranscriptSegment(
                id=tid,
                start=time_offset + seg.start,
                end=time_offset + seg.end,
                text=text,
            )
            self._segments.append(entry)
            new_segments.append(entry)
        return new_segments
    def all_segments(self) -> list[TranscriptSegment]:
        return list(self._segments)

    def save_index(self, path: Path):
        data = [asdict(s) for s in self._segments]
        path.write_text(json.dumps(data, indent=2))

    def load_index(self, path: Path):
        try:
            data = json.loads(path.read_text())
        except Exception as e:
            log.warning("Failed to load transcript index: %s", e)
            return
        self._segments = [TranscriptSegment(**e) for e in data]
        if self._segments:
            last_num = max(int(s.id.lstrip("T")) for s in self._segments)
            self._next_id = last_num + 1
        log.info("Loaded %d transcript segments", len(self._segments))

    def reset(self):
        self._segments.clear()
        self._next_id = 1
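A usage sketch, including the index.json shape that save_index persists (wav path and text hypothetical):

engine = TranscriberEngine(model_size="small", device="cuda")
new = engine.transcribe_chunk("audio/chunk_0000.wav", time_offset=42.0)
engine.save_index(Path("transcript/index.json"))
# index.json holds a list of plain dicts, e.g.:
#   [{"id": "T0001", "start": 43.8, "end": 46.2, "text": "okay, the recording is live"}]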

cht/ui/waveform.py (new file, 107 lines)

@@ -0,0 +1,107 @@
"""
WaveformWidget: GTK4 DrawingArea that renders waveform peaks with a playhead.
Driven by Timeline "changed" signal — redraws when cursor or duration changes.
Peak data is set externally via set_peaks() from GLib.idle_add.
"""
import logging
import math
import numpy as np
import gi
gi.require_version("Gtk", "4.0")
from gi.repository import Gtk, GLib
log = logging.getLogger(__name__)
class WaveformWidget(Gtk.Box):
    """Waveform display synced to Timeline state."""

    def __init__(self, timeline, **kwargs):
        super().__init__(orientation=Gtk.Orientation.VERTICAL, **kwargs)
        self._timeline = timeline
        self._peaks = None
        self._bucket_duration = 0.05

        label = Gtk.Label(label="Waveform")
        label.add_css_class("heading")
        label.set_margin_top(4)
        label.set_margin_bottom(4)
        self.append(label)

        self._area = Gtk.DrawingArea()
        self._area.set_content_height(250)
        self._area.set_hexpand(True)
        self._area.set_vexpand(True)
        self._area.set_draw_func(self._draw)
        self.append(self._area)

        timeline.connect("changed", self._on_timeline_changed)

    def set_peaks(self, peaks, bucket_duration):
        """Update peak data. Call from GLib.idle_add."""
        self._peaks = peaks
        self._bucket_duration = bucket_duration
        self._area.queue_draw()

    def _on_timeline_changed(self, timeline):
        self._area.queue_draw()
    def _draw(self, area, cr, width, height):
        # Background
        cr.set_source_rgb(0.1, 0.1, 0.12)
        cr.rectangle(0, 0, width, height)
        cr.fill()

        state = self._timeline.state
        duration = state.duration
        mid_y = height / 2

        # Center line
        cr.set_source_rgba(0.3, 0.3, 0.35, 1.0)
        cr.set_line_width(1)
        cr.move_to(0, mid_y)
        cr.line_to(width, mid_y)
        cr.stroke()

        # Draw peaks, mapping each pixel column to its peak bucket
        if self._peaks is not None and len(self._peaks) > 0 and duration > 0:
            n_peaks = len(self._peaks)
            max_peak = np.max(self._peaks) if np.max(self._peaks) > 0 else 1.0
            for x in range(width):
                # Time at this pixel, then the corresponding peak index
                t = (x / width) * duration
                idx = int(t / self._bucket_duration)
                if 0 <= idx < n_peaks:
                    val = self._peaks[idx] / max_peak
                    bar_h = val * (height * 0.45)  # 90% of half-height
                    # Green gradient based on amplitude
                    cr.set_source_rgba(0.2, 0.6 + val * 0.4, 0.3, 0.85)
                    cr.rectangle(x, mid_y - bar_h, 1, bar_h * 2)
                    cr.fill()

        # Scene markers
        if duration > 0:
            cr.set_source_rgba(1.0, 1.0, 0.3, 0.3)
            cr.set_line_width(1)
            for marker in state.scene_markers:
                mx = (marker / duration) * width
                cr.move_to(mx, 0)
                cr.line_to(mx, height)
                cr.stroke()

        # Playhead
        if duration > 0:
            px = (state.cursor / duration) * width
            cr.set_source_rgba(1.0, 0.3, 0.3, 0.9)
            cr.set_line_width(2)
            cr.move_to(px, 0)
            cr.line_to(px, height)
            cr.stroke()


@@ -10,9 +10,14 @@ gi.require_version("Adw", "1")
gi.require_version("GdkPixbuf", "2.0") gi.require_version("GdkPixbuf", "2.0")
from gi.repository import Gtk, Gdk, Adw, GLib, Pango, GdkPixbuf from gi.repository import Gtk, Gdk, Adw, GLib, Pango, GdkPixbuf
from threading import Thread
from cht.config import APP_NAME, SCENE_THRESHOLD from cht.config import APP_NAME, SCENE_THRESHOLD
from cht.ui.timeline import Timeline, TimelineControls from cht.ui.timeline import Timeline, TimelineControls
from cht.ui.monitor import MonitorWidget from cht.ui.monitor import MonitorWidget
from cht.ui.waveform import WaveformWidget
from cht.audio.waveform import WaveformEngine
from cht.transcriber.engine import TranscriberEngine
from cht.stream.manager import StreamManager, list_sessions from cht.stream.manager import StreamManager, list_sessions
from cht.stream.tracker import RecordingTracker from cht.stream.tracker import RecordingTracker
from cht.agent.runner import AgentRunner, ACTIONS, check_claude_cli from cht.agent.runner import AgentRunner, ACTIONS, check_claude_cli
@@ -37,6 +42,8 @@ class ChtWindow(Adw.ApplicationWindow):
        # Timeline is the central state machine
        self._timeline = Timeline()
        self._agent = AgentRunner()
        self._waveform_engine = WaveformEngine(bucket_ms=WAVEFORM_BUCKET_MS)
        self._transcriber = TranscriberEngine(WHISPER_MODEL, WHISPER_DEVICE)

        # Main layout
        self._main_paned = Gtk.Paned(orientation=Gtk.Orientation.HORIZONTAL)
@@ -165,6 +172,34 @@ class ChtWindow(Adw.ApplicationWindow):
        # Load existing frames into the strip
        self._load_existing_frames()
        # Load existing transcript
        transcript_index = self._stream_mgr.transcript_dir / "index.json"
        if transcript_index.exists():
            self._transcriber.load_index(transcript_index)
            segs = self._transcriber.all_segments()
            if segs:
                self._append_transcript_segments(segs)
                self._append_agent_output(f"  Loaded {len(segs)} transcript segments.\n")

        # Compute waveform from existing recordings (background thread)
        if segments:
            from cht.stream import ffmpeg as ff

            def _compute_waveform():
                audio_dir = self._stream_mgr.audio_dir
                audio_dir.mkdir(parents=True, exist_ok=True)
                full_wav = audio_dir / "full.wav"
                try:
                    # First segment only; multi-segment sessions would need concatenation
                    ff.extract_audio_chunk(segments[0], full_wav)
                    self._waveform_engine.compute_full(full_wav)
                    peaks = self._waveform_engine.peaks
                    bucket_dur = self._waveform_engine.bucket_duration
                    GLib.idle_add(self._waveform_widget.set_peaks, peaks.copy(), bucket_dur)
                except Exception as e:
                    log.error("Waveform computation failed: %s", e)

            Thread(target=_compute_waveform, daemon=True, name="waveform_load").start()
        # Set up agent auth/model if not already done
        self._populate_model_dropdown()
@@ -197,6 +232,9 @@ class ChtWindow(Adw.ApplicationWindow):
        # Start scene detection
        self._stream_mgr.start_scene_detector(on_new_frames=self._on_new_scene_frames)

        # Start audio extraction (waveform + transcription)
        self._stream_mgr.start_audio_extractor(on_new_audio=self._on_new_audio)

        # Start polling for frame thumbnails
        GLib.timeout_add(1000, self._poll_frames)
@@ -237,6 +275,26 @@ class ChtWindow(Adw.ApplicationWindow):
        for f in frames:
            GLib.idle_add(self._timeline.add_scene_marker, f["timestamp"])
    def _on_new_audio(self, wav_path, start_time, duration):
        """Called from audio extractor thread with new WAV chunk."""
        # Compute waveform peaks (fast, ~1ms)
        self._waveform_engine.append_chunk(wav_path, start_time)
        peaks = self._waveform_engine.peaks
        bucket_dur = self._waveform_engine.bucket_duration
        GLib.idle_add(self._waveform_widget.set_peaks, peaks.copy(), bucket_dur)

        # Transcribe in a separate thread (GPU-bound, ~1-2s per chunk)
        def _transcribe():
            new_segs = self._transcriber.transcribe_chunk(wav_path, time_offset=start_time)
            if self._stream_mgr:
                self._transcriber.save_index(
                    self._stream_mgr.transcript_dir / "index.json"
                )
            if new_segs:
                GLib.idle_add(self._append_transcript_segments, new_segs)

        Thread(target=_transcribe, daemon=True, name="transcriber").start()
    def _check_recorder(self):
        """Watchdog: restart recorder if it died (sender disconnect, etc)."""
        if not self._streaming or not self._stream_mgr:
@@ -257,6 +315,10 @@ class ChtWindow(Adw.ApplicationWindow):
log.info("Stopping stream...") log.info("Stopping stream...")
self._timeline.reset() self._timeline.reset()
self._monitor.stop() self._monitor.stop()
self._waveform_engine.reset()
self._waveform_widget.set_peaks(None, 0.05)
self._transcriber.reset()
self._transcript_view.get_buffer().set_text("")
if self._tracker: if self._tracker:
self._tracker.stop() self._tracker.stop()
self._tracker = None self._tracker = None
@@ -298,8 +360,10 @@ class ChtWindow(Adw.ApplicationWindow):
        stream_frame.set_child(self._monitor)
        top_paned.set_start_child(stream_frame)

-        self._waveform_area = self._build_placeholder("Waveform", height=250, width=200)
-        top_paned.set_end_child(self._waveform_area)
+        self._waveform_widget = WaveformWidget(self._timeline)
+        waveform_frame = Gtk.Frame()
+        waveform_frame.set_child(self._waveform_widget)
+        top_paned.set_end_child(waveform_frame)
        top_paned.set_position(650)

        right_box.append(top_paned)
@@ -819,6 +883,16 @@ class ChtWindow(Adw.ApplicationWindow):
        # Auto-scroll to bottom
        self._agent_output_view.scroll_to_iter(buf.get_end_iter(), 0, False, 0, 0)
    def _append_transcript_segments(self, segments):
        """Append transcription segments to the transcript panel."""
        buf = self._transcript_view.get_buffer()
        for seg in segments:
            m1, s1 = divmod(int(seg.start), 60)
            m2, s2 = divmod(int(seg.end), 60)
            line = f"[{m1:02d}:{s1:02d}-{m2:02d}:{s2:02d}] {seg.id} {seg.text}\n"
            buf.insert(buf.get_end_iter(), line)
        self._transcript_view.scroll_to_iter(buf.get_end_iter(), 0, False, 0, 0)
    # -- Frame thumbnails --

    def _load_existing_frames(self):

pyproject.toml

@@ -15,6 +15,7 @@ dependencies = [
"claude-agent-sdk", "claude-agent-sdk",
"openai", "openai",
"pygments", "pygments",
"faster-whisper",
] ]
[tool.setuptools.packages.find] [tool.setuptools.packages.find]