From d61e2a54927e10c4917c2313127479532fc1314b Mon Sep 17 00:00:00 2001 From: buenosairesam Date: Thu, 2 Apr 2026 22:57:21 -0300 Subject: [PATCH] audio and transcript --- cht/agent/base.py | 12 +++- cht/agent/claude_sdk_provider.py | 15 ++++ cht/agent/openai_compat_provider.py | 15 ++-- cht/agent/runner.py | 37 +++++++++- cht/audio/__init__.py | 0 cht/audio/waveform.py | 90 +++++++++++++++++++++++ cht/config.py | 9 +++ cht/stream/ffmpeg.py | 29 ++++++++ cht/stream/manager.py | 76 +++++++++++++++++++- cht/transcriber/engine.py | 98 +++++++++++++++++++++++++ cht/ui/waveform.py | 107 ++++++++++++++++++++++++++++ cht/window.py | 78 +++++++++++++++++++- pyproject.toml | 1 + 13 files changed, 556 insertions(+), 11 deletions(-) create mode 100644 cht/audio/__init__.py create mode 100644 cht/audio/waveform.py create mode 100644 cht/transcriber/engine.py create mode 100644 cht/ui/waveform.py diff --git a/cht/agent/base.py b/cht/agent/base.py index 81a4d16..7335123 100644 --- a/cht/agent/base.py +++ b/cht/agent/base.py @@ -18,12 +18,22 @@ class FrameRef: timestamp: float # seconds into recording +@dataclass +class TranscriptRef: + id: str # "T0001" + start: float # seconds into recording + end: float # seconds into recording + text: str + + @dataclass class SessionContext: session_dir: Path frames: list[FrameRef] # all captured frames so far duration: float # current recording duration (seconds) - mentioned_frames: list[FrameRef] = field(default_factory=list) # @-referenced in message + mentioned_frames: list[FrameRef] = field(default_factory=list) + transcript_segments: list[TranscriptRef] = field(default_factory=list) + mentioned_transcripts: list[TranscriptRef] = field(default_factory=list) class AgentProvider(ABC): diff --git a/cht/agent/claude_sdk_provider.py b/cht/agent/claude_sdk_provider.py index b9cd448..6f8deac 100644 --- a/cht/agent/claude_sdk_provider.py +++ b/cht/agent/claude_sdk_provider.py @@ -47,6 +47,21 @@ def _build_prompt(message: str, context: 
SessionContext) -> str: fm, fs = divmod(int(f.timestamp), 60) lines.append(f" {f.id} at {fm:02d}:{fs:02d} — {f.path}") + # Transcript + if context.transcript_segments: + lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):") + for t in context.transcript_segments: + tm1, ts1 = divmod(int(t.start), 60) + tm2, ts2 = divmod(int(t.end), 60) + lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}") + + if context.mentioned_transcripts: + lines.append("\nTranscript segments referenced in this message:") + for t in context.mentioned_transcripts: + tm1, ts1 = divmod(int(t.start), 60) + tm2, ts2 = divmod(int(t.end), 60) + lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}") + lines.append(f"\nUser message: {message}") return "\n".join(lines) diff --git a/cht/agent/openai_compat_provider.py b/cht/agent/openai_compat_provider.py index c76fbcb..20a045a 100644 --- a/cht/agent/openai_compat_provider.py +++ b/cht/agent/openai_compat_provider.py @@ -95,10 +95,17 @@ class OpenAICompatProvider(AgentProvider): # Build context header m, s = divmod(int(context.duration), 60) - ctx_text = ( - f"Recording duration: {m:02d}:{s:02d}\n" - f"Total frames: {len(context.frames)}\n" - ) + ctx_lines = [ + f"Recording duration: {m:02d}:{s:02d}", + f"Total frames: {len(context.frames)}", + ] + if context.transcript_segments: + ctx_lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):") + for t in context.transcript_segments: + tm1, ts1 = divmod(int(t.start), 60) + tm2, ts2 = divmod(int(t.end), 60) + ctx_lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}") + ctx_text = "\n".join(ctx_lines) + "\n" frames_to_send = context.mentioned_frames diff --git a/cht/agent/runner.py b/cht/agent/runner.py index 26de4ed..7c44ed8 100644 --- a/cht/agent/runner.py +++ b/cht/agent/runner.py @@ -15,7 +15,7 @@ from pathlib import Path from threading import Thread from typing import Callable -from cht.agent.base 
def _load_transcript(transcript_dir: Path) -> list[TranscriptRef]:
    """Load transcript segments from ``<transcript_dir>/index.json``.

    Args:
        transcript_dir: Directory that holds the transcript ``index.json``.

    Returns:
        One ``TranscriptRef`` per index entry, in file order. An empty list
        is returned when the index is missing, unreadable, not valid JSON, or
        contains entries that do not match the ``TranscriptRef`` fields.
    """
    index_path = transcript_dir / "index.json"
    try:
        # EAFP: read directly instead of exists()+read, so an index that
        # appears or disappears between the two calls cannot raise here.
        entries = json.loads(index_path.read_text(encoding="utf-8"))
        return [TranscriptRef(**entry) for entry in entries]
    except FileNotFoundError:
        # No transcript yet is the normal case, not worth a warning.
        return []
    except (OSError, ValueError, TypeError) as e:
        # ValueError covers JSON/Unicode decode errors; TypeError covers
        # entries with missing or unexpected keys.
        log.warning("Could not load transcript index: %s", e)
        return []


def _parse_transcript_mentions(message: str, segments: list[TranscriptRef]) -> list[TranscriptRef]:
    """Extract @T references from message. Accepts @T0001, @t1, @T1.

    Args:
        message: Raw user message that may contain ``@T<number>`` mentions.
        segments: Known transcript segments to resolve mentions against.

    Returns:
        The mentioned segments in order of first appearance, without
        duplicates. Mentions that match no segment are ignored.
    """
    # Index once so each mention is an O(1) lookup instead of a list scan.
    # setdefault keeps the first segment when IDs repeat, matching a
    # first-match linear search.
    by_id = {}
    for seg in segments:
        by_id.setdefault(seg.id, seg)

    mentioned = []
    seen = set()
    for match in re.finditer(r"@[Tt](\d+)", message):
        # Normalise "@t1" / "@T0001" to the canonical zero-padded ID.
        tid = f"T{int(match.group(1)):04d}"
        if tid in seen:
            continue
        seg = by_id.get(tid)
        if seg is not None:
            mentioned.append(seg)
            seen.add(tid)
    return mentioned
class WaveformEngine:
    """Computes and accumulates waveform peak data from WAV chunks.

    A peak is the RMS amplitude of one ``bucket_ms``-long window of samples
    scaled to [-1, 1). Peaks live in a float32 numpy array that grows as
    chunks are appended during live recording.

    Samples that do not fill a whole bucket at the end of a chunk are carried
    over to the next ``append_chunk`` call, so the peak index stays aligned
    with the cumulative sample count instead of losing up to one bucket per
    chunk.
    """

    def __init__(self, bucket_ms=50):
        self._bucket_ms = bucket_ms
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0
        # Tail of the previous chunk that was too short to form a bucket.
        self._carry = np.empty(0, dtype=np.float32)

    @property
    def peaks(self):
        """All peaks accumulated so far (float32 array, one per bucket)."""
        return self._peaks

    @property
    def bucket_duration(self):
        """Length of one peak bucket in seconds."""
        return self._bucket_ms / 1000.0

    @property
    def total_duration(self):
        """End time in seconds of the most recently processed audio."""
        return self._total_duration

    def append_chunk(self, wav_path, start_time):
        """Read a WAV chunk and append its peaks to the internal array.

        Args:
            wav_path: Path of the WAV chunk to read.
            start_time: Offset in seconds of the chunk's first sample; used
                only to update ``total_duration``.

        Unreadable or empty files are ignored.
        """
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        chunk_duration = len(samples) / sample_rate

        # Prepend what the previous chunk left over, then split into the part
        # that forms whole buckets and the new leftover.
        pending = np.concatenate([self._carry, samples]) if len(self._carry) else samples
        bucket_size = self._bucket_size(sample_rate)
        if bucket_size > 0:
            whole = (len(pending) // bucket_size) * bucket_size
            new_peaks = self._compute_rms(pending[:whole], sample_rate)
            self._carry = pending[whole:].copy()
        else:
            # Degenerate bucket length: nothing can be bucketed, so do not
            # let the carry buffer grow without bound.
            new_peaks = np.empty(0, dtype=np.float32)
            self._carry = np.empty(0, dtype=np.float32)

        if len(new_peaks) > 0:
            self._peaks = np.concatenate([self._peaks, new_peaks])
        self._total_duration = start_time + chunk_duration
        log.info("Waveform: +%d peaks (total %d, %.1fs)",
                 len(new_peaks), len(self._peaks), self._total_duration)

    def compute_full(self, wav_path):
        """Compute all peaks from a complete WAV file (for loaded sessions).

        Any previously accumulated state is discarded first. If the file is
        unreadable or empty the engine is left in its reset state.
        """
        self.reset()
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        self._peaks = self._compute_rms(samples, sample_rate)
        self._total_duration = len(samples) / sample_rate
        log.info("Waveform full: %d peaks, %.1fs", len(self._peaks), self._total_duration)

    def reset(self):
        """Drop all peaks, the carried-over samples, and the duration."""
        self._peaks = np.empty(0, dtype=np.float32)
        self._carry = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0

    def _bucket_size(self, sample_rate):
        """Return the number of samples in one bucket at ``sample_rate``."""
        return int(sample_rate * self._bucket_ms / 1000)

    def _read_wav(self, wav_path):
        """Read a 16-bit PCM WAV file into a mono float32 numpy array.

        Returns:
            ``(samples, sample_rate)`` with samples scaled to [-1, 1), or
            ``(None, 0)`` when the file is empty, unreadable, or not 16-bit.
            Multi-channel audio is down-mixed by averaging the channels.
        """
        try:
            with wave.open(str(wav_path), "rb") as wf:
                n_frames = wf.getnframes()
                if n_frames == 0:
                    return None, 0
                sample_width = wf.getsampwidth()
                n_channels = wf.getnchannels()
                sample_rate = wf.getframerate()
                raw = wf.readframes(n_frames)
            if sample_width != 2 or sample_rate <= 0:
                # Interpreting other widths as int16 would silently produce
                # garbage peaks and a wrong duration.
                log.warning("Unsupported WAV format in %s (sample width %d bytes, %d Hz)",
                            wav_path, sample_width, sample_rate)
                return None, 0
            # WAV PCM is little-endian regardless of the host byte order.
            samples = np.frombuffer(raw, dtype="<i2").astype(np.float32) / 32768.0
            if n_channels > 1:
                usable = (len(samples) // n_channels) * n_channels
                samples = samples[:usable].reshape(-1, n_channels).mean(axis=1)
            return samples, sample_rate
        except (OSError, EOFError, ValueError, wave.Error) as e:
            log.warning("Failed to read WAV %s: %s", wav_path, e)
            return None, 0

    def _compute_rms(self, samples, sample_rate):
        """Compute RMS amplitude per bucket.

        Only whole buckets are used; a trailing partial bucket is ignored.
        Returns an empty float32 array when there is less than one bucket.
        """
        bucket_size = self._bucket_size(sample_rate)
        if bucket_size <= 0 or len(samples) < bucket_size:
            return np.empty(0, dtype=np.float32)

        # Trim to whole buckets
        n_buckets = len(samples) // bucket_size
        trimmed = samples[:n_buckets * bucket_size].reshape(n_buckets, bucket_size)
        return np.sqrt(np.mean(trimmed ** 2, axis=1)).astype(np.float32)
def extract_audio_chunk(input_path, output_path, start_time=0.0, duration=None):
    """Extract audio from recording as 16kHz mono WAV (optimal for Whisper).

    Uses input-level seeking (-ss before -i) for fast keyframe-based seek.

    Args:
        input_path: Recording to read from.
        output_path: WAV file to write; an existing file is overwritten.
        start_time: Seek offset in seconds, passed as the input ``ss`` option.
        duration: Length in seconds, passed as the input ``t`` option; when
            ``None`` no ``t`` option is set.

    Returns:
        ``(stdout, stderr)`` as decoded strings. An ffmpeg failure does not
        raise: it is logged and the captured output is still returned, so
        callers must check the output file themselves.
    """
    input_opts = {"ss": start_time}
    if duration is not None:
        input_opts["t"] = duration
    stream = ffmpeg.input(str(input_path), **input_opts)
    output = (
        ffmpeg.output(
            stream, str(output_path),
            acodec="pcm_s16le", ac=1, ar=16000,
            vn=None,
        )
        .overwrite_output()
        .global_args(*QUIET_ARGS)
    )
    log.info("extract_audio_chunk: %s", " ".join(output.compile()))
    try:
        stdout, stderr = output.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        stdout = e.stdout or b""
        stderr = e.stderr or b""
        # Report the last non-empty stderr line. Logged at warning level so
        # a failed extraction is visible without enabling debug logging.
        err_lines = stderr.decode("utf-8", errors="replace").strip().splitlines()
        log.warning("ffmpeg audio error: %s", err_lines[-1] if err_lines else "(no stderr output)")
    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
def start_audio_extractor(self, on_new_audio=None):
    """Periodically extract audio from the growing recording as WAV chunks.

    Same incremental pattern as scene detector: polls recording, extracts
    new time range, calls back with (wav_path, start_time, duration).

    The work runs on a daemon thread named "audio_extractor", registered in
    ``self._threads``, until "stop" appears in ``self._stop_flags``.

    Args:
        on_new_audio: callback(wav_path, start_time, duration), invoked on
            the extractor thread. Exceptions it raises are logged and do not
            stop the extractor.
    """
    self._on_new_audio = on_new_audio
    self.audio_dir.mkdir(parents=True, exist_ok=True)

    def _extract():
        processed_time = 0.0
        chunk_num = 0
        current_segment = None

        while "stop" not in self._stop_flags:
            time.sleep(AUDIO_EXTRACT_INTERVAL)

            seg = self.recording_path
            try:
                seg_size = seg.stat().st_size
            except OSError:
                # Not created yet, or removed between polls.
                continue

            if seg != current_segment:
                current_segment = seg
                # NOTE(review): offsets restart at 0 for each segment, so
                # start_time passed to the callback looks segment-relative;
                # confirm that is what consumers expect.
                processed_time = 0.0
                # chunk_num is deliberately NOT reset: reusing
                # chunk_0000.wav would overwrite a file that a consumer of
                # the previous segment may still be reading.
                log.info("Audio extractor: switched to %s", seg.name)

            # Too small to contain usable audio yet.
            if seg_size < 100_000:
                continue

            safe_duration = self._estimate_safe_duration()
            if safe_duration is None or safe_duration <= 0:
                continue

            process_to = safe_duration - AUDIO_SAFETY_MARGIN
            # Wait until at least one more second of audio is available.
            if process_to <= processed_time + 1.0:
                continue

            chunk_duration = process_to - processed_time
            wav_path = self.audio_dir / f"chunk_{chunk_num:04d}.wav"

            try:
                ff.extract_audio_chunk(
                    seg, wav_path,
                    start_time=processed_time,
                    duration=chunk_duration,
                )
            except Exception as e:
                log.error("Audio extraction failed: %s", e)
                continue

            try:
                wav_size = wav_path.stat().st_size
            except OSError:
                wav_size = 0

            if wav_size > 100:
                log.info("Audio chunk: %s (%.1fs → %.1fs)",
                         wav_path.name, processed_time, process_to)
                if self._on_new_audio:
                    try:
                        self._on_new_audio(wav_path, processed_time, chunk_duration)
                    except Exception:
                        # A failing consumer must not kill the extractor.
                        log.exception("on_new_audio callback failed for %s", wav_path.name)
                chunk_num += 1

            # Advance even when no usable chunk was produced, so a range
            # that cannot be extracted is not retried forever.
            processed_time = process_to

        log.info("Audio extractor stopped")

    t = Thread(target=_extract, daemon=True, name="audio_extractor")
    t.start()
    self._threads["audio_extractor"] = t
@dataclass
class TranscriptSegment:
    """One transcribed span of speech."""

    id: str  # "T0001"
    start: float  # seconds into recording
    end: float  # seconds into recording
    text: str  # transcribed text


class TranscriberEngine:
    """Incremental transcription via faster-whisper with GPU acceleration.

    Segments get sequential IDs (T0001, T0002, ...). The engine may be called
    from several worker threads at once: model use is serialised, and the
    segment list and ID counter are guarded by a separate short-lived lock so
    ``reset()`` and ``all_segments()`` never wait for a running transcription.
    """

    def __init__(self, model_size="small", device="cuda"):
        # Local import: the file's top-level import block is not changed by
        # this edit, and threading is only needed for these two locks.
        import threading

        self._model = None
        self._model_size = model_size
        self._device = device
        self._segments: list[TranscriptSegment] = []
        self._next_id = 1
        # Serialises lazy model loading and inference.
        self._infer_lock = threading.Lock()
        # Guards _segments / _next_id and the index file write.
        self._state_lock = threading.Lock()

    def _ensure_model(self):
        """Load the Whisper model on first use (no-op afterwards).

        Uses float16 on "cuda" and int8 on any other device.
        """
        if self._model is not None:
            return
        log.info("Loading whisper model: %s (device=%s)", self._model_size, self._device)
        from faster_whisper import WhisperModel
        self._model = WhisperModel(
            self._model_size,
            device=self._device,
            compute_type="float16" if self._device == "cuda" else "int8",
        )
        log.info("Whisper model loaded")

    def transcribe_chunk(self, wav_path, time_offset=0.0) -> list[TranscriptSegment]:
        """Transcribe a WAV chunk. Returns new segments with absolute timestamps.

        Args:
            wav_path: Path of the WAV chunk.
            time_offset: Seconds added to every segment's start/end so the
                timestamps are relative to the recording, not the chunk.

        Returns:
            The newly created segments (also appended to the engine's list).
            Empty when nothing was recognised or when loading/transcribing
            failed; failures are logged, not raised.
        """
        with self._infer_lock:
            try:
                self._ensure_model()
                segments_iter, _info = self._model.transcribe(
                    str(wav_path),
                    beam_size=5,
                    vad_filter=True,
                )
                # The returned iterable is lazy: decoding runs while it is
                # consumed, so it has to be drained inside the try block.
                raw = [(seg.start, seg.end, seg.text.strip()) for seg in segments_iter]
            except Exception as e:
                log.error("Whisper transcription failed: %s", e)
                return []

        new_segments = []
        with self._state_lock:
            for start, end, text in raw:
                if not text:
                    continue
                entry = TranscriptSegment(
                    id=f"T{self._next_id:04d}",
                    start=time_offset + start,
                    end=time_offset + end,
                    text=text,
                )
                self._next_id += 1
                self._segments.append(entry)
                new_segments.append(entry)
        return new_segments

    def all_segments(self) -> list[TranscriptSegment]:
        """Return a snapshot copy of every segment recorded so far."""
        with self._state_lock:
            return list(self._segments)

    def save_index(self, path: Path):
        """Write all segments to ``path`` as a JSON list.

        The data is written to a sibling ``<name>.tmp`` file and then moved
        over ``path``, so a concurrent reader never sees a half-written index.
        """
        path = Path(path)
        tmp_path = path.with_name(path.name + ".tmp")
        with self._state_lock:
            data = [asdict(s) for s in self._segments]
            tmp_path.write_text(json.dumps(data, indent=2), encoding="utf-8")
            tmp_path.replace(path)

    def load_index(self, path: Path):
        """Replace the engine's segments with those stored in ``path``.

        On an unreadable or malformed index the current state is left
        untouched and a warning is logged. After a successful load, new IDs
        continue after the highest loaded ID.
        """
        try:
            data = json.loads(Path(path).read_text(encoding="utf-8"))
            segments = [TranscriptSegment(**e) for e in data]
        except (OSError, ValueError, TypeError) as e:
            log.warning("Failed to load transcript index: %s", e)
            return
        last_num = max((self._id_number(s.id) for s in segments), default=0)
        with self._state_lock:
            self._segments = segments
            self._next_id = max(last_num, 0) + 1
        log.info("Loaded %d transcript segments", len(segments))

    def reset(self):
        """Forget all segments and restart ID numbering at T0001."""
        with self._state_lock:
            self._segments.clear()
            self._next_id = 1

    @staticmethod
    def _id_number(segment_id) -> int:
        """Return the numeric part of an ID like "T0042", or 0 if malformed."""
        try:
            return int(str(segment_id).lstrip("T"))
        except ValueError:
            return 0
class WaveformWidget(Gtk.Box):
    """Waveform display synced to Timeline state.

    Shows a heading label above a drawing area. The drawing area renders the
    peak data as vertical bars around a centre line, with scene markers and a
    playhead taken from ``timeline.state``.
    """

    def __init__(self, timeline, **kwargs):
        super().__init__(orientation=Gtk.Orientation.VERTICAL, **kwargs)
        self._timeline = timeline
        self._peaks = None
        self._bucket_duration = 0.05

        label = Gtk.Label(label="Waveform")
        label.add_css_class("heading")
        label.set_margin_top(4)
        label.set_margin_bottom(4)
        self.append(label)

        self._area = Gtk.DrawingArea()
        self._area.set_content_height(250)
        self._area.set_hexpand(True)
        self._area.set_vexpand(True)
        self._area.set_draw_func(self._draw)
        self.append(self._area)

        # Redraw whenever the timeline reports a change.
        timeline.connect("changed", self._on_timeline_changed)

    def set_peaks(self, peaks, bucket_duration):
        """Update peak data. Call from GLib.idle_add.

        Args:
            peaks: Array of peak amplitudes, or ``None`` to clear the display.
            bucket_duration: Seconds of audio represented by each peak.
        """
        self._peaks = peaks
        self._bucket_duration = bucket_duration
        self._area.queue_draw()

    def _on_timeline_changed(self, timeline):
        """Schedule a redraw when the timeline changes."""
        self._area.queue_draw()

    def _draw(self, area, cr, width, height):
        """Draw function for the drawing area: background, peaks, markers, playhead."""
        # Background
        cr.set_source_rgb(0.1, 0.1, 0.12)
        cr.rectangle(0, 0, width, height)
        cr.fill()

        state = self._timeline.state
        duration = state.duration
        mid_y = height / 2

        # Center line
        cr.set_source_rgba(0.3, 0.3, 0.35, 1.0)
        cr.set_line_width(1)
        cr.move_to(0, mid_y)
        cr.line_to(width, mid_y)
        cr.stroke()

        # Draw peaks. A non-positive bucket duration cannot be mapped to
        # pixel columns (it would divide by zero), so it draws no bars.
        peaks = self._peaks
        bucket = self._bucket_duration
        if peaks is not None and len(peaks) > 0 and duration > 0 and bucket > 0:
            n_peaks = len(peaks)
            # Normalise against the loudest peak; silence keeps a divisor of 1.
            loudest = float(np.max(peaks))
            max_peak = loudest if loudest > 0 else 1.0

            for x in range(width):
                # Time at this pixel column -> index of the peak covering it.
                idx = int((x / width) * duration / bucket)
                if idx >= n_peaks:
                    # idx never decreases with x, so no later column has data.
                    break
                val = float(peaks[idx]) / max_peak
                bar_h = val * (height * 0.45)  # 90% of half-height
                # Green gradient based on amplitude
                cr.set_source_rgba(0.2, 0.6 + val * 0.4, 0.3, 0.85)
                cr.rectangle(x, mid_y - bar_h, 1, bar_h * 2)
                cr.fill()

        if duration > 0:
            # Scene markers
            cr.set_source_rgba(1.0, 1.0, 0.3, 0.3)
            cr.set_line_width(1)
            for marker in state.scene_markers:
                mx = (marker / duration) * width
                cr.move_to(mx, 0)
                cr.line_to(mx, height)
                cr.stroke()

            # Playhead
            px = (state.cursor / duration) * width
            cr.set_source_rgba(1.0, 0.3, 0.3, 0.9)
            cr.set_line_width(2)
            cr.move_to(px, 0)
            cr.line_to(px, height)
            cr.stroke()
def _on_new_audio(self, wav_path, start_time, duration):
    """Called from audio extractor thread with new WAV chunk.

    Updates the waveform from the chunk, then transcribes it on a separate
    daemon thread. All widget updates are handed to the GTK main loop via
    ``GLib.idle_add``.

    Args:
        wav_path: Path of the extracted WAV chunk.
        start_time: Offset in seconds of the chunk's first sample.
        duration: Chunk length in seconds (not used here).
    """
    # Compute waveform peaks (fast, ~1ms)
    self._waveform_engine.append_chunk(wav_path, start_time)
    peaks = self._waveform_engine.peaks
    bucket_dur = self._waveform_engine.bucket_duration
    # Hand the widget a copy so later appends cannot change what it draws.
    GLib.idle_add(self._waveform_widget.set_peaks, peaks.copy(), bucket_dur)

    # Transcribe in separate thread (GPU-bound, ~1-2s per chunk)
    def _transcribe():
        try:
            new_segs = self._transcriber.transcribe_chunk(wav_path, time_offset=start_time)
            # Read the attribute once: it is checked for truthiness, so it
            # may be cleared by another thread between check and use.
            stream_mgr = self._stream_mgr
            if stream_mgr:
                self._transcriber.save_index(
                    stream_mgr.transcript_dir / "index.json"
                )
        except Exception:
            log.exception("Transcription of %s failed", wav_path)
            return
        if new_segs:
            GLib.idle_add(self._append_transcript_segments, new_segs)

    Thread(target=_transcribe, daemon=True, name="transcriber").start()


def _append_transcript_segments(self, segments):
    """Append transcription segments to the transcript panel.

    Each segment becomes one line, ``[MM:SS-MM:SS] <id> <text>``, and the
    view is scrolled to the end afterwards.
    """
    buf = self._transcript_view.get_buffer()
    lines = []
    for seg in segments:
        m1, s1 = divmod(int(seg.start), 60)
        m2, s2 = divmod(int(seg.end), 60)
        lines.append(f"[{m1:02d}:{s1:02d}-{m2:02d}:{s2:02d}] {seg.id} {seg.text}\n")
    # One insert for the whole batch instead of one per segment.
    buf.insert(buf.get_end_iter(), "".join(lines))
    self._transcript_view.scroll_to_iter(buf.get_end_iter(), 0, False, 0, 0)
a/pyproject.toml b/pyproject.toml index b9d4d96..58d5857 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "claude-agent-sdk", "openai", "pygments", + "faster-whisper", ] [tool.setuptools.packages.find]