Add audio extraction, waveform display, and incremental transcription

Commit d61e2a5492 (parent 0b5575f3b3), 2026-04-02 22:57:21 -03:00
13 changed files with 556 additions and 11 deletions

cht/agent/base.py

@@ -18,12 +18,22 @@ class FrameRef:
    timestamp: float  # seconds into recording
@dataclass
class TranscriptRef:
    id: str       # "T0001"
    start: float  # seconds into recording
    end: float    # seconds into recording
    text: str
@dataclass
class SessionContext:
    session_dir: Path
    frames: list[FrameRef]  # all captured frames so far
    duration: float         # current recording duration (seconds)
-    mentioned_frames: list[FrameRef] = field(default_factory=list)  # @-referenced in message
+    mentioned_frames: list[FrameRef] = field(default_factory=list)
    transcript_segments: list[TranscriptRef] = field(default_factory=list)
    mentioned_transcripts: list[TranscriptRef] = field(default_factory=list)

class AgentProvider(ABC):
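For orientation, a minimal sketch of constructing the extended context (all values hypothetical; only the two transcript fields are new in this commit):

from pathlib import Path

seg = TranscriptRef(id="T0001", start=12.4, end=15.9, text="let's look at the traceback")
ctx = SessionContext(
    session_dir=Path("sessions/2026-04-02"),
    frames=[],
    duration=73.0,
    transcript_segments=[seg],
    mentioned_transcripts=[seg],  # e.g. the user typed "@T1" in their message
)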


@@ -47,6 +47,21 @@ def _build_prompt(message: str, context: SessionContext) -> str:
        fm, fs = divmod(int(f.timestamp), 60)
        lines.append(f"  {f.id} at {fm:02d}:{fs:02d} {f.path}")
    # Transcript
    if context.transcript_segments:
        lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
        for t in context.transcript_segments:
            tm1, ts1 = divmod(int(t.start), 60)
            tm2, ts2 = divmod(int(t.end), 60)
            lines.append(f"  {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
    if context.mentioned_transcripts:
        lines.append("\nTranscript segments referenced in this message:")
        for t in context.mentioned_transcripts:
            tm1, ts1 = divmod(int(t.start), 60)
            tm2, ts2 = divmod(int(t.end), 60)
            lines.append(f"  {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
lines.append(f"\nUser message: {message}") lines.append(f"\nUser message: {message}")
return "\n".join(lines) return "\n".join(lines)


@@ -95,10 +95,17 @@ class OpenAICompatProvider(AgentProvider):
        # Build context header
        m, s = divmod(int(context.duration), 60)
-        ctx_text = (
-            f"Recording duration: {m:02d}:{s:02d}\n"
-            f"Total frames: {len(context.frames)}\n"
-        )
+        ctx_lines = [
+            f"Recording duration: {m:02d}:{s:02d}",
+            f"Total frames: {len(context.frames)}",
+        ]
        if context.transcript_segments:
            ctx_lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
            for t in context.transcript_segments:
                tm1, ts1 = divmod(int(t.start), 60)
                tm2, ts2 = divmod(int(t.end), 60)
                ctx_lines.append(f"  {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
        ctx_text = "\n".join(ctx_lines) + "\n"

        frames_to_send = context.mentioned_frames

cht/agent/runner.py

@@ -15,7 +15,7 @@ from pathlib import Path
from threading import Thread
from typing import Callable

-from cht.agent.base import AgentProvider, FrameRef, SessionContext
+from cht.agent.base import AgentProvider, FrameRef, TranscriptRef, SessionContext

log = logging.getLogger(__name__)
@@ -98,6 +98,33 @@ def _load_frames(frames_dir: Path) -> list[FrameRef]:
        return []
def _load_transcript(transcript_dir: Path) -> list[TranscriptRef]:
    index_path = transcript_dir / "index.json"
    if not index_path.exists():
        return []
    try:
        entries = json.loads(index_path.read_text())
        return [TranscriptRef(**e) for e in entries]
    except Exception as e:
        log.warning("Could not load transcript index: %s", e)
        return []
def _parse_transcript_mentions(message: str, segments: list[TranscriptRef]) -> list[TranscriptRef]:
    """Extract @T references from message. Accepts @T0001, @t1, @T1."""
    mentioned = []
    seen = set()
    for match in re.finditer(r"@[Tt](\d+)", message):
        num = int(match.group(1))
        tid = f"T{num:04d}"
        if tid not in seen:
            seg = next((s for s in segments if s.id == tid), None)
            if seg:
                mentioned.append(seg)
            seen.add(tid)
    return mentioned
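All accepted spellings normalize to the same zero-padded id, and duplicates collapse; a quick check against a hypothetical segment list:

segs = [TranscriptRef(id="T0003", start=30.0, end=33.5, text="run the tests")]
assert _parse_transcript_mentions("compare @t3 with @T0003", segs) == [segs[0]]  # one hit, deduped
assert _parse_transcript_mentions("@T0099 has no match", segs) == []             # unknown ids are dropped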
class AgentRunner:
    """Runs agent queries in a background thread, streams chunks to a callback."""
@@ -152,12 +179,16 @@ class AgentRunner:
        try:
            provider = self._get_provider()
            frames = _load_frames(stream_mgr.frames_dir)
-            mentioned = _parse_mentions(message, frames)
+            mentioned_frames = _parse_mentions(message, frames)
            transcript = _load_transcript(stream_mgr.transcript_dir)
            mentioned_transcripts = _parse_transcript_mentions(message, transcript)
            context = SessionContext(
                session_dir=stream_mgr.session_dir,
                frames=frames,
                duration=tracker.duration if tracker else 0.0,
-                mentioned_frames=mentioned,
+                mentioned_frames=mentioned_frames,
                transcript_segments=transcript,
                mentioned_transcripts=mentioned_transcripts,
            )
            for chunk in provider.stream(message, context):
                on_chunk(chunk)

cht/audio/__init__.py (new, empty file)

cht/audio/waveform.py (new file, 90 lines)

@@ -0,0 +1,90 @@
"""
Waveform peak computation from WAV files.
Reads 16kHz mono PCM WAV files (as produced by ffmpeg extract_audio_chunk),
computes RMS amplitude per time bucket, and stores peaks as a numpy array
that grows incrementally during live recording.
"""
import logging
import wave
import numpy as np
log = logging.getLogger(__name__)
class WaveformEngine:
    """Computes and accumulates waveform peak data from WAV chunks."""

    def __init__(self, bucket_ms=50):
        self._bucket_ms = bucket_ms
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0

    @property
    def peaks(self):
        return self._peaks

    @property
    def bucket_duration(self):
        return self._bucket_ms / 1000.0

    @property
    def total_duration(self):
        return self._total_duration

    def append_chunk(self, wav_path, start_time):
        """Read a WAV chunk and append its peaks to the internal array."""
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        new_peaks = self._compute_rms(samples, sample_rate)
        if len(new_peaks) > 0:
            self._peaks = np.concatenate([self._peaks, new_peaks])
        chunk_duration = len(samples) / sample_rate
        self._total_duration = start_time + chunk_duration
        log.info("Waveform: +%d peaks (total %d, %.1fs)",
                 len(new_peaks), len(self._peaks), self._total_duration)
    def compute_full(self, wav_path):
        """Compute all peaks from a complete WAV file (for loaded sessions)."""
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        self._peaks = self._compute_rms(samples, sample_rate)
        self._total_duration = len(samples) / sample_rate
        log.info("Waveform full: %d peaks, %.1fs", len(self._peaks), self._total_duration)

    def reset(self):
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0

    def _read_wav(self, wav_path):
        """Read a 16-bit PCM WAV file into a float32 numpy array."""
        try:
            with wave.open(str(wav_path), "rb") as wf:
                n_frames = wf.getnframes()
                if n_frames == 0:
                    return None, 0
                sample_rate = wf.getframerate()
                raw = wf.readframes(n_frames)
            samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
            return samples, sample_rate
        except Exception as e:
            log.warning("Failed to read WAV %s: %s", wav_path, e)
            return None, 0
    def _compute_rms(self, samples, sample_rate):
        """Compute RMS amplitude per bucket."""
        bucket_size = int(sample_rate * self._bucket_ms / 1000)
        if bucket_size <= 0 or len(samples) < bucket_size:
            return np.empty(0, dtype=np.float32)
        # Trim to whole buckets
        n_buckets = len(samples) // bucket_size
        trimmed = samples[:n_buckets * bucket_size].reshape(n_buckets, bucket_size)
        rms = np.sqrt(np.mean(trimmed ** 2, axis=1)).astype(np.float32)
        return rms
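A minimal usage sketch (paths and chunk lengths hypothetical): feed chunks as the extractor produces them, then hand the growing peak array to the UI.

engine = WaveformEngine(bucket_ms=50)
engine.append_chunk("audio/chunk_0000.wav", start_time=0.0)  # a 4s chunk -> 80 peaks
engine.append_chunk("audio/chunk_0001.wav", start_time=4.0)  # next chunk abuts at 4.0s
print(len(engine.peaks), engine.bucket_duration)             # e.g. 160 buckets of 0.05s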

cht/config.py

@@ -19,3 +19,12 @@ SCENE_THRESHOLD = 0.10 # 0-1, lower = more sensitive; 0.1 catches slide/window
# Segment recording
SEGMENT_DURATION = 60  # seconds per .ts segment
# Audio extraction
AUDIO_EXTRACT_INTERVAL = 3  # seconds between extraction cycles
AUDIO_SAFETY_MARGIN = 2     # seconds safety margin (matches scene detector)
WAVEFORM_BUCKET_MS = 50     # milliseconds per waveform peak bucket

# Transcription
WHISPER_MODEL = "small"     # "small" for speed, "medium" for accuracy
WHISPER_DEVICE = "cuda"     # "cuda" or "cpu"

cht/stream/ffmpeg.py

@@ -122,6 +122,35 @@ def extract_scene_frames(input_path, output_dir, scene_threshold=0.10,
    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
def extract_audio_chunk(input_path, output_path, start_time=0.0, duration=None):
    """Extract audio from recording as 16kHz mono WAV (optimal for Whisper).

    Uses input-level seeking (-ss before -i) for fast keyframe-based seek.
    Returns (stdout, stderr) as decoded strings.
    """
    kwargs = {"ss": start_time}
    if duration is not None:
        kwargs["t"] = duration
    stream = ffmpeg.input(str(input_path), **kwargs)
    output = (
        ffmpeg.output(
            stream, str(output_path),
            acodec="pcm_s16le", ac=1, ar=16000,
            vn=None,
        )
        .overwrite_output()
        .global_args(*QUIET_ARGS)
    )
    log.info("extract_audio_chunk: %s", " ".join(output.compile()))
    try:
        stdout, stderr = output.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        stdout = e.stdout or b""
        stderr = e.stderr or b""
        log.debug("ffmpeg audio error: %s",
                  stderr.decode("utf-8", errors="replace").strip().split("\n")[-1])
    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
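For reference, a call like the one below compiles to roughly this command line (a sketch: paths are hypothetical, and exact argument order plus QUIET_ARGS depend on the ffmpeg-python build):

# extract_audio_chunk("stream/rec_000.ts", "audio/chunk_0000.wav", start_time=12.0, duration=6.0)
# compiles to approximately:
#   ffmpeg -ss 12.0 -t 6.0 -i stream/rec_000.ts -acodec pcm_s16le -ac 1 -ar 16000 -vn audio/chunk_0000.wav -y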
def extract_frame_at(input_path, output_path, timestamp):
    """Extract a single frame at the given timestamp."""
    output = (

cht/stream/manager.py

@@ -20,6 +20,8 @@ from cht.config import (
    RELAY_PORT,
    SCENE_THRESHOLD,
    SESSIONS_DIR,
    AUDIO_EXTRACT_INTERVAL,
    AUDIO_SAFETY_MARGIN,
)

from cht.stream import ffmpeg as ff
@@ -46,6 +48,7 @@ class StreamManager:
        self.stream_dir = self.session_dir / "stream"
        self.frames_dir = self.session_dir / "frames"
        self.transcript_dir = self.session_dir / "transcript"
        self.audio_dir = self.session_dir / "audio"
        self.agent_dir = self.session_dir / "agent"

        self._procs = {}
@@ -103,7 +106,7 @@ class StreamManager:
        return total

    def setup_dirs(self):
-        for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.agent_dir):
+        for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.audio_dir, self.agent_dir):
            d.mkdir(parents=True, exist_ok=True)

    @property
@@ -349,6 +352,77 @@ class StreamManager:
        Thread(target=_capture, daemon=True, name="capture_now").start()
    # -- Audio Extraction --

    def start_audio_extractor(self, on_new_audio=None):
        """Periodically extract audio from the growing recording as WAV chunks.

        Same incremental pattern as the scene detector: poll the recording,
        extract the newly safe time range, and call back with each chunk.

        Args:
            on_new_audio: callback(wav_path, start_time, duration)
        """
        self._on_new_audio = on_new_audio
        self.audio_dir.mkdir(parents=True, exist_ok=True)

        def _extract():
            processed_time = 0.0
            chunk_num = 0
            current_segment = None
            while "stop" not in self._stop_flags:
                time.sleep(AUDIO_EXTRACT_INTERVAL)
                seg = self.recording_path
                if not seg.exists():
                    continue
                if seg != current_segment:
                    current_segment = seg
                    processed_time = 0.0
                    chunk_num = 0
                    log.info("Audio extractor: switched to %s", seg.name)
                if seg.stat().st_size < 100_000:
                    continue
                safe_duration = self._estimate_safe_duration()
                if safe_duration is None or safe_duration <= 0:
                    continue
                process_to = safe_duration - AUDIO_SAFETY_MARGIN
                if process_to <= processed_time + 1.0:
                    continue
                chunk_duration = process_to - processed_time
                wav_path = self.audio_dir / f"chunk_{chunk_num:04d}.wav"
                try:
                    ff.extract_audio_chunk(
                        seg, wav_path,
                        start_time=processed_time,
                        duration=chunk_duration,
                    )
                except Exception as e:
                    log.error("Audio extraction failed: %s", e)
                    continue
                if wav_path.exists() and wav_path.stat().st_size > 100:
                    log.info("Audio chunk: %s (%.1fs → %.1fs)",
                             wav_path.name, processed_time, process_to)
                    if self._on_new_audio:
                        self._on_new_audio(wav_path, processed_time, chunk_duration)
                    chunk_num += 1
                    processed_time = process_to
            log.info("Audio extractor stopped")

        t = Thread(target=_extract, daemon=True, name="audio_extractor")
        t.start()
        self._threads["audio_extractor"] = t
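To make the pacing concrete, a hypothetical trace with AUDIO_EXTRACT_INTERVAL=3 and AUDIO_SAFETY_MARGIN=2, assuming _estimate_safe_duration tracks wall-clock recording time:

# poll at t=3s: safe_duration=3, process_to=1, 1 <= 0+1.0  -> skip (not enough new audio)
# poll at t=6s: safe_duration=6, process_to=4              -> chunk_0000.wav covers 0.0-4.0s
# poll at t=9s: safe_duration=9, process_to=7              -> chunk_0001.wav covers 4.0-7.0s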
    # -- Lifecycle --

    def stop_all(self):

cht/transcriber/engine.py (new file, 98 lines)

@@ -0,0 +1,98 @@
"""
Transcription engine using faster-whisper.
Processes WAV chunks incrementally, assigns sequential IDs (T0001, T0002, ...),
and persists to transcript/index.json in the session directory.
"""
import json
import logging
from dataclasses import dataclass, asdict
from pathlib import Path
log = logging.getLogger(__name__)
@dataclass
class TranscriptSegment:
    id: str       # "T0001"
    start: float  # seconds into recording
    end: float    # seconds into recording
    text: str     # transcribed text
class TranscriberEngine:
    """Incremental transcription via faster-whisper with GPU acceleration."""

    def __init__(self, model_size="small", device="cuda"):
        self._model = None
        self._model_size = model_size
        self._device = device
        self._segments: list[TranscriptSegment] = []
        self._next_id = 1

    def _ensure_model(self):
        if self._model is not None:
            return
        log.info("Loading whisper model: %s (device=%s)", self._model_size, self._device)
        from faster_whisper import WhisperModel
        self._model = WhisperModel(
            self._model_size,
            device=self._device,
            compute_type="float16" if self._device == "cuda" else "int8",
        )
        log.info("Whisper model loaded")

    def transcribe_chunk(self, wav_path, time_offset=0.0) -> list[TranscriptSegment]:
        """Transcribe a WAV chunk. Returns new segments with absolute timestamps."""
        self._ensure_model()
        try:
            segments_iter, _info = self._model.transcribe(
                str(wav_path),
                beam_size=5,
                vad_filter=True,
            )
        except Exception as e:
            log.error("Whisper transcription failed: %s", e)
            return []
        new_segments = []
        for seg in segments_iter:
            text = seg.text.strip()
            if not text:
                continue
            tid = f"T{self._next_id:04d}"
            self._next_id += 1
            entry = TranscriptSegment(
                id=tid,
                start=time_offset + seg.start,
                end=time_offset + seg.end,
                text=text,
            )
            self._segments.append(entry)
            new_segments.append(entry)
        return new_segments
    def all_segments(self) -> list[TranscriptSegment]:
        return list(self._segments)

    def save_index(self, path: Path):
        data = [asdict(s) for s in self._segments]
        path.write_text(json.dumps(data, indent=2))

    def load_index(self, path: Path):
        try:
            data = json.loads(path.read_text())
        except Exception as e:
            log.warning("Failed to load transcript index: %s", e)
            return
        self._segments = [TranscriptSegment(**e) for e in data]
        if self._segments:
            last_num = max(int(s.id.lstrip("T")) for s in self._segments)
            self._next_id = last_num + 1
        log.info("Loaded %d transcript segments", len(self._segments))

    def reset(self):
        self._segments.clear()
        self._next_id = 1
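A usage sketch, including the index.json shape that save_index persists (wav path and text hypothetical):

engine = TranscriberEngine(model_size="small", device="cuda")
new = engine.transcribe_chunk("audio/chunk_0000.wav", time_offset=42.0)
engine.save_index(Path("transcript/index.json"))
# index.json holds a list of plain dicts, e.g.:
#   [{"id": "T0001", "start": 43.8, "end": 46.2, "text": "okay, the recording is live"}]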

cht/ui/waveform.py (new file, 107 lines)

@@ -0,0 +1,107 @@
"""
WaveformWidget: GTK4 DrawingArea that renders waveform peaks with a playhead.
Driven by Timeline "changed" signal — redraws when cursor or duration changes.
Peak data is set externally via set_peaks() from GLib.idle_add.
"""
import logging
import math
import numpy as np
import gi
gi.require_version("Gtk", "4.0")
from gi.repository import Gtk, GLib
log = logging.getLogger(__name__)
class WaveformWidget(Gtk.Box):
    """Waveform display synced to Timeline state."""

    def __init__(self, timeline, **kwargs):
        super().__init__(orientation=Gtk.Orientation.VERTICAL, **kwargs)
        self._timeline = timeline
        self._peaks = None
        self._bucket_duration = 0.05

        label = Gtk.Label(label="Waveform")
        label.add_css_class("heading")
        label.set_margin_top(4)
        label.set_margin_bottom(4)
        self.append(label)

        self._area = Gtk.DrawingArea()
        self._area.set_content_height(250)
        self._area.set_hexpand(True)
        self._area.set_vexpand(True)
        self._area.set_draw_func(self._draw)
        self.append(self._area)

        timeline.connect("changed", self._on_timeline_changed)

    def set_peaks(self, peaks, bucket_duration):
        """Update peak data. Call from GLib.idle_add."""
        self._peaks = peaks
        self._bucket_duration = bucket_duration
        self._area.queue_draw()

    def _on_timeline_changed(self, timeline):
        self._area.queue_draw()
    def _draw(self, area, cr, width, height):
        # Background
        cr.set_source_rgb(0.1, 0.1, 0.12)
        cr.rectangle(0, 0, width, height)
        cr.fill()

        state = self._timeline.state
        duration = state.duration
        mid_y = height / 2

        # Center line
        cr.set_source_rgba(0.3, 0.3, 0.35, 1.0)
        cr.set_line_width(1)
        cr.move_to(0, mid_y)
        cr.line_to(width, mid_y)
        cr.stroke()

        # Draw peaks, mapping each pixel column to its peak bucket
        if self._peaks is not None and len(self._peaks) > 0 and duration > 0:
            n_peaks = len(self._peaks)
            max_peak = np.max(self._peaks) if np.max(self._peaks) > 0 else 1.0
            for x in range(width):
                # Time at this pixel, then the corresponding peak index
                t = (x / width) * duration
                idx = int(t / self._bucket_duration)
                if 0 <= idx < n_peaks:
                    val = self._peaks[idx] / max_peak
                    bar_h = val * (height * 0.45)  # 90% of half-height
                    # Green gradient based on amplitude
                    cr.set_source_rgba(0.2, 0.6 + val * 0.4, 0.3, 0.85)
                    cr.rectangle(x, mid_y - bar_h, 1, bar_h * 2)
                    cr.fill()

        # Scene markers
        if duration > 0:
            cr.set_source_rgba(1.0, 1.0, 0.3, 0.3)
            cr.set_line_width(1)
            for marker in state.scene_markers:
                mx = (marker / duration) * width
                cr.move_to(mx, 0)
                cr.line_to(mx, height)
                cr.stroke()

        # Playhead
        if duration > 0:
            px = (state.cursor / duration) * width
            cr.set_source_rgba(1.0, 0.3, 0.3, 0.9)
            cr.set_line_width(2)
            cr.move_to(px, 0)
            cr.line_to(px, height)
            cr.stroke()


@@ -10,9 +10,14 @@ gi.require_version("Adw", "1")
gi.require_version("GdkPixbuf", "2.0") gi.require_version("GdkPixbuf", "2.0")
from gi.repository import Gtk, Gdk, Adw, GLib, Pango, GdkPixbuf from gi.repository import Gtk, Gdk, Adw, GLib, Pango, GdkPixbuf
from threading import Thread
from cht.config import APP_NAME, SCENE_THRESHOLD from cht.config import APP_NAME, SCENE_THRESHOLD
from cht.ui.timeline import Timeline, TimelineControls from cht.ui.timeline import Timeline, TimelineControls
from cht.ui.monitor import MonitorWidget from cht.ui.monitor import MonitorWidget
from cht.ui.waveform import WaveformWidget
from cht.audio.waveform import WaveformEngine
from cht.transcriber.engine import TranscriberEngine
from cht.stream.manager import StreamManager, list_sessions from cht.stream.manager import StreamManager, list_sessions
from cht.stream.tracker import RecordingTracker from cht.stream.tracker import RecordingTracker
from cht.agent.runner import AgentRunner, ACTIONS, check_claude_cli from cht.agent.runner import AgentRunner, ACTIONS, check_claude_cli
@@ -37,6 +42,8 @@ class ChtWindow(Adw.ApplicationWindow):
        # Timeline is the central state machine
        self._timeline = Timeline()
        self._agent = AgentRunner()
        self._waveform_engine = WaveformEngine(bucket_ms=WAVEFORM_BUCKET_MS)
        self._transcriber = TranscriberEngine(WHISPER_MODEL, WHISPER_DEVICE)

        # Main layout
        self._main_paned = Gtk.Paned(orientation=Gtk.Orientation.HORIZONTAL)
@@ -165,6 +172,34 @@ class ChtWindow(Adw.ApplicationWindow):
        # Load existing frames into the strip
        self._load_existing_frames()
        # Load existing transcript
        transcript_index = self._stream_mgr.transcript_dir / "index.json"
        if transcript_index.exists():
            self._transcriber.load_index(transcript_index)
            segs = self._transcriber.all_segments()
            if segs:
                self._append_transcript_segments(segs)
                self._append_agent_output(f"  Loaded {len(segs)} transcript segments.\n")

        # Compute waveform from existing recordings (background thread)
        if segments:
            from cht.stream import ffmpeg as ff

            def _compute_waveform():
                audio_dir = self._stream_mgr.audio_dir
                audio_dir.mkdir(parents=True, exist_ok=True)
                full_wav = audio_dir / "full.wav"
                try:
                    # First segment only; multi-segment sessions would need concatenation
                    ff.extract_audio_chunk(segments[0], full_wav)
                    self._waveform_engine.compute_full(full_wav)
                    peaks = self._waveform_engine.peaks
                    bucket_dur = self._waveform_engine.bucket_duration
                    GLib.idle_add(self._waveform_widget.set_peaks, peaks.copy(), bucket_dur)
                except Exception as e:
                    log.error("Waveform computation failed: %s", e)

            Thread(target=_compute_waveform, daemon=True, name="waveform_load").start()
        # Set up agent auth/model if not already done
        self._populate_model_dropdown()
@@ -197,6 +232,9 @@ class ChtWindow(Adw.ApplicationWindow):
        # Start scene detection
        self._stream_mgr.start_scene_detector(on_new_frames=self._on_new_scene_frames)

        # Start audio extraction (waveform + transcription)
        self._stream_mgr.start_audio_extractor(on_new_audio=self._on_new_audio)

        # Start polling for frame thumbnails
        GLib.timeout_add(1000, self._poll_frames)
@@ -237,6 +275,26 @@ class ChtWindow(Adw.ApplicationWindow):
        for f in frames:
            GLib.idle_add(self._timeline.add_scene_marker, f["timestamp"])
    def _on_new_audio(self, wav_path, start_time, duration):
        """Called from audio extractor thread with new WAV chunk."""
        # Compute waveform peaks (fast, ~1ms)
        self._waveform_engine.append_chunk(wav_path, start_time)
        peaks = self._waveform_engine.peaks
        bucket_dur = self._waveform_engine.bucket_duration
        GLib.idle_add(self._waveform_widget.set_peaks, peaks.copy(), bucket_dur)

        # Transcribe in a separate thread (GPU-bound, ~1-2s per chunk)
        def _transcribe():
            new_segs = self._transcriber.transcribe_chunk(wav_path, time_offset=start_time)
            if self._stream_mgr:
                self._transcriber.save_index(
                    self._stream_mgr.transcript_dir / "index.json"
                )
            if new_segs:
                GLib.idle_add(self._append_transcript_segments, new_segs)

        Thread(target=_transcribe, daemon=True, name="transcriber").start()
    def _check_recorder(self):
        """Watchdog: restart recorder if it died (sender disconnect, etc)."""
        if not self._streaming or not self._stream_mgr:
@@ -257,6 +315,10 @@ class ChtWindow(Adw.ApplicationWindow):
log.info("Stopping stream...") log.info("Stopping stream...")
self._timeline.reset() self._timeline.reset()
self._monitor.stop() self._monitor.stop()
self._waveform_engine.reset()
self._waveform_widget.set_peaks(None, 0.05)
self._transcriber.reset()
self._transcript_view.get_buffer().set_text("")
if self._tracker: if self._tracker:
self._tracker.stop() self._tracker.stop()
self._tracker = None self._tracker = None
@@ -298,8 +360,10 @@ class ChtWindow(Adw.ApplicationWindow):
        stream_frame.set_child(self._monitor)
        top_paned.set_start_child(stream_frame)

-        self._waveform_area = self._build_placeholder("Waveform", height=250, width=200)
-        top_paned.set_end_child(self._waveform_area)
+        self._waveform_widget = WaveformWidget(self._timeline)
+        waveform_frame = Gtk.Frame()
+        waveform_frame.set_child(self._waveform_widget)
+        top_paned.set_end_child(waveform_frame)
        top_paned.set_position(650)

        right_box.append(top_paned)
@@ -819,6 +883,16 @@ class ChtWindow(Adw.ApplicationWindow):
        # Auto-scroll to bottom
        self._agent_output_view.scroll_to_iter(buf.get_end_iter(), 0, False, 0, 0)
    def _append_transcript_segments(self, segments):
        """Append transcription segments to the transcript panel."""
        buf = self._transcript_view.get_buffer()
        for seg in segments:
            m1, s1 = divmod(int(seg.start), 60)
            m2, s2 = divmod(int(seg.end), 60)
            line = f"[{m1:02d}:{s1:02d}-{m2:02d}:{s2:02d}] {seg.id} {seg.text}\n"
            buf.insert(buf.get_end_iter(), line)
        self._transcript_view.scroll_to_iter(buf.get_end_iter(), 0, False, 0, 0)
    # -- Frame thumbnails --

    def _load_existing_frames(self):

pyproject.toml

@@ -15,6 +15,7 @@ dependencies = [
"claude-agent-sdk", "claude-agent-sdk",
"openai", "openai",
"pygments", "pygments",
"faster-whisper",
] ]
[tool.setuptools.packages.find] [tool.setuptools.packages.find]