audio and transcript

This commit is contained in:
2026-04-02 22:57:21 -03:00
parent 0b5575f3b3
commit d61e2a5492
13 changed files with 556 additions and 11 deletions

View File

@@ -10,9 +10,14 @@ gi.require_version("Adw", "1")
gi.require_version("GdkPixbuf", "2.0")
from gi.repository import Gtk, Gdk, Adw, GLib, Pango, GdkPixbuf
from threading import Thread
from cht.config import APP_NAME, SCENE_THRESHOLD
from cht.ui.timeline import Timeline, TimelineControls
from cht.ui.monitor import MonitorWidget
from cht.ui.waveform import WaveformWidget
from cht.audio.waveform import WaveformEngine
from cht.transcriber.engine import TranscriberEngine
from cht.stream.manager import StreamManager, list_sessions
from cht.stream.tracker import RecordingTracker
from cht.agent.runner import AgentRunner, ACTIONS, check_claude_cli
@@ -37,6 +42,8 @@ class ChtWindow(Adw.ApplicationWindow):
# Timeline is the central state machine
self._timeline = Timeline()
self._agent = AgentRunner()
self._waveform_engine = WaveformEngine()
self._transcriber = TranscriberEngine()
# Main layout
self._main_paned = Gtk.Paned(orientation=Gtk.Orientation.HORIZONTAL)
@@ -165,6 +172,34 @@ class ChtWindow(Adw.ApplicationWindow):
# Load existing frames into the strip
self._load_existing_frames()
# Load existing transcript
transcript_index = self._stream_mgr.transcript_dir / "index.json"
if transcript_index.exists():
self._transcriber.load_index(transcript_index)
segs = self._transcriber.all_segments()
if segs:
self._append_transcript_segments(segs)
self._append_agent_output(f" Loaded {len(segs)} transcript segments.\n")
# Compute waveform from existing recordings (background thread)
if segments:
from cht.stream import ffmpeg as ff
def _compute_waveform():
    """Build the waveform for the first existing recording segment.

    Runs on the "waveform_load" worker thread: extracts audio from
    ``segments[0]`` into ``<audio_dir>/full.wav``, has the waveform
    engine compute peaks for that file, and schedules the widget
    update through ``GLib.idle_add``. Failures during extraction or
    peak computation are logged rather than raised.
    """
    wav_dir = self._stream_mgr.audio_dir
    wav_dir.mkdir(parents=True, exist_ok=True)
    target_wav = wav_dir / "full.wav"
    try:
        ff.extract_audio_chunk(segments[0], target_wav)
        self._waveform_engine.compute_full(target_wav)
        computed_peaks = self._waveform_engine.peaks
        seconds_per_bucket = self._waveform_engine.bucket_duration
        # Hand the widget its own copy of the peaks.
        GLib.idle_add(
            self._waveform_widget.set_peaks,
            computed_peaks.copy(),
            seconds_per_bucket,
        )
    except Exception as err:
        log.error("Waveform computation failed: %s", err)
Thread(target=_compute_waveform, daemon=True, name="waveform_load").start()
# Set up agent auth/model if not already done
self._populate_model_dropdown()
@@ -197,6 +232,9 @@ class ChtWindow(Adw.ApplicationWindow):
# Start scene detection
self._stream_mgr.start_scene_detector(on_new_frames=self._on_new_scene_frames)
# Start audio extraction (waveform + transcription)
self._stream_mgr.start_audio_extractor(on_new_audio=self._on_new_audio)
# Start polling for frame thumbnails
GLib.timeout_add(1000, self._poll_frames)
@@ -237,6 +275,26 @@ class ChtWindow(Adw.ApplicationWindow):
for f in frames:
GLib.idle_add(self._timeline.add_scene_marker, f["timestamp"])
def _on_new_audio(self, wav_path, start_time, duration):
    """Handle a new WAV chunk delivered by the audio extractor thread.

    Updates the waveform first, then starts a separate "transcriber"
    thread that transcribes the chunk, saves the transcript index and
    schedules the new segments for display.

    Args:
        wav_path: Path of the WAV chunk to process.
        start_time: Offset of the chunk, passed to the waveform engine
            and used as the transcription time offset.
        duration: Length of the chunk; accepted for the callback
            signature but not used here.
    """
    # Compute waveform peaks (fast, ~1ms)
    self._waveform_engine.append_chunk(wav_path, start_time)
    peaks = self._waveform_engine.peaks
    bucket_dur = self._waveform_engine.bucket_duration
    # Widget updates are scheduled through GLib.idle_add; pass a copy of the peaks.
    GLib.idle_add(self._waveform_widget.set_peaks, peaks.copy(), bucket_dur)

    # Transcribe in separate thread (GPU-bound, ~1-2s per chunk)
    def _transcribe():
        try:
            new_segs = self._transcriber.transcribe_chunk(
                wav_path, time_offset=start_time
            )
        except Exception as e:
            # Without this the worker thread would die with no log entry.
            log.error("Transcription failed for %s: %s", wav_path, e)
            return

        # Snapshot the manager once: the attribute is re-read from a worker
        # thread, so checking and then dereferencing it twice is a race.
        stream_mgr = self._stream_mgr
        if stream_mgr:
            try:
                self._transcriber.save_index(
                    stream_mgr.transcript_dir / "index.json"
                )
            except Exception as e:
                # A failed save must not hide segments already transcribed.
                log.error("Saving transcript index failed: %s", e)

        if new_segs:
            GLib.idle_add(self._append_transcript_segments, new_segs)

    Thread(target=_transcribe, daemon=True, name="transcriber").start()
def _check_recorder(self):
"""Watchdog: restart recorder if it died (sender disconnect, etc)."""
if not self._streaming or not self._stream_mgr:
@@ -257,6 +315,10 @@ class ChtWindow(Adw.ApplicationWindow):
log.info("Stopping stream...")
self._timeline.reset()
self._monitor.stop()
self._waveform_engine.reset()
self._waveform_widget.set_peaks(None, 0.05)
self._transcriber.reset()
self._transcript_view.get_buffer().set_text("")
if self._tracker:
self._tracker.stop()
self._tracker = None
@@ -298,8 +360,10 @@ class ChtWindow(Adw.ApplicationWindow):
stream_frame.set_child(self._monitor)
top_paned.set_start_child(stream_frame)
self._waveform_area = self._build_placeholder("Waveform", height=250, width=200)
top_paned.set_end_child(self._waveform_area)
self._waveform_widget = WaveformWidget(self._timeline)
waveform_frame = Gtk.Frame()
waveform_frame.set_child(self._waveform_widget)
top_paned.set_end_child(waveform_frame)
top_paned.set_position(650)
right_box.append(top_paned)
@@ -819,6 +883,16 @@ class ChtWindow(Adw.ApplicationWindow):
# Auto-scroll to bottom
self._agent_output_view.scroll_to_iter(buf.get_end_iter(), 0, False, 0, 0)
def _append_transcript_segments(self, segments):
    """Write transcription segments to the end of the transcript panel.

    Each segment becomes one line of the form
    ``[MM:SS-MM:SS] <id> <text>``, where the times are the segment's
    ``start`` and ``end`` truncated to whole seconds. The view is then
    scrolled to the end of the buffer.
    """
    view = self._transcript_view
    text_buffer = view.get_buffer()

    def _clock(seconds):
        # Minutes are not wrapped into hours; they simply keep counting.
        minutes, secs = divmod(int(seconds), 60)
        return f"{minutes:02d}:{secs:02d}"

    for segment in segments:
        span = f"{_clock(segment.start)}-{_clock(segment.end)}"
        entry = f"[{span}] {segment.id} {segment.text}\n"
        text_buffer.insert(text_buffer.get_end_iter(), entry)

    # Auto-scroll to bottom
    view.scroll_to_iter(text_buffer.get_end_iter(), 0, False, 0, 0)
# -- Frame thumbnails --
def _load_existing_frames(self):