audio and transcript

This commit is contained in:
2026-04-02 22:57:21 -03:00
parent 0b5575f3b3
commit d61e2a5492
13 changed files with 556 additions and 11 deletions

View File

@@ -18,12 +18,22 @@ class FrameRef:
timestamp: float # seconds into recording
@dataclass
class TranscriptRef:
    """Reference to one transcript segment, as exposed to agent providers."""

    id: str      # sequential segment id, e.g. "T0001"
    start: float # segment start, seconds into recording
    end: float   # segment end, seconds into recording
    text: str    # transcribed text of the segment
@dataclass
class SessionContext:
    """Everything an agent provider needs to answer one user message.

    Note: the original diff left two identical declarations of
    ``mentioned_frames``; only one is kept here.
    """

    session_dir: Path       # root directory of the current session
    frames: list[FrameRef]  # all captured frames so far
    duration: float         # current recording duration (seconds)
    # Items the user @-referenced in this message.
    mentioned_frames: list[FrameRef] = field(default_factory=list)
    transcript_segments: list[TranscriptRef] = field(default_factory=list)
    mentioned_transcripts: list[TranscriptRef] = field(default_factory=list)
class AgentProvider(ABC):

View File

@@ -47,6 +47,21 @@ def _build_prompt(message: str, context: SessionContext) -> str:
fm, fs = divmod(int(f.timestamp), 60)
lines.append(f" {f.id} at {fm:02d}:{fs:02d}{f.path}")
# Transcript
if context.transcript_segments:
lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
for t in context.transcript_segments:
tm1, ts1 = divmod(int(t.start), 60)
tm2, ts2 = divmod(int(t.end), 60)
lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
if context.mentioned_transcripts:
lines.append("\nTranscript segments referenced in this message:")
for t in context.mentioned_transcripts:
tm1, ts1 = divmod(int(t.start), 60)
tm2, ts2 = divmod(int(t.end), 60)
lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
lines.append(f"\nUser message: {message}")
return "\n".join(lines)

View File

@@ -95,10 +95,17 @@ class OpenAICompatProvider(AgentProvider):
# Build context header
m, s = divmod(int(context.duration), 60)
ctx_text = (
f"Recording duration: {m:02d}:{s:02d}\n"
f"Total frames: {len(context.frames)}\n"
)
ctx_lines = [
f"Recording duration: {m:02d}:{s:02d}",
f"Total frames: {len(context.frames)}",
]
if context.transcript_segments:
ctx_lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
for t in context.transcript_segments:
tm1, ts1 = divmod(int(t.start), 60)
tm2, ts2 = divmod(int(t.end), 60)
ctx_lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
ctx_text = "\n".join(ctx_lines) + "\n"
frames_to_send = context.mentioned_frames

View File

@@ -15,7 +15,7 @@ from pathlib import Path
from threading import Thread
from typing import Callable
from cht.agent.base import AgentProvider, FrameRef, SessionContext
from cht.agent.base import AgentProvider, FrameRef, TranscriptRef, SessionContext
log = logging.getLogger(__name__)
@@ -98,6 +98,33 @@ def _load_frames(frames_dir: Path) -> list[FrameRef]:
return []
def _load_transcript(transcript_dir: Path) -> list[TranscriptRef]:
    """Load transcript segments from ``transcript/index.json``.

    Returns an empty list when the index is missing, unreadable, or
    contains entries that do not match the TranscriptRef fields.
    """
    index_path = transcript_dir / "index.json"
    if not index_path.exists():
        return []
    try:
        # Parsing and construction both stay inside the try: a malformed
        # entry (bad keys) is treated the same as unreadable JSON.
        return [TranscriptRef(**entry) for entry in json.loads(index_path.read_text())]
    except Exception as e:
        log.warning("Could not load transcript index: %s", e)
        return []
def _parse_transcript_mentions(message: str, segments: list[TranscriptRef]) -> list[TranscriptRef]:
    """Extract @T references from message. Accepts @T0001, @t1, @T1."""
    found = []
    resolved = set()
    for match in re.finditer(r"@[Tt](\d+)", message):
        num = int(match.group(1))
        tid = f"T{num:04d}"
        if tid in resolved:
            continue
        # Linear scan; only ids that actually resolve are deduplicated,
        # matching the original behavior for unknown ids.
        for seg in segments:
            if seg.id == tid:
                found.append(seg)
                resolved.add(tid)
                break
    return found
class AgentRunner:
"""Runs agent queries in a background thread, streams chunks to a callback."""
@@ -152,12 +179,16 @@ class AgentRunner:
try:
provider = self._get_provider()
frames = _load_frames(stream_mgr.frames_dir)
mentioned = _parse_mentions(message, frames)
mentioned_frames = _parse_mentions(message, frames)
transcript = _load_transcript(stream_mgr.transcript_dir)
mentioned_transcripts = _parse_transcript_mentions(message, transcript)
context = SessionContext(
session_dir=stream_mgr.session_dir,
frames=frames,
duration=tracker.duration if tracker else 0.0,
mentioned_frames=mentioned,
mentioned_frames=mentioned_frames,
transcript_segments=transcript,
mentioned_transcripts=mentioned_transcripts,
)
for chunk in provider.stream(message, context):
on_chunk(chunk)

0
cht/audio/__init__.py Normal file
View File

90
cht/audio/waveform.py Normal file
View File

@@ -0,0 +1,90 @@
"""
Waveform peak computation from WAV files.
Reads 16kHz mono PCM WAV files (as produced by ffmpeg extract_audio_chunk),
computes RMS amplitude per time bucket, and stores peaks as a numpy array
that grows incrementally during live recording.
"""
import logging
import wave
import numpy as np
log = logging.getLogger(__name__)
class WaveformEngine:
    """Computes and accumulates waveform peak data from WAV chunks.

    Peaks are per-bucket RMS amplitudes (float32, ~0..1 for 16-bit PCM),
    one value per ``bucket_ms`` of audio. The array grows incrementally
    during live recording via append_chunk(), or is rebuilt in one pass
    via compute_full() for loaded sessions.

    NOTE(review): assumes mono input, as produced by extract_audio_chunk;
    a stereo file would be read as interleaved samples and double the
    apparent duration — confirm upstream always emits mono.
    """

    def __init__(self, bucket_ms=50):
        """
        Args:
            bucket_ms: milliseconds of audio summarized by one RMS peak.
        """
        self._bucket_ms = bucket_ms
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0  # seconds of audio covered so far

    @property
    def peaks(self):
        """float32 array of per-bucket RMS amplitudes."""
        return self._peaks

    @property
    def bucket_duration(self):
        """Seconds of audio represented by one peak bucket."""
        return self._bucket_ms / 1000.0

    @property
    def total_duration(self):
        """Total seconds of audio processed so far."""
        return self._total_duration

    def append_chunk(self, wav_path, start_time):
        """Read a WAV chunk and append its peaks to the internal array.

        Args:
            wav_path: path to a 16-bit PCM WAV chunk.
            start_time: chunk start offset (seconds) in the recording;
                total_duration becomes start_time + chunk length.
        """
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        new_peaks = self._compute_rms(samples, sample_rate)
        if len(new_peaks) > 0:
            self._peaks = np.concatenate([self._peaks, new_peaks])
        chunk_duration = len(samples) / sample_rate
        self._total_duration = start_time + chunk_duration
        log.info("Waveform: +%d peaks (total %d, %.1fs)",
                 len(new_peaks), len(self._peaks), self._total_duration)

    def compute_full(self, wav_path):
        """Compute all peaks from a complete WAV file (for loaded sessions)."""
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        self._peaks = self._compute_rms(samples, sample_rate)
        self._total_duration = len(samples) / sample_rate
        log.info("Waveform full: %d peaks, %.1fs", len(self._peaks), self._total_duration)

    def reset(self):
        """Discard all peak data (e.g. when a recording stops)."""
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0

    def _read_wav(self, wav_path):
        """Read a 16-bit PCM WAV file into a float32 array in [-1, 1).

        Returns (samples, sample_rate), or (None, 0) for empty or
        unreadable files — and now also for any sample width other than
        2 bytes, which the previous int16 reinterpretation would have
        silently misparsed into garbage peaks.
        """
        try:
            with wave.open(str(wav_path), "rb") as wf:
                n_frames = wf.getnframes()
                if n_frames == 0:
                    return None, 0
                # Guard: the int16 decode below is only valid for 16-bit PCM.
                if wf.getsampwidth() != 2:
                    log.warning("Unsupported WAV sample width %d in %s",
                                wf.getsampwidth(), wav_path)
                    return None, 0
                sample_rate = wf.getframerate()
                raw = wf.readframes(n_frames)
                samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
                return samples, sample_rate
        except Exception as e:
            log.warning("Failed to read WAV %s: %s", wav_path, e)
            return None, 0

    def _compute_rms(self, samples, sample_rate):
        """Compute one RMS amplitude per bucket; a trailing partial bucket is dropped."""
        bucket_size = int(sample_rate * self._bucket_ms / 1000)
        if bucket_size <= 0 or len(samples) < bucket_size:
            return np.empty(0, dtype=np.float32)
        # Trim to whole buckets, then reduce each row to its RMS.
        n_buckets = len(samples) // bucket_size
        trimmed = samples[:n_buckets * bucket_size].reshape(n_buckets, bucket_size)
        rms = np.sqrt(np.mean(trimmed ** 2, axis=1)).astype(np.float32)
        return rms

View File

@@ -19,3 +19,12 @@ SCENE_THRESHOLD = 0.10 # 0-1, lower = more sensitive; 0.1 catches slide/window
# Segment recording
SEGMENT_DURATION = 60  # seconds per .ts segment

# Audio extraction
AUDIO_EXTRACT_INTERVAL = 3  # seconds between audio extraction cycles
AUDIO_SAFETY_MARGIN = 2  # seconds kept behind the write head (matches scene detector)
WAVEFORM_BUCKET_MS = 50  # milliseconds of audio per waveform peak bucket

# Transcription
WHISPER_MODEL = "small"  # "small" for speed, "medium" for accuracy
WHISPER_DEVICE = "cuda"  # "cuda" or "cpu"

View File

@@ -122,6 +122,35 @@ def extract_scene_frames(input_path, output_dir, scene_threshold=0.10,
return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
def extract_audio_chunk(input_path, output_path, start_time=0.0, duration=None):
    """Extract audio from recording as 16kHz mono WAV (optimal for Whisper).

    Uses input-level seeking (-ss before -i) for fast keyframe-based seek.
    Returns (stdout, stderr) as decoded strings; ffmpeg failures are
    logged at debug level and their captured output returned instead of
    raising.
    """
    input_kwargs = {"ss": start_time}
    if duration is not None:
        input_kwargs["t"] = duration
    source = ffmpeg.input(str(input_path), **input_kwargs)
    cmd = (
        ffmpeg.output(
            source,
            str(output_path),
            acodec="pcm_s16le",
            ac=1,
            ar=16000,
            vn=None,  # drop the video stream
        )
        .overwrite_output()
        .global_args(*QUIET_ARGS)
    )
    log.info("extract_audio_chunk: %s", " ".join(cmd.compile()))
    try:
        out, err = cmd.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        out = e.stdout or b""
        err = e.stderr or b""
        # Only the last stderr line is interesting for diagnostics.
        log.debug("ffmpeg audio error: %s",
                  err.decode("utf-8", errors="replace").strip().split("\n")[-1])
    return out.decode("utf-8", errors="replace"), err.decode("utf-8", errors="replace")
def extract_frame_at(input_path, output_path, timestamp):
"""Extract a single frame at the given timestamp."""
output = (

View File

@@ -20,6 +20,8 @@ from cht.config import (
RELAY_PORT,
SCENE_THRESHOLD,
SESSIONS_DIR,
AUDIO_EXTRACT_INTERVAL,
AUDIO_SAFETY_MARGIN,
)
from cht.stream import ffmpeg as ff
@@ -46,6 +48,7 @@ class StreamManager:
self.stream_dir = self.session_dir / "stream"
self.frames_dir = self.session_dir / "frames"
self.transcript_dir = self.session_dir / "transcript"
self.audio_dir = self.session_dir / "audio"
self.agent_dir = self.session_dir / "agent"
self._procs = {}
@@ -103,7 +106,7 @@ class StreamManager:
return total
def setup_dirs(self):
for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.agent_dir):
for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.audio_dir, self.agent_dir):
d.mkdir(parents=True, exist_ok=True)
@property
@@ -349,6 +352,77 @@ class StreamManager:
Thread(target=_capture, daemon=True, name="capture_now").start()
# -- Audio Extraction --
def start_audio_extractor(self, on_new_audio=None):
    """Periodically extract audio from the growing recording as WAV chunks.
    Same incremental pattern as scene detector: polls recording, extracts
    new time range, calls back with (wav_path, start_time, duration).
    Args:
        on_new_audio: callback(wav_path, start_time, duration)
    """
    self._on_new_audio = on_new_audio
    self.audio_dir.mkdir(parents=True, exist_ok=True)
    def _extract():
        # Seconds of the current segment already extracted to WAV.
        processed_time = 0.0
        chunk_num = 0
        current_segment = None
        # NOTE(review): loop exits when "stop" appears in _stop_flags —
        # presumably set by stop_all(); confirm the flag protocol there.
        while "stop" not in self._stop_flags:
            time.sleep(AUDIO_EXTRACT_INTERVAL)
            seg = self.recording_path
            if not seg.exists():
                continue
            # Recorder rolled over to a new segment file: restart offsets.
            if seg != current_segment:
                current_segment = seg
                processed_time = 0.0
                chunk_num = 0
                log.info("Audio extractor: switched to %s", seg.name)
            # Skip segments too small to contain decodable audio yet.
            if seg.stat().st_size < 100_000:
                continue
            safe_duration = self._estimate_safe_duration()
            if safe_duration is None or safe_duration <= 0:
                continue
            # Stay AUDIO_SAFETY_MARGIN behind the write head, and only act
            # once at least ~1s of new audio has accumulated.
            process_to = safe_duration - AUDIO_SAFETY_MARGIN
            if process_to <= processed_time + 1.0:
                continue
            chunk_duration = process_to - processed_time
            wav_path = self.audio_dir / f"chunk_{chunk_num:04d}.wav"
            try:
                ff.extract_audio_chunk(
                    seg, wav_path,
                    start_time=processed_time,
                    duration=chunk_duration,
                )
            except Exception as e:
                log.error("Audio extraction failed: %s", e)
                continue
            # Treat tiny/missing outputs as failed extractions; the range
            # is only marked processed after a successful chunk, so a
            # failed range is retried on the next cycle.
            if wav_path.exists() and wav_path.stat().st_size > 100:
                log.info("Audio chunk: %s (%.1fs → %.1fs)",
                         wav_path.name, processed_time, process_to)
                if self._on_new_audio:
                    self._on_new_audio(wav_path, processed_time, chunk_duration)
                chunk_num += 1
                processed_time = process_to
        log.info("Audio extractor stopped")
    t = Thread(target=_extract, daemon=True, name="audio_extractor")
    t.start()
    self._threads["audio_extractor"] = t
# -- Lifecycle --
def stop_all(self):

98
cht/transcriber/engine.py Normal file
View File

@@ -0,0 +1,98 @@
"""
Transcription engine using faster-whisper.
Processes WAV chunks incrementally, assigns sequential IDs (T0001, T0002, ...),
and persists to transcript/index.json in the session directory.
"""
import json
import logging
from dataclasses import dataclass, asdict
from pathlib import Path
log = logging.getLogger(__name__)
@dataclass
class TranscriptSegment:
    """One transcribed utterance with timestamps absolute to the recording."""

    id: str      # sequential id, e.g. "T0001"
    start: float # seconds into recording
    end: float   # seconds into recording
    text: str    # transcribed text
class TranscriberEngine:
    """Incremental transcription via faster-whisper with GPU acceleration.

    Segments get sequential ids ("T0001", "T0002", ...) and accumulate
    across chunks; the whole set can be persisted to / restored from a
    JSON index file.
    """

    def __init__(self, model_size="small", device="cuda"):
        self._model = None  # WhisperModel, created lazily on first use
        self._model_size = model_size
        self._device = device
        self._segments: list[TranscriptSegment] = []
        self._next_id = 1  # numeric part of the next "Txxxx" id

    def _ensure_model(self):
        """Load the whisper model on first use; the import is deferred
        because faster_whisper is a heavy dependency."""
        if self._model is not None:
            return
        log.info("Loading whisper model: %s (device=%s)", self._model_size, self._device)
        from faster_whisper import WhisperModel
        compute = "float16" if self._device == "cuda" else "int8"
        self._model = WhisperModel(self._model_size, device=self._device, compute_type=compute)
        log.info("Whisper model loaded")

    def transcribe_chunk(self, wav_path, time_offset=0.0) -> list[TranscriptSegment]:
        """Transcribe a WAV chunk. Returns new segments with absolute timestamps."""
        self._ensure_model()
        try:
            segments_iter, _info = self._model.transcribe(
                str(wav_path),
                beam_size=5,
                vad_filter=True,
            )
        except Exception as e:
            log.error("Whisper transcription failed: %s", e)
            return []
        fresh: list[TranscriptSegment] = []
        for seg in segments_iter:
            stripped = seg.text.strip()
            if not stripped:
                continue  # drop empty/whitespace-only segments
            entry = TranscriptSegment(
                id=f"T{self._next_id:04d}",
                # Chunk-relative times shifted to recording-absolute.
                start=time_offset + seg.start,
                end=time_offset + seg.end,
                text=stripped,
            )
            self._next_id += 1
            self._segments.append(entry)
            fresh.append(entry)
        return fresh

    def all_segments(self) -> list[TranscriptSegment]:
        """Snapshot copy of every segment transcribed so far."""
        return list(self._segments)

    def save_index(self, path: Path):
        """Persist all segments to *path* as pretty-printed JSON."""
        path.write_text(json.dumps([asdict(s) for s in self._segments], indent=2))

    def load_index(self, path: Path):
        """Restore segments from a JSON index; ids continue after the highest loaded."""
        try:
            data = json.loads(path.read_text())
        except Exception as e:
            log.warning("Failed to load transcript index: %s", e)
            return
        self._segments = [TranscriptSegment(**e) for e in data]
        if self._segments:
            self._next_id = max(int(s.id.lstrip("T")) for s in self._segments) + 1
        log.info("Loaded %d transcript segments", len(self._segments))

    def reset(self):
        """Drop all segments and restart id numbering."""
        self._segments.clear()
        self._next_id = 1

107
cht/ui/waveform.py Normal file
View File

@@ -0,0 +1,107 @@
"""
WaveformWidget: GTK4 DrawingArea that renders waveform peaks with a playhead.
Driven by Timeline "changed" signal — redraws when cursor or duration changes.
Peak data is set externally via set_peaks() from GLib.idle_add.
"""
import logging
import math
import numpy as np
import gi
gi.require_version("Gtk", "4.0")
from gi.repository import Gtk, GLib
log = logging.getLogger(__name__)
class WaveformWidget(Gtk.Box):
    """Waveform display synced to Timeline state.

    A header label above a Gtk.DrawingArea. Redraws when the timeline's
    "changed" signal fires or when new peak data arrives via set_peaks().
    """

    def __init__(self, timeline, **kwargs):
        super().__init__(orientation=Gtk.Orientation.VERTICAL, **kwargs)
        self._timeline = timeline
        self._peaks = None            # RMS peak array, None until data arrives
        self._bucket_duration = 0.05  # seconds of audio per peak bucket
        label = Gtk.Label(label="Waveform")
        label.add_css_class("heading")
        label.set_margin_top(4)
        label.set_margin_bottom(4)
        self.append(label)
        self._area = Gtk.DrawingArea()
        self._area.set_content_height(250)
        self._area.set_hexpand(True)
        self._area.set_vexpand(True)
        self._area.set_draw_func(self._draw)
        self.append(self._area)
        timeline.connect("changed", self._on_timeline_changed)

    def set_peaks(self, peaks, bucket_duration):
        """Update peak data. Call from GLib.idle_add."""
        self._peaks = peaks
        self._bucket_duration = bucket_duration
        self._area.queue_draw()

    def _on_timeline_changed(self, timeline):
        """Redraw on any timeline change (cursor move, duration growth)."""
        self._area.queue_draw()

    def _draw(self, area, cr, width, height):
        """Cairo draw callback. Paint order matters: background, center
        line, peak bars, scene markers, then playhead on top."""
        # Background
        cr.set_source_rgb(0.1, 0.1, 0.12)
        cr.rectangle(0, 0, width, height)
        cr.fill()
        state = self._timeline.state
        duration = state.duration
        mid_y = height / 2
        # Center line
        cr.set_source_rgba(0.3, 0.3, 0.35, 1.0)
        cr.set_line_width(1)
        cr.move_to(0, mid_y)
        cr.line_to(width, mid_y)
        cr.stroke()
        # Draw peaks: one 1px column per horizontal pixel, mapped from
        # pixel time back to the corresponding peak bucket.
        if self._peaks is not None and len(self._peaks) > 0 and duration > 0:
            n_peaks = len(self._peaks)
            # Map peaks to pixel columns
            # NOTE(review): peak_duration is computed but never used —
            # candidate for removal.
            peak_duration = n_peaks * self._bucket_duration
            # Normalize against the loudest peak so the display fills the
            # available height; guard against an all-zero array.
            max_peak = np.max(self._peaks) if np.max(self._peaks) > 0 else 1.0
            for x in range(width):
                # Time at this pixel
                t = (x / width) * duration
                # Corresponding peak index
                idx = int(t / self._bucket_duration)
                if 0 <= idx < n_peaks:
                    val = self._peaks[idx] / max_peak
                    bar_h = val * (height * 0.45)  # 90% of half-height
                    # Green gradient based on amplitude
                    cr.set_source_rgba(0.2, 0.6 + val * 0.4, 0.3, 0.85)
                    # Mirrored bar around the center line.
                    cr.rectangle(x, mid_y - bar_h, 1, bar_h * 2)
                    cr.fill()
        # Scene markers: faint yellow vertical lines at scene-change times.
        if duration > 0:
            cr.set_source_rgba(1.0, 1.0, 0.3, 0.3)
            cr.set_line_width(1)
            for marker in state.scene_markers:
                mx = (marker / duration) * width
                cr.move_to(mx, 0)
                cr.line_to(mx, height)
                cr.stroke()
        # Playhead: red cursor line, drawn last so it sits on top.
        if duration > 0:
            px = (state.cursor / duration) * width
            cr.set_source_rgba(1.0, 0.3, 0.3, 0.9)
            cr.set_line_width(2)
            cr.move_to(px, 0)
            cr.line_to(px, height)
            cr.stroke()

View File

@@ -10,9 +10,14 @@ gi.require_version("Adw", "1")
gi.require_version("GdkPixbuf", "2.0")
from gi.repository import Gtk, Gdk, Adw, GLib, Pango, GdkPixbuf
from threading import Thread
from cht.config import APP_NAME, SCENE_THRESHOLD
from cht.ui.timeline import Timeline, TimelineControls
from cht.ui.monitor import MonitorWidget
from cht.ui.waveform import WaveformWidget
from cht.audio.waveform import WaveformEngine
from cht.transcriber.engine import TranscriberEngine
from cht.stream.manager import StreamManager, list_sessions
from cht.stream.tracker import RecordingTracker
from cht.agent.runner import AgentRunner, ACTIONS, check_claude_cli
@@ -37,6 +42,8 @@ class ChtWindow(Adw.ApplicationWindow):
# Timeline is the central state machine
self._timeline = Timeline()
self._agent = AgentRunner()
self._waveform_engine = WaveformEngine()
self._transcriber = TranscriberEngine()
# Main layout
self._main_paned = Gtk.Paned(orientation=Gtk.Orientation.HORIZONTAL)
@@ -165,6 +172,34 @@ class ChtWindow(Adw.ApplicationWindow):
# Load existing frames into the strip
self._load_existing_frames()
# Load existing transcript
transcript_index = self._stream_mgr.transcript_dir / "index.json"
if transcript_index.exists():
self._transcriber.load_index(transcript_index)
segs = self._transcriber.all_segments()
if segs:
self._append_transcript_segments(segs)
self._append_agent_output(f" Loaded {len(segs)} transcript segments.\n")
# Compute waveform from existing recordings (background thread)
if segments:
from cht.stream import ffmpeg as ff
def _compute_waveform():
audio_dir = self._stream_mgr.audio_dir
audio_dir.mkdir(parents=True, exist_ok=True)
full_wav = audio_dir / "full.wav"
try:
ff.extract_audio_chunk(segments[0], full_wav)
self._waveform_engine.compute_full(full_wav)
peaks = self._waveform_engine.peaks
bucket_dur = self._waveform_engine.bucket_duration
GLib.idle_add(self._waveform_widget.set_peaks, peaks.copy(), bucket_dur)
except Exception as e:
log.error("Waveform computation failed: %s", e)
Thread(target=_compute_waveform, daemon=True, name="waveform_load").start()
# Set up agent auth/model if not already done
self._populate_model_dropdown()
@@ -197,6 +232,9 @@ class ChtWindow(Adw.ApplicationWindow):
# Start scene detection
self._stream_mgr.start_scene_detector(on_new_frames=self._on_new_scene_frames)
# Start audio extraction (waveform + transcription)
self._stream_mgr.start_audio_extractor(on_new_audio=self._on_new_audio)
# Start polling for frame thumbnails
GLib.timeout_add(1000, self._poll_frames)
@@ -237,6 +275,26 @@ class ChtWindow(Adw.ApplicationWindow):
for f in frames:
GLib.idle_add(self._timeline.add_scene_marker, f["timestamp"])
def _on_new_audio(self, wav_path, start_time, duration):
    """Called from audio extractor thread with new WAV chunk."""
    # Waveform peaks are cheap (~1ms), so compute them inline on this
    # thread, then hand the copy to the UI thread via idle_add.
    self._waveform_engine.append_chunk(wav_path, start_time)
    GLib.idle_add(
        self._waveform_widget.set_peaks,
        self._waveform_engine.peaks.copy(),
        self._waveform_engine.bucket_duration,
    )

    def _run_transcription():
        # GPU-bound and slow (~1-2s per chunk) — keep it off the
        # extractor thread so audio extraction isn't stalled.
        segs = self._transcriber.transcribe_chunk(wav_path, time_offset=start_time)
        if self._stream_mgr:
            self._transcriber.save_index(self._stream_mgr.transcript_dir / "index.json")
        if segs:
            GLib.idle_add(self._append_transcript_segments, segs)

    Thread(target=_run_transcription, daemon=True, name="transcriber").start()
def _check_recorder(self):
"""Watchdog: restart recorder if it died (sender disconnect, etc)."""
if not self._streaming or not self._stream_mgr:
@@ -257,6 +315,10 @@ class ChtWindow(Adw.ApplicationWindow):
log.info("Stopping stream...")
self._timeline.reset()
self._monitor.stop()
self._waveform_engine.reset()
self._waveform_widget.set_peaks(None, 0.05)
self._transcriber.reset()
self._transcript_view.get_buffer().set_text("")
if self._tracker:
self._tracker.stop()
self._tracker = None
@@ -298,8 +360,10 @@ class ChtWindow(Adw.ApplicationWindow):
stream_frame.set_child(self._monitor)
top_paned.set_start_child(stream_frame)
self._waveform_area = self._build_placeholder("Waveform", height=250, width=200)
top_paned.set_end_child(self._waveform_area)
self._waveform_widget = WaveformWidget(self._timeline)
waveform_frame = Gtk.Frame()
waveform_frame.set_child(self._waveform_widget)
top_paned.set_end_child(waveform_frame)
top_paned.set_position(650)
right_box.append(top_paned)
@@ -819,6 +883,16 @@ class ChtWindow(Adw.ApplicationWindow):
# Auto-scroll to bottom
self._agent_output_view.scroll_to_iter(buf.get_end_iter(), 0, False, 0, 0)
def _append_transcript_segments(self, segments):
    """Append transcription segments to the transcript panel."""
    text_buf = self._transcript_view.get_buffer()
    end_iter = text_buf.get_end_iter
    for seg in segments:
        m1, s1 = divmod(int(seg.start), 60)
        m2, s2 = divmod(int(seg.end), 60)
        text_buf.insert(
            end_iter(),
            f"[{m1:02d}:{s1:02d}-{m2:02d}:{s2:02d}] {seg.id} {seg.text}\n",
        )
    # Keep the newest line visible.
    self._transcript_view.scroll_to_iter(end_iter(), 0, False, 0, 0)
# -- Frame thumbnails --
def _load_existing_frames(self):