audio and transcript
This commit is contained in:
@@ -18,12 +18,22 @@ class FrameRef:
|
||||
timestamp: float # seconds into recording
|
||||
|
||||
|
||||
@dataclass
class TranscriptRef:
    """One transcript segment, addressable in user messages via @-mention
    (e.g. "@T0001"); mirrors the entries persisted in transcript/index.json."""
    id: str  # "T0001"
    start: float  # seconds into recording
    end: float  # seconds into recording
    text: str  # transcribed text for this segment
|
||||
|
||||
|
||||
@dataclass
|
||||
class SessionContext:
|
||||
session_dir: Path
|
||||
frames: list[FrameRef] # all captured frames so far
|
||||
duration: float # current recording duration (seconds)
|
||||
mentioned_frames: list[FrameRef] = field(default_factory=list) # @-referenced in message
|
||||
mentioned_frames: list[FrameRef] = field(default_factory=list)
|
||||
transcript_segments: list[TranscriptRef] = field(default_factory=list)
|
||||
mentioned_transcripts: list[TranscriptRef] = field(default_factory=list)
|
||||
|
||||
|
||||
class AgentProvider(ABC):
|
||||
|
||||
@@ -47,6 +47,21 @@ def _build_prompt(message: str, context: SessionContext) -> str:
|
||||
fm, fs = divmod(int(f.timestamp), 60)
|
||||
lines.append(f" {f.id} at {fm:02d}:{fs:02d} — {f.path}")
|
||||
|
||||
# Transcript
|
||||
if context.transcript_segments:
|
||||
lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
|
||||
for t in context.transcript_segments:
|
||||
tm1, ts1 = divmod(int(t.start), 60)
|
||||
tm2, ts2 = divmod(int(t.end), 60)
|
||||
lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
|
||||
|
||||
if context.mentioned_transcripts:
|
||||
lines.append("\nTranscript segments referenced in this message:")
|
||||
for t in context.mentioned_transcripts:
|
||||
tm1, ts1 = divmod(int(t.start), 60)
|
||||
tm2, ts2 = divmod(int(t.end), 60)
|
||||
lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
|
||||
|
||||
lines.append(f"\nUser message: {message}")
|
||||
return "\n".join(lines)
|
||||
|
||||
|
||||
@@ -95,10 +95,17 @@ class OpenAICompatProvider(AgentProvider):
|
||||
|
||||
# Build context header
|
||||
m, s = divmod(int(context.duration), 60)
|
||||
ctx_text = (
|
||||
f"Recording duration: {m:02d}:{s:02d}\n"
|
||||
f"Total frames: {len(context.frames)}\n"
|
||||
)
|
||||
ctx_lines = [
|
||||
f"Recording duration: {m:02d}:{s:02d}",
|
||||
f"Total frames: {len(context.frames)}",
|
||||
]
|
||||
if context.transcript_segments:
|
||||
ctx_lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
|
||||
for t in context.transcript_segments:
|
||||
tm1, ts1 = divmod(int(t.start), 60)
|
||||
tm2, ts2 = divmod(int(t.end), 60)
|
||||
ctx_lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
|
||||
ctx_text = "\n".join(ctx_lines) + "\n"
|
||||
|
||||
frames_to_send = context.mentioned_frames
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ from pathlib import Path
|
||||
from threading import Thread
|
||||
from typing import Callable
|
||||
|
||||
from cht.agent.base import AgentProvider, FrameRef, SessionContext
|
||||
from cht.agent.base import AgentProvider, FrameRef, TranscriptRef, SessionContext
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
@@ -98,6 +98,33 @@ def _load_frames(frames_dir: Path) -> list[FrameRef]:
|
||||
return []
|
||||
|
||||
|
||||
def _load_transcript(transcript_dir: Path) -> list[TranscriptRef]:
    """Load transcript segments from <transcript_dir>/index.json.

    Returns an empty list when the index file is absent or cannot be
    parsed; a parse failure is logged but never raised to the caller.
    """
    index_file = transcript_dir / "index.json"
    if not index_file.exists():
        return []
    try:
        return [TranscriptRef(**entry) for entry in json.loads(index_file.read_text())]
    except Exception as err:
        log.warning("Could not load transcript index: %s", err)
        return []
|
||||
|
||||
|
||||
def _parse_transcript_mentions(message: str, segments: list[TranscriptRef]) -> list[TranscriptRef]:
    """Extract @T references from message. Accepts @T0001, @t1, @T1.

    Returns the matching segments in order of first mention, deduplicated.
    References that do not match any known segment ID are ignored.
    """
    # Build the lookup once instead of a linear scan per mention.
    by_id = {s.id: s for s in segments}
    mentioned: list[TranscriptRef] = []
    seen: set[str] = set()
    for match in re.finditer(r"@[Tt](\d+)", message):
        # Normalize "@t1" / "@T1" / "@T0001" to the canonical "T0001" form.
        tid = f"T{int(match.group(1)):04d}"
        if tid in seen:
            continue
        seen.add(tid)
        seg = by_id.get(tid)
        if seg is not None:
            mentioned.append(seg)
    return mentioned
|
||||
|
||||
|
||||
class AgentRunner:
|
||||
"""Runs agent queries in a background thread, streams chunks to a callback."""
|
||||
|
||||
@@ -152,12 +179,16 @@ class AgentRunner:
|
||||
try:
|
||||
provider = self._get_provider()
|
||||
frames = _load_frames(stream_mgr.frames_dir)
|
||||
mentioned = _parse_mentions(message, frames)
|
||||
mentioned_frames = _parse_mentions(message, frames)
|
||||
transcript = _load_transcript(stream_mgr.transcript_dir)
|
||||
mentioned_transcripts = _parse_transcript_mentions(message, transcript)
|
||||
context = SessionContext(
|
||||
session_dir=stream_mgr.session_dir,
|
||||
frames=frames,
|
||||
duration=tracker.duration if tracker else 0.0,
|
||||
mentioned_frames=mentioned,
|
||||
mentioned_frames=mentioned_frames,
|
||||
transcript_segments=transcript,
|
||||
mentioned_transcripts=mentioned_transcripts,
|
||||
)
|
||||
for chunk in provider.stream(message, context):
|
||||
on_chunk(chunk)
|
||||
|
||||
0
cht/audio/__init__.py
Normal file
0
cht/audio/__init__.py
Normal file
90
cht/audio/waveform.py
Normal file
90
cht/audio/waveform.py
Normal file
@@ -0,0 +1,90 @@
|
||||
"""
Waveform peak computation from WAV files.

Reads 16kHz mono PCM WAV files (as produced by ffmpeg extract_audio_chunk),
computes RMS amplitude per time bucket, and stores peaks as a numpy array
that grows incrementally during live recording.
"""

import logging
import wave

import numpy as np

log = logging.getLogger(__name__)


class WaveformEngine:
    """Computes and accumulates waveform peak data from WAV chunks."""

    def __init__(self, bucket_ms=50):
        # Width of one peak bucket in milliseconds; 50ms = 20 peaks/second.
        self._bucket_ms = bucket_ms
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0

    @property
    def peaks(self):
        # float32 array of RMS values, one per bucket.
        return self._peaks

    @property
    def bucket_duration(self):
        # Seconds covered by a single peak bucket.
        return self._bucket_ms / 1000.0

    @property
    def total_duration(self):
        # Seconds of audio processed so far.
        return self._total_duration

    def append_chunk(self, wav_path, start_time):
        """Read a WAV chunk and append its peaks to the internal array.

        Args:
            wav_path: path to a 16-bit PCM WAV chunk.
            start_time: chunk offset (seconds) into the overall recording;
                total_duration is advanced to start_time + chunk length.
        """
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        new_peaks = self._compute_rms(samples, sample_rate)
        if len(new_peaks) > 0:
            self._peaks = np.concatenate([self._peaks, new_peaks])
            chunk_duration = len(samples) / sample_rate
            self._total_duration = start_time + chunk_duration
            log.info("Waveform: +%d peaks (total %d, %.1fs)",
                     len(new_peaks), len(self._peaks), self._total_duration)

    def compute_full(self, wav_path):
        """Compute all peaks from a complete WAV file (for loaded sessions)."""
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0
        samples, sample_rate = self._read_wav(wav_path)
        if samples is None:
            return
        self._peaks = self._compute_rms(samples, sample_rate)
        self._total_duration = len(samples) / sample_rate
        log.info("Waveform full: %d peaks, %.1fs", len(self._peaks), self._total_duration)

    def reset(self):
        """Discard all accumulated peaks and duration."""
        self._peaks = np.empty(0, dtype=np.float32)
        self._total_duration = 0.0

    def _read_wav(self, wav_path):
        """Read a 16-bit PCM WAV file into a float32 array in [-1.0, 1.0).

        Multi-channel audio is downmixed to mono by averaging channels.
        Returns (samples, sample_rate), or (None, 0) for empty, unsupported,
        or unreadable files.
        """
        try:
            with wave.open(str(wav_path), "rb") as wf:
                n_frames = wf.getnframes()
                if n_frames == 0:
                    return None, 0
                # Only 16-bit PCM is supported: the int16 view below would
                # silently misinterpret any other sample width.
                if wf.getsampwidth() != 2:
                    log.warning("Unsupported sample width in %s: %d bytes",
                                wav_path, wf.getsampwidth())
                    return None, 0
                sample_rate = wf.getframerate()
                n_channels = wf.getnchannels()
                raw = wf.readframes(n_frames)
                samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32768.0
                if n_channels > 1:
                    # Interleaved channels -> mono average, so bucket sizes
                    # stay correct for stereo input.
                    samples = samples.reshape(-1, n_channels).mean(axis=1)
                return samples, sample_rate
        except Exception as e:
            log.warning("Failed to read WAV %s: %s", wav_path, e)
            return None, 0

    def _compute_rms(self, samples, sample_rate):
        """Compute RMS amplitude per bucket; a trailing partial bucket is dropped."""
        bucket_size = int(sample_rate * self._bucket_ms / 1000)
        if bucket_size <= 0 or len(samples) < bucket_size:
            return np.empty(0, dtype=np.float32)

        # Trim to whole buckets, then vectorize the per-bucket RMS.
        n_buckets = len(samples) // bucket_size
        trimmed = samples[:n_buckets * bucket_size].reshape(n_buckets, bucket_size)
        rms = np.sqrt(np.mean(trimmed ** 2, axis=1)).astype(np.float32)
        return rms
|
||||
@@ -19,3 +19,12 @@ SCENE_THRESHOLD = 0.10 # 0-1, lower = more sensitive; 0.1 catches slide/window
|
||||
|
||||
# Segment recording
|
||||
SEGMENT_DURATION = 60 # seconds per .ts segment
|
||||
|
||||
# Audio extraction
|
||||
AUDIO_EXTRACT_INTERVAL = 3 # seconds between extraction cycles
|
||||
AUDIO_SAFETY_MARGIN = 2 # seconds safety margin (matches scene detector)
|
||||
WAVEFORM_BUCKET_MS = 50 # milliseconds per waveform peak bucket
|
||||
|
||||
# Transcription
|
||||
WHISPER_MODEL = "small" # "small" for speed, "medium" for accuracy
|
||||
WHISPER_DEVICE = "cuda" # "cuda" or "cpu"
|
||||
|
||||
@@ -122,6 +122,35 @@ def extract_scene_frames(input_path, output_dir, scene_threshold=0.10,
|
||||
return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def extract_audio_chunk(input_path, output_path, start_time=0.0, duration=None):
    """Extract audio from recording as 16kHz mono WAV (optimal for Whisper).

    Uses input-level seeking (-ss before -i) for fast keyframe-based seek.
    Returns (stdout, stderr) as decoded strings.
    """
    in_kwargs = {"ss": start_time}
    if duration is not None:
        in_kwargs["t"] = duration
    source = ffmpeg.input(str(input_path), **in_kwargs)
    output = (
        ffmpeg.output(
            source,
            str(output_path),
            acodec="pcm_s16le",
            ac=1,
            ar=16000,
            vn=None,
        )
        .overwrite_output()
        .global_args(*QUIET_ARGS)
    )
    log.info("extract_audio_chunk: %s", " ".join(output.compile()))
    try:
        stdout, stderr = output.run(capture_stdout=True, capture_stderr=True)
    except ffmpeg.Error as e:
        stderr = e.stderr or b""
        log.debug("ffmpeg audio error: %s", stderr.decode("utf-8", errors="replace").strip().split("\n")[-1])
        stdout = e.stdout or b""
    return stdout.decode("utf-8", errors="replace"), stderr.decode("utf-8", errors="replace")
|
||||
|
||||
|
||||
def extract_frame_at(input_path, output_path, timestamp):
|
||||
"""Extract a single frame at the given timestamp."""
|
||||
output = (
|
||||
|
||||
@@ -20,6 +20,8 @@ from cht.config import (
|
||||
RELAY_PORT,
|
||||
SCENE_THRESHOLD,
|
||||
SESSIONS_DIR,
|
||||
AUDIO_EXTRACT_INTERVAL,
|
||||
AUDIO_SAFETY_MARGIN,
|
||||
)
|
||||
from cht.stream import ffmpeg as ff
|
||||
|
||||
@@ -46,6 +48,7 @@ class StreamManager:
|
||||
self.stream_dir = self.session_dir / "stream"
|
||||
self.frames_dir = self.session_dir / "frames"
|
||||
self.transcript_dir = self.session_dir / "transcript"
|
||||
self.audio_dir = self.session_dir / "audio"
|
||||
self.agent_dir = self.session_dir / "agent"
|
||||
|
||||
self._procs = {}
|
||||
@@ -103,7 +106,7 @@ class StreamManager:
|
||||
return total
|
||||
|
||||
def setup_dirs(self):
|
||||
for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.agent_dir):
|
||||
for d in (self.stream_dir, self.frames_dir, self.transcript_dir, self.audio_dir, self.agent_dir):
|
||||
d.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
@property
|
||||
@@ -349,6 +352,77 @@ class StreamManager:
|
||||
|
||||
Thread(target=_capture, daemon=True, name="capture_now").start()
|
||||
|
||||
# -- Audio Extraction --
|
||||
|
||||
def start_audio_extractor(self, on_new_audio=None):
    """Periodically extract audio from the growing recording as WAV chunks.

    Same incremental pattern as scene detector: polls recording, extracts
    new time range, calls back with (wav_path, start_time, duration).

    Args:
        on_new_audio: callback(wav_path, start_time, duration)
    """
    self._on_new_audio = on_new_audio
    self.audio_dir.mkdir(parents=True, exist_ok=True)

    def _extract():
        # Offset (seconds) up to which the current segment has been extracted.
        processed_time = 0.0
        chunk_num = 0
        current_segment = None

        while "stop" not in self._stop_flags:
            time.sleep(AUDIO_EXTRACT_INTERVAL)

            seg = self.recording_path
            if not seg.exists():
                continue

            # Recorder rolled over to a new segment file: restart the
            # extraction bookkeeping from the start of the new file.
            if seg != current_segment:
                current_segment = seg
                processed_time = 0.0
                chunk_num = 0
                log.info("Audio extractor: switched to %s", seg.name)

            # Skip files still too small to contain usable audio.
            if seg.stat().st_size < 100_000:
                continue

            safe_duration = self._estimate_safe_duration()
            if safe_duration is None or safe_duration <= 0:
                continue

            # Stay AUDIO_SAFETY_MARGIN behind the write head so ffmpeg never
            # reads a region the recorder is still writing.
            process_to = safe_duration - AUDIO_SAFETY_MARGIN
            if process_to <= processed_time + 1.0:
                # Less than ~1s of new audio — wait for the next cycle.
                continue

            chunk_duration = process_to - processed_time
            wav_path = self.audio_dir / f"chunk_{chunk_num:04d}.wav"

            try:
                ff.extract_audio_chunk(
                    seg, wav_path,
                    start_time=processed_time,
                    duration=chunk_duration,
                )
            except Exception as e:
                log.error("Audio extraction failed: %s", e)
                continue

            # chunk_num advances only when the chunk produced real data, so
            # an empty/tiny output's filename is reused by the next chunk.
            if wav_path.exists() and wav_path.stat().st_size > 100:
                log.info("Audio chunk: %s (%.1fs → %.1fs)",
                    wav_path.name, processed_time, process_to)
                if self._on_new_audio:
                    self._on_new_audio(wav_path, processed_time, chunk_duration)
                chunk_num += 1

            # Advance even on a bad chunk; that time range is not retried.
            processed_time = process_to

        log.info("Audio extractor stopped")

    t = Thread(target=_extract, daemon=True, name="audio_extractor")
    t.start()
    self._threads["audio_extractor"] = t
|
||||
|
||||
# -- Lifecycle --
|
||||
|
||||
def stop_all(self):
|
||||
|
||||
98
cht/transcriber/engine.py
Normal file
98
cht/transcriber/engine.py
Normal file
@@ -0,0 +1,98 @@
|
||||
"""
Transcription engine using faster-whisper.

Processes WAV chunks incrementally, assigns sequential IDs (T0001, T0002, ...),
and persists to transcript/index.json in the session directory.
"""

import json
import logging
from dataclasses import dataclass, asdict
from pathlib import Path

log = logging.getLogger(__name__)


@dataclass
class TranscriptSegment:
    # One transcribed utterance with timestamps relative to the recording.
    id: str  # "T0001"
    start: float  # seconds into recording
    end: float  # seconds into recording
    text: str  # transcribed text


class TranscriberEngine:
    """Incremental transcription via faster-whisper with GPU acceleration."""

    def __init__(self, model_size="small", device="cuda"):
        self._model = None  # loaded lazily on first transcription
        self._model_size = model_size
        self._device = device
        self._segments: list[TranscriptSegment] = []
        self._next_id = 1  # numeric part of the next "Tnnnn" ID

    def _ensure_model(self):
        """Load the whisper model on first use (heavy dep, imported lazily)."""
        if self._model is not None:
            return
        log.info("Loading whisper model: %s (device=%s)", self._model_size, self._device)
        from faster_whisper import WhisperModel
        self._model = WhisperModel(
            self._model_size,
            # float16 requires CUDA; int8 keeps CPU inference tractable.
            device=self._device,
            compute_type="float16" if self._device == "cuda" else "int8",
        )
        log.info("Whisper model loaded")

    def transcribe_chunk(self, wav_path, time_offset=0.0) -> list[TranscriptSegment]:
        """Transcribe a WAV chunk. Returns new segments with absolute timestamps.

        Args:
            wav_path: audio chunk to transcribe.
            time_offset: chunk start relative to the recording; added to the
                model's chunk-relative timestamps. Returns [] on failure.
        """
        self._ensure_model()
        try:
            segments_iter, _info = self._model.transcribe(
                str(wav_path),
                beam_size=5,
                vad_filter=True,  # skip silence rather than transcribing it
            )
        except Exception as e:
            log.error("Whisper transcription failed: %s", e)
            return []

        new_segments = []
        for seg in segments_iter:
            text = seg.text.strip()
            if not text:
                continue
            tid = f"T{self._next_id:04d}"
            self._next_id += 1
            entry = TranscriptSegment(
                id=tid,
                start=time_offset + seg.start,
                end=time_offset + seg.end,
                text=text,
            )
            self._segments.append(entry)
            new_segments.append(entry)

        return new_segments

    def all_segments(self) -> list[TranscriptSegment]:
        """Return a copy of every segment transcribed or loaded so far."""
        return list(self._segments)

    def save_index(self, path: Path):
        """Persist all segments to *path* as a JSON array of objects."""
        data = [asdict(s) for s in self._segments]
        path.write_text(json.dumps(data, indent=2))

    def load_index(self, path: Path):
        """Restore segments from a JSON index written by save_index.

        On success, ID numbering resumes after the highest loaded ID so newly
        transcribed segments never collide with restored ones. On read/parse
        failure the engine is left unchanged.
        """
        try:
            data = json.loads(path.read_text())
        except Exception as e:
            log.warning("Failed to load transcript index: %s", e)
            return
        self._segments = [TranscriptSegment(**e) for e in data]
        if self._segments:
            # IDs are "Tnnnn": parse the digits after the fixed "T" prefix.
            last_num = max(int(s.id[1:]) for s in self._segments)
            self._next_id = last_num + 1
        log.info("Loaded %d transcript segments", len(self._segments))

    def reset(self):
        """Drop all segments and restart ID numbering at T0001."""
        self._segments.clear()
        self._next_id = 1
||||
107
cht/ui/waveform.py
Normal file
107
cht/ui/waveform.py
Normal file
@@ -0,0 +1,107 @@
|
||||
"""
WaveformWidget: GTK4 DrawingArea that renders waveform peaks with a playhead.

Driven by Timeline "changed" signal — redraws when cursor or duration changes.
Peak data is set externally via set_peaks() from GLib.idle_add.
"""

import logging
import math

import numpy as np

import gi
gi.require_version("Gtk", "4.0")
from gi.repository import Gtk, GLib

log = logging.getLogger(__name__)


class WaveformWidget(Gtk.Box):
    """Waveform display synced to Timeline state."""

    def __init__(self, timeline, **kwargs):
        super().__init__(orientation=Gtk.Orientation.VERTICAL, **kwargs)
        self._timeline = timeline
        self._peaks = None  # RMS peak array, or None until data arrives
        self._bucket_duration = 0.05  # seconds per peak bucket

        label = Gtk.Label(label="Waveform")
        label.add_css_class("heading")
        label.set_margin_top(4)
        label.set_margin_bottom(4)
        self.append(label)

        self._area = Gtk.DrawingArea()
        self._area.set_content_height(250)
        self._area.set_hexpand(True)
        self._area.set_vexpand(True)
        self._area.set_draw_func(self._draw)
        self.append(self._area)

        timeline.connect("changed", self._on_timeline_changed)

    def set_peaks(self, peaks, bucket_duration):
        """Update peak data. Call from GLib.idle_add."""
        self._peaks = peaks
        self._bucket_duration = bucket_duration
        self._area.queue_draw()

    def _on_timeline_changed(self, timeline):
        # Any cursor/duration/marker change invalidates the drawing.
        self._area.queue_draw()

    def _draw(self, area, cr, width, height):
        """Cairo draw callback: background, peaks, scene markers, playhead."""
        # Background
        cr.set_source_rgb(0.1, 0.1, 0.12)
        cr.rectangle(0, 0, width, height)
        cr.fill()

        state = self._timeline.state
        duration = state.duration
        mid_y = height / 2

        # Center line
        cr.set_source_rgba(0.3, 0.3, 0.35, 1.0)
        cr.set_line_width(1)
        cr.move_to(0, mid_y)
        cr.line_to(width, mid_y)
        cr.stroke()

        # Draw peaks, normalized so the loudest bucket fills the display
        if self._peaks is not None and len(self._peaks) > 0 and duration > 0:
            n_peaks = len(self._peaks)
            # Hoist the max out of the per-pixel loop (was evaluated twice
            # per draw before) and guard against an all-zero array.
            max_peak = float(np.max(self._peaks))
            if max_peak <= 0:
                max_peak = 1.0

            for x in range(width):
                # Time at this pixel
                t = (x / width) * duration
                # Corresponding peak index
                idx = int(t / self._bucket_duration)
                if 0 <= idx < n_peaks:
                    val = self._peaks[idx] / max_peak
                    bar_h = val * (height * 0.45)  # 90% of half-height
                    # Green gradient based on amplitude
                    cr.set_source_rgba(0.2, 0.6 + val * 0.4, 0.3, 0.85)
                    cr.rectangle(x, mid_y - bar_h, 1, bar_h * 2)
                    cr.fill()

        # Scene markers
        if duration > 0:
            cr.set_source_rgba(1.0, 1.0, 0.3, 0.3)
            cr.set_line_width(1)
            for marker in state.scene_markers:
                mx = (marker / duration) * width
                cr.move_to(mx, 0)
                cr.line_to(mx, height)
                cr.stroke()

        # Playhead
        if duration > 0:
            px = (state.cursor / duration) * width
            cr.set_source_rgba(1.0, 0.3, 0.3, 0.9)
            cr.set_line_width(2)
            cr.move_to(px, 0)
            cr.line_to(px, height)
            cr.stroke()
||||
@@ -10,9 +10,14 @@ gi.require_version("Adw", "1")
|
||||
gi.require_version("GdkPixbuf", "2.0")
|
||||
from gi.repository import Gtk, Gdk, Adw, GLib, Pango, GdkPixbuf
|
||||
|
||||
from threading import Thread
|
||||
|
||||
from cht.config import APP_NAME, SCENE_THRESHOLD
|
||||
from cht.ui.timeline import Timeline, TimelineControls
|
||||
from cht.ui.monitor import MonitorWidget
|
||||
from cht.ui.waveform import WaveformWidget
|
||||
from cht.audio.waveform import WaveformEngine
|
||||
from cht.transcriber.engine import TranscriberEngine
|
||||
from cht.stream.manager import StreamManager, list_sessions
|
||||
from cht.stream.tracker import RecordingTracker
|
||||
from cht.agent.runner import AgentRunner, ACTIONS, check_claude_cli
|
||||
@@ -37,6 +42,8 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
# Timeline is the central state machine
|
||||
self._timeline = Timeline()
|
||||
self._agent = AgentRunner()
|
||||
self._waveform_engine = WaveformEngine()
|
||||
self._transcriber = TranscriberEngine()
|
||||
|
||||
# Main layout
|
||||
self._main_paned = Gtk.Paned(orientation=Gtk.Orientation.HORIZONTAL)
|
||||
@@ -165,6 +172,34 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
# Load existing frames into the strip
|
||||
self._load_existing_frames()
|
||||
|
||||
# Load existing transcript
|
||||
transcript_index = self._stream_mgr.transcript_dir / "index.json"
|
||||
if transcript_index.exists():
|
||||
self._transcriber.load_index(transcript_index)
|
||||
segs = self._transcriber.all_segments()
|
||||
if segs:
|
||||
self._append_transcript_segments(segs)
|
||||
self._append_agent_output(f" Loaded {len(segs)} transcript segments.\n")
|
||||
|
||||
# Compute waveform from existing recordings (background thread)
|
||||
if segments:
|
||||
from cht.stream import ffmpeg as ff
|
||||
|
||||
def _compute_waveform():
|
||||
audio_dir = self._stream_mgr.audio_dir
|
||||
audio_dir.mkdir(parents=True, exist_ok=True)
|
||||
full_wav = audio_dir / "full.wav"
|
||||
try:
|
||||
ff.extract_audio_chunk(segments[0], full_wav)
|
||||
self._waveform_engine.compute_full(full_wav)
|
||||
peaks = self._waveform_engine.peaks
|
||||
bucket_dur = self._waveform_engine.bucket_duration
|
||||
GLib.idle_add(self._waveform_widget.set_peaks, peaks.copy(), bucket_dur)
|
||||
except Exception as e:
|
||||
log.error("Waveform computation failed: %s", e)
|
||||
|
||||
Thread(target=_compute_waveform, daemon=True, name="waveform_load").start()
|
||||
|
||||
# Set up agent auth/model if not already done
|
||||
self._populate_model_dropdown()
|
||||
|
||||
@@ -197,6 +232,9 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
# Start scene detection
|
||||
self._stream_mgr.start_scene_detector(on_new_frames=self._on_new_scene_frames)
|
||||
|
||||
# Start audio extraction (waveform + transcription)
|
||||
self._stream_mgr.start_audio_extractor(on_new_audio=self._on_new_audio)
|
||||
|
||||
# Start polling for frame thumbnails
|
||||
GLib.timeout_add(1000, self._poll_frames)
|
||||
|
||||
@@ -237,6 +275,26 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
for f in frames:
|
||||
GLib.idle_add(self._timeline.add_scene_marker, f["timestamp"])
|
||||
|
||||
def _on_new_audio(self, wav_path, start_time, duration):
    """Called from audio extractor thread with new WAV chunk."""
    # Compute waveform peaks (fast, ~1ms)
    self._waveform_engine.append_chunk(wav_path, start_time)
    peaks = self._waveform_engine.peaks
    bucket_dur = self._waveform_engine.bucket_duration
    # Copy hands the GTK main loop a stable snapshot of the growing array;
    # UI mutations must go through GLib.idle_add, not this worker thread.
    GLib.idle_add(self._waveform_widget.set_peaks, peaks.copy(), bucket_dur)

    # Transcribe in separate thread (GPU-bound, ~1-2s per chunk)
    def _transcribe():
        new_segs = self._transcriber.transcribe_chunk(wav_path, time_offset=start_time)
        # Persist the full index after every chunk so a crash loses at most
        # one chunk's worth of transcript.
        if self._stream_mgr:
            self._transcriber.save_index(
                self._stream_mgr.transcript_dir / "index.json"
            )
        if new_segs:
            # Transcript panel update also goes via the GTK main loop.
            GLib.idle_add(self._append_transcript_segments, new_segs)

    Thread(target=_transcribe, daemon=True, name="transcriber").start()
|
||||
|
||||
def _check_recorder(self):
|
||||
"""Watchdog: restart recorder if it died (sender disconnect, etc)."""
|
||||
if not self._streaming or not self._stream_mgr:
|
||||
@@ -257,6 +315,10 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
log.info("Stopping stream...")
|
||||
self._timeline.reset()
|
||||
self._monitor.stop()
|
||||
self._waveform_engine.reset()
|
||||
self._waveform_widget.set_peaks(None, 0.05)
|
||||
self._transcriber.reset()
|
||||
self._transcript_view.get_buffer().set_text("")
|
||||
if self._tracker:
|
||||
self._tracker.stop()
|
||||
self._tracker = None
|
||||
@@ -298,8 +360,10 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
stream_frame.set_child(self._monitor)
|
||||
top_paned.set_start_child(stream_frame)
|
||||
|
||||
self._waveform_area = self._build_placeholder("Waveform", height=250, width=200)
|
||||
top_paned.set_end_child(self._waveform_area)
|
||||
self._waveform_widget = WaveformWidget(self._timeline)
|
||||
waveform_frame = Gtk.Frame()
|
||||
waveform_frame.set_child(self._waveform_widget)
|
||||
top_paned.set_end_child(waveform_frame)
|
||||
top_paned.set_position(650)
|
||||
right_box.append(top_paned)
|
||||
|
||||
@@ -819,6 +883,16 @@ class ChtWindow(Adw.ApplicationWindow):
|
||||
# Auto-scroll to bottom
|
||||
self._agent_output_view.scroll_to_iter(buf.get_end_iter(), 0, False, 0, 0)
|
||||
|
||||
def _append_transcript_segments(self, segments):
    """Render newly transcribed segments into the transcript text panel."""
    buffer = self._transcript_view.get_buffer()
    for segment in segments:
        start_m, start_s = divmod(int(segment.start), 60)
        end_m, end_s = divmod(int(segment.end), 60)
        stamp = f"[{start_m:02d}:{start_s:02d}-{end_m:02d}:{end_s:02d}]"
        buffer.insert(buffer.get_end_iter(), f"{stamp} {segment.id} {segment.text}\n")
    # Keep the newest line visible.
    self._transcript_view.scroll_to_iter(buffer.get_end_iter(), 0, False, 0, 0)
|
||||
|
||||
# -- Frame thumbnails --
|
||||
|
||||
def _load_existing_frames(self):
|
||||
|
||||
@@ -15,6 +15,7 @@ dependencies = [
|
||||
"claude-agent-sdk",
|
||||
"openai",
|
||||
"pygments",
|
||||
"faster-whisper",
|
||||
]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
|
||||
Reference in New Issue
Block a user