audio and transcript

This commit is contained in:
2026-04-02 22:57:21 -03:00
parent 0b5575f3b3
commit d61e2a5492
13 changed files with 556 additions and 11 deletions

View File

@@ -18,12 +18,22 @@ class FrameRef:
timestamp: float # seconds into recording
@dataclass
class TranscriptRef:
    """A single transcript segment: an ID, a time span, and its text."""

    # Segment identifier, e.g. "T0001".
    id: str
    # Where the segment begins, in seconds into the recording.
    start: float
    # Where the segment ends, in seconds into the recording.
    end: float
    # Text of the segment.
    text: str
@dataclass
class SessionContext:
    """Session state passed to an agent provider alongside a user message.

    The three list fields with defaults are independent per instance
    (``default_factory=list``), so they can be omitted by callers that have
    no mentions or no transcript.
    """

    session_dir: Path
    frames: list[FrameRef]  # all captured frames so far
    duration: float  # current recording duration (seconds)
    # Frames @-referenced in the message. Declared exactly once: a repeated
    # declaration of the same field only shadows the earlier one.
    mentioned_frames: list[FrameRef] = field(default_factory=list)
    # Transcript segments available for the session.
    transcript_segments: list[TranscriptRef] = field(default_factory=list)
    # Transcript segments @-referenced in the message.
    mentioned_transcripts: list[TranscriptRef] = field(default_factory=list)
class AgentProvider(ABC):

View File

@@ -47,6 +47,21 @@ def _build_prompt(message: str, context: SessionContext) -> str:
fm, fs = divmod(int(f.timestamp), 60)
lines.append(f" {f.id} at {fm:02d}:{fs:02d}{f.path}")
# Transcript
if context.transcript_segments:
lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
for t in context.transcript_segments:
tm1, ts1 = divmod(int(t.start), 60)
tm2, ts2 = divmod(int(t.end), 60)
lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
if context.mentioned_transcripts:
lines.append("\nTranscript segments referenced in this message:")
for t in context.mentioned_transcripts:
tm1, ts1 = divmod(int(t.start), 60)
tm2, ts2 = divmod(int(t.end), 60)
lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
lines.append(f"\nUser message: {message}")
return "\n".join(lines)

View File

@@ -95,10 +95,17 @@ class OpenAICompatProvider(AgentProvider):
# Build context header
m, s = divmod(int(context.duration), 60)
ctx_text = (
f"Recording duration: {m:02d}:{s:02d}\n"
f"Total frames: {len(context.frames)}\n"
)
ctx_lines = [
f"Recording duration: {m:02d}:{s:02d}",
f"Total frames: {len(context.frames)}",
]
if context.transcript_segments:
ctx_lines.append(f"\nTranscript ({len(context.transcript_segments)} segments):")
for t in context.transcript_segments:
tm1, ts1 = divmod(int(t.start), 60)
tm2, ts2 = divmod(int(t.end), 60)
ctx_lines.append(f" {t.id} [{tm1:02d}:{ts1:02d}-{tm2:02d}:{ts2:02d}] {t.text}")
ctx_text = "\n".join(ctx_lines) + "\n"
frames_to_send = context.mentioned_frames

View File

@@ -15,7 +15,7 @@ from pathlib import Path
from threading import Thread
from typing import Callable
from cht.agent.base import AgentProvider, FrameRef, SessionContext
from cht.agent.base import AgentProvider, FrameRef, TranscriptRef, SessionContext
log = logging.getLogger(__name__)
@@ -98,6 +98,33 @@ def _load_frames(frames_dir: Path) -> list[FrameRef]:
return []
def _load_transcript(transcript_dir: Path) -> list[TranscriptRef]:
index_path = transcript_dir / "index.json"
if not index_path.exists():
return []
try:
entries = json.loads(index_path.read_text())
return [TranscriptRef(**e) for e in entries]
except Exception as e:
log.warning("Could not load transcript index: %s", e)
return []
def _parse_transcript_mentions(message: str, segments: list[TranscriptRef]) -> list[TranscriptRef]:
"""Extract @T references from message. Accepts @T0001, @t1, @T1."""
mentioned = []
seen = set()
for match in re.finditer(r"@[Tt](\d+)", message):
num = int(match.group(1))
tid = f"T{num:04d}"
if tid not in seen:
seg = next((s for s in segments if s.id == tid), None)
if seg:
mentioned.append(seg)
seen.add(tid)
return mentioned
class AgentRunner:
"""Runs agent queries in a background thread, streams chunks to a callback."""
@@ -152,12 +179,16 @@ class AgentRunner:
try:
provider = self._get_provider()
frames = _load_frames(stream_mgr.frames_dir)
mentioned = _parse_mentions(message, frames)
mentioned_frames = _parse_mentions(message, frames)
transcript = _load_transcript(stream_mgr.transcript_dir)
mentioned_transcripts = _parse_transcript_mentions(message, transcript)
context = SessionContext(
session_dir=stream_mgr.session_dir,
frames=frames,
duration=tracker.duration if tracker else 0.0,
mentioned_frames=mentioned,
mentioned_frames=mentioned_frames,
transcript_segments=transcript,
mentioned_transcripts=mentioned_transcripts,
)
for chunk in provider.stream(message, context):
on_chunk(chunk)