diff --git a/README.md b/README.md new file mode 100644 index 0000000..aedbe3e --- /dev/null +++ b/README.md @@ -0,0 +1,11 @@ +# Mitus + +Meeting stream viewer with an embedded AI agent. Captures screen and audio from a Wayland source machine, streams it over TCP to a receiver with GPU-accelerated decode, and runs a Claude Code agent that watches the feed autonomously — transcribing audio, extracting frames on scene changes, and acting on user-defined rules. The agent panel shows a live log of what it observes and the actions it takes, while a thumbnail grid gives a visual timeline of the session. Primary use case: staying present in meetings without manually feeding context to Claude. Provides summarization after the fact. + +## Docs + +``` +cd docs && python3 -m http.server 8000 +``` + +Then open http://localhost:8000. diff --git a/cht/config.py b/cht/config.py index 867b2a6..be93a21 100644 --- a/cht/config.py +++ b/cht/config.py @@ -35,3 +35,16 @@ TRANSCRIBE_LINES_PER_GROUP = 3 # whisper segments grouped per transcript ID (1-5 # Agent settings AGENT_PERMISSION_MODE = "bypassPermissions" # default|acceptEdits|plan|bypassPermissions|dontAsk AGENT_MAX_TURNS = 5 + +# Offline summarization (post-session diarization + export) +# whisperx lives in its own venv to avoid dep clashes with cht's faster_whisper. +# Defaults mirror transcribe_oneoff.sh: large-v3 + int8 fits in ~3-4 GB VRAM and +# is less hallucination-prone than medium on long meetings. 
+WHISPERX_BIN = os.environ.get("CHT_WHISPERX_BIN", "/home/mariano/wdir/venv/def/bin/whisperx") +WHISPERX_MODEL = os.environ.get("CHT_WHISPERX_MODEL", "large-v3") +WHISPERX_DEVICE = os.environ.get("CHT_WHISPERX_DEVICE", "cuda") +WHISPERX_COMPUTE_TYPE = os.environ.get("CHT_WHISPERX_COMPUTE_TYPE", "int8") +WHISPERX_BATCH_SIZE = int(os.environ.get("CHT_WHISPERX_BATCH_SIZE", "4")) +HF_TOKEN = os.environ.get("HF_TOKEN") # required for pyannote diarization +DEFAULT_PARTICIPANTS = 2 +WHISPERX_LD_LIBRARY_PATH = os.environ.get("CHT_WHISPERX_LD_LIBRARY_PATH") # cuDNN override diff --git a/cht/summary/__init__.py b/cht/summary/__init__.py new file mode 100644 index 0000000..bc4062e --- /dev/null +++ b/cht/summary/__init__.py @@ -0,0 +1,5 @@ +"""Post-session summarization pipeline. + +Offline diarization (whisperx) + transcript/frame merger producing a clean +LLM-ready `_enhanced.txt`. +""" diff --git a/cht/summary/audio.py b/cht/summary/audio.py new file mode 100644 index 0000000..9933809 --- /dev/null +++ b/cht/summary/audio.py @@ -0,0 +1,98 @@ +"""Assemble a single WAV file covering the entire session audio. + +Prefers the recording source (fMP4 or raw AAC) over the live-extracted +WAV chunks: a single decode pass gives whisperx contiguous audio with no +chunk-boundary artifacts. Chunks are a fallback when the recording source +is missing. +""" + +import logging +import tempfile +from pathlib import Path + +import ffmpeg + +from cht.stream import ffmpeg as ff + +log = logging.getLogger(__name__) + + +def assemble_session_wav(session_dir: Path, *, force: bool = False) -> Path: + """Build `summary/full.wav` covering the whole session audio. + + Returns the cached path if already present and `force` is False. + Raises FileNotFoundError if no usable audio source exists. 
+ """ + summary_dir = session_dir / "summary" + summary_dir.mkdir(parents=True, exist_ok=True) + out = summary_dir / "full.wav" + if out.exists() and not force: + log.info("assemble_session_wav: cached %s", out) + return out + + stream_dir = session_dir / "stream" + + # 1. Rust transport: standalone audio.aac. + aac = stream_dir / "audio.aac" + if aac.exists() and aac.stat().st_size > 100: + ff.extract_audio_chunk(aac, out) + log.info("assemble_session_wav: from audio.aac → %s", out) + return out + + # 2. fMP4 segments (Python transport). Single segment is the common case. + segments = sorted(stream_dir.glob("recording_*.mp4")) if stream_dir.exists() else [] + if len(segments) == 1: + ff.extract_audio_chunk(segments[0], out) + log.info("assemble_session_wav: from %s → %s", segments[0].name, out) + return out + if len(segments) > 1: + _concat_segments_audio(segments, out) + log.info("assemble_session_wav: concatenated %d segments → %s", len(segments), out) + return out + + # 3. Fallback: concat the live audio chunks. Last resort — chunk seams may + # introduce minor artifacts; whisperx still works but precision can suffer. 
+ audio_dir = session_dir / "audio" + chunks = sorted(audio_dir.glob("chunk_*.wav")) if audio_dir.exists() else [] + if chunks: + log.warning("assemble_session_wav: no recording source, falling back to %d chunks", len(chunks)) + _concat_chunks(chunks, out) + return out + + raise FileNotFoundError(f"No audio source found in {session_dir}") + + +def _concat_segments_audio(segments: list[Path], out: Path) -> None: + """Decode + concatenate audio tracks from multiple fMP4 segments into 16kHz mono WAV.""" + inputs = [ffmpeg.input(str(p)) for p in segments] + audio_streams = [s.audio for s in inputs] + node = ( + ffmpeg.concat(*audio_streams, v=0, a=1) + .output(str(out), acodec="pcm_s16le", ac=1, ar=16000) + .overwrite_output() + .global_args("-hide_banner", "-loglevel", "warning") + ) + log.info("concat_segments_audio: %s", " ".join(node.compile())) + node.run(capture_stdout=True, capture_stderr=True) + + +def _concat_chunks(chunks: list[Path], out: Path) -> None: + """Concat already-PCM 16kHz mono WAV files via the concat demuxer (no re-decode).""" + with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f: + listfile = Path(f.name) + for c in chunks: + f.write(f"file '{c.resolve()}'\n") + try: + node = ( + ffmpeg.input(str(listfile), format="concat", safe=0) + .output(str(out), c="copy") + .overwrite_output() + .global_args("-hide_banner", "-loglevel", "warning") + ) + log.info("concat_chunks: %s", " ".join(node.compile())) + node.run(capture_stdout=True, capture_stderr=True) + finally: + try: + listfile.unlink() + except OSError: + pass diff --git a/cht/summary/diarize.py b/cht/summary/diarize.py new file mode 100644 index 0000000..fd913e9 --- /dev/null +++ b/cht/summary/diarize.py @@ -0,0 +1,102 @@ +"""WhisperX subprocess wrapper for offline diarized transcription. + +Runs whisperx CLI on a full-session WAV file, with min/max speakers pinned +to the user-provided count. Streams stderr to a progress callback. Loads the +resulting JSON and returns it. 
+""" + +import json +import logging +import os +import subprocess +import threading +from pathlib import Path + +from cht import config + +log = logging.getLogger(__name__) + + +def _cudnn_lib_for(whisperx_bin: str) -> str | None: + """Find nvidia/cudnn/lib inside the venv that owns *whisperx_bin*. + + whisperx ships with `nvidia-cudnn-cu12`; the runtime needs the .so files + on LD_LIBRARY_PATH or it dies with a missing-symbol error. + """ + bin_path = Path(whisperx_bin).resolve() + venv_root = bin_path.parent.parent # .../venv/def + if not venv_root.exists(): + return None + matches = list(venv_root.glob("lib/python*/site-packages/nvidia/cudnn/lib")) + return str(matches[0]) if matches else None + + +def run_whisperx( + wav_path: Path, + output_dir: Path, + *, + num_speakers: int, + on_progress=None, +) -> dict: + """Run whisperx diarization on `wav_path`. Returns parsed JSON. + + Writes whisperx outputs into `output_dir`. Caller is responsible for + persisting the relevant artifact elsewhere if desired. + """ + if not config.HF_TOKEN: + raise RuntimeError( + "HF_TOKEN environment variable is required for whisperx diarization." 
+ ) + output_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + config.WHISPERX_BIN, + str(wav_path), + "--model", config.WHISPERX_MODEL, + "--device", config.WHISPERX_DEVICE, + "--compute_type", config.WHISPERX_COMPUTE_TYPE, + "--diarize", + "--min_speakers", str(num_speakers), + "--max_speakers", str(num_speakers), + "--hf_token", config.HF_TOKEN, + "--output_format", "json", + "--output_dir", str(output_dir), + ] + + env = os.environ.copy() + cudnn_path = config.WHISPERX_LD_LIBRARY_PATH or _cudnn_lib_for(config.WHISPERX_BIN) + if cudnn_path: + env["LD_LIBRARY_PATH"] = cudnn_path + os.pathsep + env.get("LD_LIBRARY_PATH", "") + + log.info("whisperx: %s", " ".join(c for c in cmd if c != config.HF_TOKEN)) + if on_progress: + on_progress("whisperx: starting", None) + + proc = subprocess.Popen( + cmd, env=env, + stdout=subprocess.PIPE, stderr=subprocess.STDOUT, + text=True, bufsize=1, + ) + + # Drain stderr/stdout combined; report progress lines. + def _drain(): + for line in proc.stdout: + line = line.rstrip() + if not line: + continue + log.debug("[whisperx] %s", line) + if on_progress: + on_progress(line, None) + + t = threading.Thread(target=_drain, daemon=True, name="whisperx_drain") + t.start() + proc.wait() + t.join(timeout=2) + + if proc.returncode != 0: + raise RuntimeError(f"whisperx exited with status {proc.returncode}") + + out_json = output_dir / f"{wav_path.stem}.json" + if not out_json.exists(): + raise RuntimeError(f"whisperx finished but {out_json.name} not found") + return json.loads(out_json.read_text()) diff --git a/cht/summary/merger.py b/cht/summary/merger.py new file mode 100644 index 0000000..23a7762 --- /dev/null +++ b/cht/summary/merger.py @@ -0,0 +1,88 @@ +"""Interleave diarized audio segments with selected screen frames by timestamp. + +Direct port of mts/meetus/transcript_merger.py:merge_transcripts (line 162). 
+""" + +import logging + +log = logging.getLogger(__name__) + + +def merge(audio_segments: list[dict], frame_segments: list[dict], + name_map: dict[str, str] | None = None) -> list[dict]: + """Combine and group by speaker; screen frames break speaker groups. + + `audio_segments`: each {timestamp, text, speaker?}. + `frame_segments`: each {timestamp, frame_path}. + `name_map`: optional SPEAKER_xx → real name remap, applied to outputs. + Returns merged list sorted by timestamp. + """ + name_map = name_map or {} + + audio = [{**s, "type": "audio"} for s in audio_segments] + screen = [{**s, "type": "screen"} for s in frame_segments] + all_segs = sorted(audio + screen, key=lambda x: x["timestamp"]) + + grouped: list[dict] = [] + current = None + + def _label(speaker): + if not speaker: + return None + return name_map.get(speaker, speaker) + + for seg in all_segs: + if seg["type"] == "screen": + if current is not None: + grouped.append(current) + current = None + grouped.append(seg) + continue + + speaker = _label(seg.get("speaker")) + if current is None: + current = { + "timestamp": seg["timestamp"], + "text": seg["text"], + "speaker": speaker, + "type": "audio", + } + elif speaker == current.get("speaker"): + current["text"] += " " + seg["text"] + else: + grouped.append(current) + current = { + "timestamp": seg["timestamp"], + "text": seg["text"], + "speaker": speaker, + "type": "audio", + } + + if current is not None: + grouped.append(current) + return grouped + + +def whisperx_to_audio_segments(diarized: dict) -> list[dict]: + """Convert whisperx JSON segments to the merger's audio format.""" + out = [] + for seg in diarized.get("segments", []): + text = (seg.get("text") or "").strip() + if not text: + continue + out.append({ + "timestamp": float(seg.get("start", 0.0)), + "text": text, + "speaker": seg.get("speaker"), + }) + return out + + +def collect_speakers(diarized: dict) -> list[str]: + """Distinct SPEAKER_xx labels found in the diarization, sorted.""" + seen = 
set() + for seg in diarized.get("segments", []): + sp = seg.get("speaker") + if sp: + seen.add(sp) + return sorted(seen) diff --git a/cht/summary/output.py b/cht/summary/output.py new file mode 100644 index 0000000..fef6fe3 --- /dev/null +++ b/cht/summary/output.py @@ -0,0 +1,75 @@ +"""Format merged segments as a sequential LLM-ready transcript. + +Direct port of mts/meetus/transcript_merger.py:_format_detailed (line 249). +""" + +import json +import logging +from pathlib import Path + +log = logging.getLogger(__name__) + + +def format_detailed(merged_segments: list[dict], *, frames_relative_to: Path | None = None) -> str: + """Render the interleaved transcript. + + If `frames_relative_to` is given, frame paths are rewritten relative to it. + """ + lines = [] + lines.append("=" * 80) + lines.append("ENHANCED MEETING TRANSCRIPT") + lines.append("Audio transcript + Screen frames") + lines.append("=" * 80) + lines.append("") + + for seg in merged_segments: + ts = _format_timestamp(seg["timestamp"]) + if seg["type"] == "audio": + speaker = seg.get("speaker") or "SPEAKER" + lines.append(f"[{ts}] {speaker}:") + lines.append(f" {seg['text']}") + lines.append("") + else: + lines.append(f"[{ts}] SCREEN CONTENT:") + fp = seg.get("frame_path") + if fp: + if frames_relative_to is not None: + try: + fp = str(Path(fp).resolve().relative_to(frames_relative_to.resolve())) + except ValueError: + fp = str(fp) + else: + fp = str(fp) + lines.append(f" Frame: {fp}") + lines.append("") + + return "\n".join(lines) + + +def _format_timestamp(seconds: float) -> str: + seconds = int(seconds) + h, rem = divmod(seconds, 3600) + m, s = divmod(rem, 60) + if h: + return f"{h:02d}:{m:02d}:{s:02d}" + return f"{m:02d}:{s:02d}" + + +def write_outputs(session_dir: Path, merged: list[dict], *, name: str | None = None) -> Path: + """Write `_enhanced.txt` and `merged.json` under `session_dir/summary`. + + Returns the path of the enhanced transcript. 
+ """ + summary_dir = session_dir / "summary" + summary_dir.mkdir(parents=True, exist_ok=True) + name = name or session_dir.name + + text = format_detailed(merged, frames_relative_to=session_dir) + text_path = summary_dir / f"{name}_enhanced.txt" + text_path.write_text(text) + + merged_path = summary_dir / "merged.json" + merged_path.write_text(json.dumps(merged, indent=2, default=str)) + + log.info("Wrote %s (%d entries)", text_path, len(merged)) + return text_path diff --git a/cht/summary/pipeline.py b/cht/summary/pipeline.py new file mode 100644 index 0000000..0c387e1 --- /dev/null +++ b/cht/summary/pipeline.py @@ -0,0 +1,72 @@ +"""End-to-end orchestrator for the summarization export. + +Two operations: + diarize(...) — heavy: assembles audio, runs whisperx, caches diarized.json. + export(...) — cheap: merges cached diarization with selected frames and + writes _enhanced.txt. Re-run any time the user + tweaks frame selection or speaker names. +""" + +import json +import logging +from pathlib import Path + +from cht.session import load_frame_index +from cht.summary import audio, diarize, merger, output + +log = logging.getLogger(__name__) + + +def diarized_path(session_dir: Path) -> Path: + return session_dir / "summary" / "diarized.json" + + +def has_diarization(session_dir: Path) -> bool: + return diarized_path(session_dir).exists() + + +def load_diarization(session_dir: Path) -> dict: + return json.loads(diarized_path(session_dir).read_text()) + + +def run_diarization(session_dir: Path, *, num_speakers: int, on_progress=None) -> dict: + """Assemble audio, run whisperx, cache and return the JSON.""" + if on_progress: + on_progress("assembling audio", 0.05) + wav = audio.assemble_session_wav(session_dir) + + if on_progress: + on_progress("running whisperx", 0.15) + summary_dir = session_dir / "summary" + diarized = diarize.run_whisperx( + wav, summary_dir, + num_speakers=num_speakers, + on_progress=lambda line, _frac: on_progress(line, None) if on_progress else 
None, + ) + + diarized_path(session_dir).write_text(json.dumps(diarized, indent=2)) + if on_progress: + on_progress("diarization done", 1.0) + return diarized + + +def export(session_dir: Path, + *, + selected_frame_ids: set[str] | None = None, + name_map: dict[str, str] | None = None) -> Path: + """Merge cached diarization + selected frames and write enhanced.txt.""" + if not has_diarization(session_dir): + raise RuntimeError("No diarization available — run diarization first.") + diarized = load_diarization(session_dir) + audio_segs = merger.whisperx_to_audio_segments(diarized) + + frames = load_frame_index(session_dir / "frames") + if selected_frame_ids is not None: + frames = [f for f in frames if f["id"] in selected_frame_ids] + frame_segs = [ + {"timestamp": f["timestamp"], "frame_path": str(f["path"])} + for f in frames + ] + + merged = merger.merge(audio_segs, frame_segs, name_map=name_map) + return output.write_outputs(session_dir, merged) diff --git a/cht/ui/summary_panel.py b/cht/ui/summary_panel.py new file mode 100644 index 0000000..d859d98 --- /dev/null +++ b/cht/ui/summary_panel.py @@ -0,0 +1,420 @@ +"""SummaryPanel: post-session export UI. + +Sits in a ViewStack page next to the live panels and is bound to a session +directory after disconnect / readonly load. Lets the user: + + 1. Set participant count + run whisperx diarization (heavy, threaded). + 2. Rename SPEAKER_xx to real names. + 3. Curate which captured frames go into the enhanced transcript. + 4. Export `_enhanced.txt` (cheap; re-runnable on rename / reselection). 
+""" + +import logging +from pathlib import Path +from threading import Thread + +import gi +gi.require_version("Gtk", "4.0") +gi.require_version("Adw", "1") +gi.require_version("GdkPixbuf", "2.0") +from gi.repository import Gtk, Gdk, GLib, Gio, Adw, Pango, GdkPixbuf, GObject + +from cht.config import DEFAULT_PARTICIPANTS +from cht.session import load_frame_index +from cht.summary import merger as summary_merger, pipeline as summary_pipeline + +log = logging.getLogger(__name__) + + +class SummaryPanel(Gtk.Box): + """Post-session export controls.""" + + __gsignals__ = { + "status-changed": (GObject.SignalFlags.RUN_FIRST, None, (str,)), + } + + def __init__(self, **kwargs): + super().__init__(orientation=Gtk.Orientation.VERTICAL, spacing=8, **kwargs) + self.set_margin_top(8) + self.set_margin_bottom(8) + self.set_margin_start(8) + self.set_margin_end(8) + + self._session_dir: Path | None = None + self._diarized: dict | None = None + self._speaker_entries: dict[str, Gtk.Entry] = {} + self._frame_checks: dict[str, Gtk.CheckButton] = {} + self._busy = False + + self._build_ui() + self._set_enabled(False) + + # -- UI construction -- + + def _build_ui(self): + # Header + title = Gtk.Label(label="Session export") + title.add_css_class("title-3") + title.set_halign(Gtk.Align.START) + self.append(title) + + self._session_label = Gtk.Label(label="No session bound") + self._session_label.add_css_class("dim-label") + self._session_label.set_halign(Gtk.Align.START) + self.append(self._session_label) + + # --- Diarization box --- + diar_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=4) + diar_box.add_css_class("card") + diar_box.set_margin_top(4) + + diar_header = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=8) + diar_header.set_margin_top(8) + diar_header.set_margin_bottom(4) + diar_header.set_margin_start(8) + diar_header.set_margin_end(8) + + diar_header.append(Gtk.Label(label="Participants:")) + self._participants_spin = 
Gtk.SpinButton.new_with_range(1, 10, 1) + self._participants_spin.set_value(DEFAULT_PARTICIPANTS) + diar_header.append(self._participants_spin) + + self._run_btn = Gtk.Button(label="Run diarization") + self._run_btn.add_css_class("suggested-action") + self._run_btn.connect("clicked", self._on_run_diarize) + diar_header.append(self._run_btn) + + diar_box.append(diar_header) + + self._progress = Gtk.ProgressBar() + self._progress.set_show_text(True) + self._progress.set_text("") + self._progress.set_margin_start(8) + self._progress.set_margin_end(8) + diar_box.append(self._progress) + + self._status_label = Gtk.Label(label="") + self._status_label.set_halign(Gtk.Align.START) + self._status_label.set_wrap(True) + self._status_label.set_ellipsize(Pango.EllipsizeMode.END) + self._status_label.add_css_class("caption") + self._status_label.set_margin_start(8) + self._status_label.set_margin_end(8) + self._status_label.set_margin_bottom(8) + diar_box.append(self._status_label) + + self.append(diar_box) + + # --- Speakers + frames in a paned area for vertical scrolling --- + self._speakers_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=4) + self._speakers_box.add_css_class("card") + self._speakers_box.set_margin_top(4) + speakers_header = Gtk.Label(label="Speaker names") + speakers_header.add_css_class("heading") + speakers_header.set_halign(Gtk.Align.START) + speakers_header.set_margin_top(8) + speakers_header.set_margin_start(8) + self._speakers_box.append(speakers_header) + self._speakers_list = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=4) + self._speakers_list.set_margin_start(8) + self._speakers_list.set_margin_end(8) + self._speakers_list.set_margin_bottom(8) + self._speakers_box.append(self._speakers_list) + self.append(self._speakers_box) + self._speakers_box.set_visible(False) + + # --- Frame picker --- + frames_card = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=4) + frames_card.add_css_class("card") + 
frames_card.set_margin_top(4) + frames_card.set_vexpand(True) + + frames_header = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=8) + frames_header.set_margin_top(8) + frames_header.set_margin_start(8) + frames_header.set_margin_end(8) + frames_title = Gtk.Label(label="Frames") + frames_title.add_css_class("heading") + frames_header.append(frames_title) + + self._frames_summary = Gtk.Label(label="") + self._frames_summary.add_css_class("dim-label") + frames_header.append(self._frames_summary) + + spacer = Gtk.Box() + spacer.set_hexpand(True) + frames_header.append(spacer) + + select_all_btn = Gtk.Button(label="Select all") + select_all_btn.add_css_class("flat") + select_all_btn.connect("clicked", lambda b: self._toggle_all_frames(True)) + frames_header.append(select_all_btn) + + deselect_all_btn = Gtk.Button(label="Deselect all") + deselect_all_btn.add_css_class("flat") + deselect_all_btn.connect("clicked", lambda b: self._toggle_all_frames(False)) + frames_header.append(deselect_all_btn) + + frames_card.append(frames_header) + + scroll = Gtk.ScrolledWindow() + scroll.set_policy(Gtk.PolicyType.NEVER, Gtk.PolicyType.AUTOMATIC) + scroll.set_vexpand(True) + scroll.set_margin_start(8) + scroll.set_margin_end(8) + scroll.set_margin_bottom(8) + self._frames_flow = Gtk.FlowBox() + self._frames_flow.set_selection_mode(Gtk.SelectionMode.NONE) + self._frames_flow.set_max_children_per_line(8) + self._frames_flow.set_min_children_per_line(2) + self._frames_flow.set_homogeneous(True) + self._frames_flow.set_row_spacing(4) + self._frames_flow.set_column_spacing(4) + scroll.set_child(self._frames_flow) + frames_card.append(scroll) + + self.append(frames_card) + + # --- Export controls --- + export_box = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=8) + export_box.set_margin_top(4) + + self._export_btn = Gtk.Button(label="Export enhanced transcript") + self._export_btn.add_css_class("suggested-action") + self._export_btn.connect("clicked", self._on_export) + 
export_box.append(self._export_btn) + + self._open_btn = Gtk.Button(label="Open output") + self._open_btn.connect("clicked", self._on_open_output) + self._open_btn.set_sensitive(False) + export_box.append(self._open_btn) + + self._export_status = Gtk.Label(label="") + self._export_status.set_halign(Gtk.Align.START) + self._export_status.add_css_class("caption") + self._export_status.set_ellipsize(Pango.EllipsizeMode.END) + self._export_status.set_hexpand(True) + export_box.append(self._export_status) + + self.append(export_box) + + # -- Public API -- + + def bind_session(self, session_dir: Path | None): + """Attach the panel to a session directory (or None to clear).""" + self._session_dir = session_dir + self._diarized = None + self._speaker_entries.clear() + self._frame_checks.clear() + self._clear_widget(self._speakers_list) + self._clear_widget(self._frames_flow) + self._speakers_box.set_visible(False) + self._progress.set_fraction(0.0) + self._progress.set_text("") + self._status_label.set_text("") + self._export_status.set_text("") + self._open_btn.set_sensitive(False) + self._last_output: Path | None = None + + if session_dir is None: + self._session_label.set_text("No session bound") + self._frames_summary.set_text("") + self._set_enabled(False) + return + + self._session_label.set_text(f"Session: {session_dir.name}") + self._set_enabled(True) + self._load_frames() + # Reuse cached diarization if present. 
+ if summary_pipeline.has_diarization(session_dir): + try: + self._diarized = summary_pipeline.load_diarization(session_dir) + self._populate_speakers() + self._status_label.set_text("Loaded cached diarization.") + self._progress.set_fraction(1.0) + self._progress.set_text("cached") + except Exception as e: + log.warning("Failed to load cached diarization: %s", e) + + def set_streaming(self, streaming: bool): + """Disable the panel while a live session is running.""" + self._set_enabled(not streaming and self._session_dir is not None) + + # -- Diarization -- + + def _on_run_diarize(self, _btn): + if self._busy or not self._session_dir: + return + num_speakers = int(self._participants_spin.get_value()) + self._busy = True + self._run_btn.set_sensitive(False) + self._export_btn.set_sensitive(False) + self._progress.set_fraction(0.05) + self._progress.set_text("starting…") + self._status_label.set_text("") + + session_dir = self._session_dir + + def _worker(): + try: + def on_progress(line, frac): + GLib.idle_add(self._update_progress, line, frac) + diarized = summary_pipeline.run_diarization( + session_dir, num_speakers=num_speakers, on_progress=on_progress, + ) + GLib.idle_add(self._on_diarize_done, diarized, None) + except Exception as e: + log.exception("Diarization failed") + GLib.idle_add(self._on_diarize_done, None, str(e)) + + Thread(target=_worker, daemon=True, name="diarize").start() + + def _update_progress(self, line: str | None, frac: float | None): + if frac is not None: + self._progress.set_fraction(min(1.0, max(0.0, frac))) + else: + # Pulse mode if no fraction hint. 
+ self._progress.pulse() + if line: + self._progress.set_text(_short(line, 40)) + self._status_label.set_text(line) + return False + + def _on_diarize_done(self, diarized: dict | None, err: str | None): + self._busy = False + self._run_btn.set_sensitive(True) + self._export_btn.set_sensitive(True) + if err: + self._progress.set_fraction(0.0) + self._progress.set_text("failed") + self._status_label.set_text(f"Error: {err}") + return False + self._progress.set_fraction(1.0) + self._progress.set_text("done") + self._diarized = diarized + speakers = summary_merger.collect_speakers(diarized) if diarized else [] + self._status_label.set_text( + f"Diarization complete. Detected speakers: {', '.join(speakers) or '(none)'}" + ) + self._populate_speakers() + return False + + def _populate_speakers(self): + self._clear_widget(self._speakers_list) + self._speaker_entries.clear() + if not self._diarized: + self._speakers_box.set_visible(False) + return + speakers = summary_merger.collect_speakers(self._diarized) + if not speakers: + self._speakers_box.set_visible(False) + return + for sp in speakers: + row = Gtk.Box(orientation=Gtk.Orientation.HORIZONTAL, spacing=8) + label = Gtk.Label(label=sp) + label.set_width_chars(14) + label.set_xalign(0) + row.append(label) + entry = Gtk.Entry() + entry.set_placeholder_text(sp) + entry.set_hexpand(True) + row.append(entry) + self._speakers_list.append(row) + self._speaker_entries[sp] = entry + self._speakers_box.set_visible(True) + + # -- Frame picker -- + + def _load_frames(self): + self._clear_widget(self._frames_flow) + self._frame_checks.clear() + if not self._session_dir: + return + frames = load_frame_index(self._session_dir / "frames") + for f in frames: + self._add_frame_thumb(f["id"], f["path"], f["timestamp"]) + self._update_frames_summary() + + def _add_frame_thumb(self, frame_id: str, path: Path, timestamp: float): + box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=2) + try: + pixbuf = 
GdkPixbuf.Pixbuf.new_from_file_at_scale( + str(path), 192, 108, True, + ) + texture = Gdk.Texture.new_for_pixbuf(pixbuf) + pic = Gtk.Picture.new_for_paintable(texture) + pic.set_content_fit(Gtk.ContentFit.CONTAIN) + pic.set_size_request(192, 108) + box.append(pic) + except Exception as e: + log.debug("thumbnail load failed for %s: %s", path, e) + placeholder = Gtk.Label(label=frame_id) + placeholder.set_size_request(192, 108) + box.append(placeholder) + + m, s = divmod(int(timestamp), 60) + check = Gtk.CheckButton(label=f"{frame_id} [{m:02d}:{s:02d}]") + check.set_active(True) + check.connect("toggled", lambda _b: self._update_frames_summary()) + box.append(check) + + self._frames_flow.append(box) + self._frame_checks[frame_id] = check + + def _toggle_all_frames(self, value: bool): + for check in self._frame_checks.values(): + check.set_active(value) + + def _update_frames_summary(self): + total = len(self._frame_checks) + selected = sum(1 for c in self._frame_checks.values() if c.get_active()) + self._frames_summary.set_text(f"{selected}/{total} selected") + + # -- Export -- + + def _on_export(self, _btn): + if not self._session_dir: + return + if not summary_pipeline.has_diarization(self._session_dir): + self._export_status.set_text("Run diarization first.") + return + selected_ids = {fid for fid, c in self._frame_checks.items() if c.get_active()} + name_map = { + sp: entry.get_text().strip() + for sp, entry in self._speaker_entries.items() + if entry.get_text().strip() + } + try: + out = summary_pipeline.export( + self._session_dir, + selected_frame_ids=selected_ids, + name_map=name_map, + ) + except Exception as e: + log.exception("Export failed") + self._export_status.set_text(f"Error: {e}") + return + self._last_output = out + self._open_btn.set_sensitive(True) + self._export_status.set_text(f"Wrote {out}") + + def _on_open_output(self, _btn): + if not getattr(self, "_last_output", None): + return + uri = 
Gio.File.new_for_path(str(self._last_output)).get_uri() + Gio.AppInfo.launch_default_for_uri(uri, None) + + # -- helpers -- + + def _set_enabled(self, enabled: bool): + for w in (self._participants_spin, self._run_btn, self._export_btn): + w.set_sensitive(enabled and not self._busy) + + def _clear_widget(self, container: Gtk.Box | Gtk.FlowBox): + while child := container.get_first_child(): + container.remove(child) + + +def _short(s: str, n: int) -> str: + s = s.strip() + return s if len(s) <= n else s[: n - 1] + "…" diff --git a/cht/window.py b/cht/window.py index 2f2518b..5897541 100644 --- a/cht/window.py +++ b/cht/window.py @@ -18,6 +18,7 @@ from cht.ui.monitor import MonitorWidget from cht.ui.waveform import WaveformWidget from cht.ui.frames_panel import FramesPanel from cht.ui.transcript_panel import TranscriptPanel +from cht.ui.summary_panel import SummaryPanel from cht.ui.keyboard import KeyboardManager, KEY_LEFT, KEY_RIGHT, KEY_UP, KEY_DOWN, KEY_RETURN, KEY_KP_ENTER, KEY_ESCAPE, KEY_DELETE from cht.ui.agent_output import AgentOutputPanel from cht.ui.agent_input import AgentInputPanel @@ -261,6 +262,9 @@ class ChtWindow(Adw.ApplicationWindow): self._update_scrub_bar_manifest() self._populate_model_dropdown() + self._summary_panel.bind_session(mgr.session_dir) + self._summary_panel.set_streaming(False) + # Show "Continue" since there's an active session to resume self._connect_btn.set_label("Continue") @@ -328,6 +332,11 @@ class ChtWindow(Adw.ApplicationWindow): self._agent_output.load_thread(self._agent.thread) self.set_title(f"{APP_NAME} — {mgr.session_id}") + + self._summary_panel.bind_session(None) + self._summary_panel.set_streaming(True) + self._right_stack.set_visible_child_name("live") + log.info("Waiting for sender...") def _on_live_toggle(self): @@ -513,6 +522,7 @@ class ChtWindow(Adw.ApplicationWindow): # Stop live player before transitioning to review mode self._monitor.reset() self._load_session(last_session_id) + 
self._right_stack.set_visible_child_name("summary") return # Full reset — only when not reloading @@ -527,6 +537,8 @@ class ChtWindow(Adw.ApplicationWindow): self._known_frames = set() self._frames_panel.clear() self._transcript_panel.clear() + self._summary_panel.bind_session(None) + self._right_stack.set_visible_child_name("live") self.set_title(APP_NAME) def _on_close(self, *args): @@ -568,17 +580,21 @@ class ChtWindow(Adw.ApplicationWindow): self._timeline_controls.scrub_bar.connect("scrub-position", self._on_scrub_position) right_box.append(self._timeline_controls) - # Frames + # ViewStack: Live (frames/transcript/agent) ↔ Summary (post-session export) + self._right_stack = Adw.ViewStack() + self._right_stack.set_vexpand(True) + + live_box = Gtk.Box(orientation=Gtk.Orientation.VERTICAL, spacing=2) + live_box.set_vexpand(True) + frames_frame = Gtk.Frame() frames_frame.set_child(self._frames_panel) - right_box.append(frames_frame) + live_box.append(frames_frame) - # Transcript transcript_frame = Gtk.Frame() transcript_frame.set_child(self._transcript_panel) - right_box.append(transcript_frame) + live_box.append(transcript_frame) - # Agent input self._agent_input = AgentInputPanel() self._agent_input.connect("send-requested", lambda p, text: self._send_message(text or None)) self._agent_input.connect("action-requested", lambda p, verb: self._send_action(verb)) @@ -586,7 +602,20 @@ class ChtWindow(Adw.ApplicationWindow): self._agent_input.connect("lang-changed", self._on_lang_changed) self._agent_input.connect("history-toggled", lambda p, v: setattr(self._agent, "include_history", v)) self._agent_input.connect("permission-changed", self._on_permission_changed) - right_box.append(self._agent_input) + live_box.append(self._agent_input) + + live_page = self._right_stack.add_titled(live_box, "live", "Live") + live_page.set_icon_name("video-display-symbolic") + + self._summary_panel = SummaryPanel() + summary_page = self._right_stack.add_titled(self._summary_panel, 
"summary", "Summary") + summary_page.set_icon_name("text-x-generic-symbolic") + + switcher = Adw.ViewSwitcher() + switcher.set_stack(self._right_stack) + switcher.set_policy(Adw.ViewSwitcherPolicy.WIDE) + right_box.append(switcher) + right_box.append(self._right_stack) return right_box