add root readme

2026-05-07 13:04:40 -03:00
parent 946234eb9e
commit feb5ecd463
10 changed files with 919 additions and 6 deletions

cht/summary/__init__.py Normal file

@@ -0,0 +1,5 @@
"""Post-session summarization pipeline.
Offline diarization (whisperx) + transcript/frame merger producing a clean
LLM-ready `<session>_enhanced.txt`.
"""

cht/summary/audio.py Normal file

@@ -0,0 +1,98 @@
"""Assemble a single WAV file covering the entire session audio.
Prefers the recording source (fMP4 or raw AAC) over the live-extracted
WAV chunks: a single decode pass gives whisperx contiguous audio with no
chunk-boundary artifacts. Chunks are a fallback when the recording source
is missing.
"""
import logging
import tempfile
from pathlib import Path
import ffmpeg
from cht.stream import ffmpeg as ff
log = logging.getLogger(__name__)
def assemble_session_wav(session_dir: Path, *, force: bool = False) -> Path:
"""Build `summary/full.wav` covering the whole session audio.
Returns the cached path if already present and `force` is False.
Raises FileNotFoundError if no usable audio source exists.
"""
summary_dir = session_dir / "summary"
summary_dir.mkdir(parents=True, exist_ok=True)
out = summary_dir / "full.wav"
if out.exists() and not force:
log.info("assemble_session_wav: cached %s", out)
return out
stream_dir = session_dir / "stream"
# 1. Rust transport: standalone audio.aac.
aac = stream_dir / "audio.aac"
if aac.exists() and aac.stat().st_size > 100:
ff.extract_audio_chunk(aac, out)
log.info("assemble_session_wav: from audio.aac → %s", out)
return out
# 2. fMP4 segments (Python transport). Single segment is the common case.
segments = sorted(stream_dir.glob("recording_*.mp4")) if stream_dir.exists() else []
if len(segments) == 1:
ff.extract_audio_chunk(segments[0], out)
log.info("assemble_session_wav: from %s%s", segments[0].name, out)
return out
if len(segments) > 1:
_concat_segments_audio(segments, out)
log.info("assemble_session_wav: concatenated %d segments → %s", len(segments), out)
return out
# 3. Fallback: concat the live audio chunks. Last resort — chunk seams may
# introduce minor artifacts; whisperx still works but precision can suffer.
audio_dir = session_dir / "audio"
chunks = sorted(audio_dir.glob("chunk_*.wav")) if audio_dir.exists() else []
if chunks:
log.warning("assemble_session_wav: no recording source, falling back to %d chunks", len(chunks))
_concat_chunks(chunks, out)
return out
raise FileNotFoundError(f"No audio source found in {session_dir}")
def _concat_segments_audio(segments: list[Path], out: Path) -> None:
"""Decode + concatenate audio tracks from multiple fMP4 segments into 16kHz mono WAV."""
inputs = [ffmpeg.input(str(p)) for p in segments]
audio_streams = [s.audio for s in inputs]
node = (
ffmpeg.concat(*audio_streams, v=0, a=1)
.output(str(out), acodec="pcm_s16le", ac=1, ar=16000)
.overwrite_output()
.global_args("-hide_banner", "-loglevel", "warning")
)
log.info("concat_segments_audio: %s", " ".join(node.compile()))
node.run(capture_stdout=True, capture_stderr=True)
def _concat_chunks(chunks: list[Path], out: Path) -> None:
"""Concat already-PCM 16kHz mono WAV files via the concat demuxer (no re-decode)."""
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
listfile = Path(f.name)
for c in chunks:
f.write(f"file '{c.resolve()}'\n")
try:
node = (
ffmpeg.input(str(listfile), format="concat", safe=0)
.output(str(out), c="copy")
.overwrite_output()
.global_args("-hide_banner", "-loglevel", "warning")
)
log.info("concat_chunks: %s", " ".join(node.compile()))
node.run(capture_stdout=True, capture_stderr=True)
finally:
try:
listfile.unlink()
except OSError:
pass
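
A minimal usage sketch, assuming a hypothetical session directory; the comments restate the source priority implemented above and the list-file format the chunk fallback writes.

from pathlib import Path

from cht.summary.audio import assemble_session_wav

# Source priority: stream/audio.aac, then stream/recording_*.mp4 (concatenated
# if there are several), then audio/chunk_*.wav; FileNotFoundError if none exist.
wav = assemble_session_wav(Path("sessions/demo"))
print(wav)  # sessions/demo/summary/full.wav

# The chunk fallback hands ffmpeg's concat demuxer a list file of the form
#   file '/abs/path/audio/chunk_0000.wav'
#   file '/abs/path/audio/chunk_0001.wav'
# and stream-copies the PCM (c="copy"), so no re-decode happens.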

cht/summary/diarize.py Normal file

@@ -0,0 +1,102 @@
"""WhisperX subprocess wrapper for offline diarized transcription.
Runs whisperx CLI on a full-session WAV file, with min/max speakers pinned
to the user-provided count. Streams stderr to a progress callback. Loads the
resulting JSON and returns it.
"""
import json
import logging
import os
import subprocess
import threading
from pathlib import Path
from cht import config
log = logging.getLogger(__name__)
def _cudnn_lib_for(whisperx_bin: str) -> str | None:
"""Find nvidia/cudnn/lib inside the venv that owns *whisperx_bin*.
whisperx ships with `nvidia-cudnn-cu12`; the runtime needs the .so files
on LD_LIBRARY_PATH or it dies with a missing-symbol error.
"""
bin_path = Path(whisperx_bin).resolve()
venv_root = bin_path.parent.parent # .../venv/def
if not venv_root.exists():
return None
matches = list(venv_root.glob("lib/python*/site-packages/nvidia/cudnn/lib"))
return str(matches[0]) if matches else None
def run_whisperx(
wav_path: Path,
output_dir: Path,
*,
num_speakers: int,
on_progress=None,
) -> dict:
"""Run whisperx diarization on `wav_path`. Returns parsed JSON.
Writes whisperx outputs into `output_dir`. Caller is responsible for
persisting the relevant artifact elsewhere if desired.
"""
if not config.HF_TOKEN:
raise RuntimeError(
"HF_TOKEN environment variable is required for whisperx diarization."
)
output_dir.mkdir(parents=True, exist_ok=True)
cmd = [
config.WHISPERX_BIN,
str(wav_path),
"--model", config.WHISPERX_MODEL,
"--device", config.WHISPERX_DEVICE,
"--compute_type", config.WHISPERX_COMPUTE_TYPE,
"--diarize",
"--min_speakers", str(num_speakers),
"--max_speakers", str(num_speakers),
"--hf_token", config.HF_TOKEN,
"--output_format", "json",
"--output_dir", str(output_dir),
]
env = os.environ.copy()
cudnn_path = config.WHISPERX_LD_LIBRARY_PATH or _cudnn_lib_for(config.WHISPERX_BIN)
if cudnn_path:
env["LD_LIBRARY_PATH"] = cudnn_path + os.pathsep + env.get("LD_LIBRARY_PATH", "")
log.info("whisperx: %s", " ".join(c for c in cmd if c != config.HF_TOKEN))
if on_progress:
on_progress("whisperx: starting", None)
proc = subprocess.Popen(
cmd, env=env,
stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
text=True, bufsize=1,
)
# Drain stderr/stdout combined; report progress lines.
def _drain():
for line in proc.stdout:
line = line.rstrip()
if not line:
continue
log.debug("[whisperx] %s", line)
if on_progress:
on_progress(line, None)
t = threading.Thread(target=_drain, daemon=True, name="whisperx_drain")
t.start()
proc.wait()
t.join(timeout=2)
if proc.returncode != 0:
raise RuntimeError(f"whisperx exited with status {proc.returncode}")
out_json = output_dir / f"{wav_path.stem}.json"
if not out_json.exists():
raise RuntimeError(f"whisperx finished but {out_json.name} not found")
return json.loads(out_json.read_text())
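
Downstream code only touches a few fields of the returned JSON. A minimal illustration of that shape, with invented values (real whisperx output carries more fields, such as word-level timings):

diarized = {
    "segments": [
        {"start": 0.5, "end": 3.2, "text": " Hello everyone.", "speaker": "SPEAKER_00"},
        {"start": 3.4, "end": 5.0, "text": " Hi!", "speaker": "SPEAKER_01"},
    ]
}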

cht/summary/merger.py Normal file

@@ -0,0 +1,88 @@
"""Interleave diarized audio segments with selected screen frames by timestamp.
Direct port of mts/meetus/transcript_merger.py:merge_transcripts (line 162).
"""
import logging
log = logging.getLogger(__name__)
def merge(audio_segments: list[dict], frame_segments: list[dict],
name_map: dict[str, str] | None = None) -> list[dict]:
"""Combine and group by speaker; screen frames break speaker groups.
`audio_segments`: each {timestamp, text, speaker?}.
`frame_segments`: each {timestamp, frame_path}.
`name_map`: optional SPEAKER_xx → real name remap, applied to outputs.
Returns merged list sorted by timestamp.
"""
name_map = name_map or {}
audio = [{**s, "type": "audio"} for s in audio_segments]
screen = [{**s, "type": "screen"} for s in frame_segments]
all_segs = sorted(audio + screen, key=lambda x: x["timestamp"])
grouped: list[dict] = []
current = None
def _label(speaker):
if not speaker:
return None
return name_map.get(speaker, speaker)
for seg in all_segs:
if seg["type"] == "screen":
if current is not None:
grouped.append(current)
current = None
grouped.append(seg)
continue
speaker = _label(seg.get("speaker"))
if current is None:
current = {
"timestamp": seg["timestamp"],
"text": seg["text"],
"speaker": speaker,
"type": "audio",
}
elif speaker == current.get("speaker"):
current["text"] += " " + seg["text"]
else:
grouped.append(current)
current = {
"timestamp": seg["timestamp"],
"text": seg["text"],
"speaker": speaker,
"type": "audio",
}
if current is not None:
grouped.append(current)
return grouped
def whisperx_to_audio_segments(diarized: dict) -> list[dict]:
"""Convert whisperx JSON segments to the merger's audio format."""
out = []
for seg in diarized.get("segments", []):
text = (seg.get("text") or "").strip()
if not text:
continue
out.append({
"timestamp": float(seg.get("start", 0.0)),
"text": text,
"speaker": seg.get("speaker"),
})
return out
def collect_speakers(diarized: dict) -> list[str]:
"""Distinct SPEAKER_xx labels found in the diarization, sorted."""
seen = set()
for seg in diarized.get("segments", []):
sp = seg.get("speaker")
if sp:
seen.add(sp)
return sorted(seen)
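
A worked example of the grouping rules, with invented inputs: consecutive segments from the same speaker are fused, and a screen frame flushes the open speaker group.

from cht.summary.merger import merge

audio = [
    {"timestamp": 0.0, "text": "Hi.", "speaker": "SPEAKER_00"},
    {"timestamp": 2.0, "text": "All set?", "speaker": "SPEAKER_00"},
    {"timestamp": 5.0, "text": "Yes.", "speaker": "SPEAKER_01"},
]
frames = [{"timestamp": 3.0, "frame_path": "frames/f_0001.png"}]

merge(audio, frames, name_map={"SPEAKER_00": "Alice"})
# → [
#   {"timestamp": 0.0, "text": "Hi. All set?", "speaker": "Alice", "type": "audio"},
#   {"timestamp": 3.0, "frame_path": "frames/f_0001.png", "type": "screen"},
#   {"timestamp": 5.0, "text": "Yes.", "speaker": "SPEAKER_01", "type": "audio"},
# ]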

cht/summary/output.py Normal file

@@ -0,0 +1,75 @@
"""Format merged segments as a sequential LLM-ready transcript.
Direct port of mts/meetus/transcript_merger.py:_format_detailed (line 249).
"""
import json
import logging
from pathlib import Path
log = logging.getLogger(__name__)
def format_detailed(merged_segments: list[dict], *, frames_relative_to: Path | None = None) -> str:
"""Render the interleaved transcript.
If `frames_relative_to` is given, frame paths are rewritten relative to it.
"""
lines = []
lines.append("=" * 80)
lines.append("ENHANCED MEETING TRANSCRIPT")
lines.append("Audio transcript + Screen frames")
lines.append("=" * 80)
lines.append("")
for seg in merged_segments:
ts = _format_timestamp(seg["timestamp"])
if seg["type"] == "audio":
speaker = seg.get("speaker") or "SPEAKER"
lines.append(f"[{ts}] {speaker}:")
lines.append(f" {seg['text']}")
lines.append("")
else:
lines.append(f"[{ts}] SCREEN CONTENT:")
fp = seg.get("frame_path")
if fp:
if frames_relative_to is not None:
try:
fp = str(Path(fp).resolve().relative_to(frames_relative_to.resolve()))
except ValueError:
fp = str(fp)
else:
fp = str(fp)
lines.append(f" Frame: {fp}")
lines.append("")
return "\n".join(lines)
def _format_timestamp(seconds: float) -> str:
seconds = int(seconds)
h, rem = divmod(seconds, 3600)
m, s = divmod(rem, 60)
if h:
return f"{h:02d}:{m:02d}:{s:02d}"
return f"{m:02d}:{s:02d}"
def write_outputs(session_dir: Path, merged: list[dict], *, name: str | None = None) -> Path:
"""Write `<name>_enhanced.txt` and `merged.json` under `session_dir/summary`.
Returns the path of the enhanced transcript.
"""
summary_dir = session_dir / "summary"
summary_dir.mkdir(parents=True, exist_ok=True)
name = name or session_dir.name
text = format_detailed(merged, frames_relative_to=session_dir)
text_path = summary_dir / f"{name}_enhanced.txt"
text_path.write_text(text)
merged_path = summary_dir / "merged.json"
merged_path.write_text(json.dumps(merged, indent=2, default=str))
log.info("Wrote %s (%d entries)", text_path, len(merged))
return text_path
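
Feeding the merged example from merger.py above through format_detailed produces output of roughly this shape (sample output, not code):

================================================================================
ENHANCED MEETING TRANSCRIPT
Audio transcript + Screen frames
================================================================================

[00:00] Alice:
 Hi. All set?

[00:03] SCREEN CONTENT:
 Frame: frames/f_0001.png

[00:05] SPEAKER_01:
 Yes.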

cht/summary/pipeline.py Normal file

@@ -0,0 +1,72 @@
"""End-to-end orchestrator for the summarization export.
Two operations:
diarize(...) — heavy: assembles audio, runs whisperx, caches diarized.json.
export(...) — cheap: merges cached diarization with selected frames and
writes <session>_enhanced.txt. Re-run any time the user
tweaks frame selection or speaker names.
"""
import json
import logging
from pathlib import Path
from cht.session import load_frame_index
from cht.summary import audio, diarize, merger, output
log = logging.getLogger(__name__)
def diarized_path(session_dir: Path) -> Path:
return session_dir / "summary" / "diarized.json"
def has_diarization(session_dir: Path) -> bool:
return diarized_path(session_dir).exists()
def load_diarization(session_dir: Path) -> dict:
return json.loads(diarized_path(session_dir).read_text())
def run_diarization(session_dir: Path, *, num_speakers: int, on_progress=None) -> dict:
"""Assemble audio, run whisperx, cache and return the JSON."""
if on_progress:
on_progress("assembling audio", 0.05)
wav = audio.assemble_session_wav(session_dir)
if on_progress:
on_progress("running whisperx", 0.15)
summary_dir = session_dir / "summary"
diarized = diarize.run_whisperx(
wav, summary_dir,
num_speakers=num_speakers,
on_progress=lambda line, _frac: on_progress(line, None) if on_progress else None,
)
diarized_path(session_dir).write_text(json.dumps(diarized, indent=2))
if on_progress:
on_progress("diarization done", 1.0)
return diarized
def export(session_dir: Path,
*,
selected_frame_ids: set[str] | None = None,
name_map: dict[str, str] | None = None) -> Path:
"""Merge cached diarization + selected frames and write enhanced.txt."""
if not has_diarization(session_dir):
raise RuntimeError("No diarization available — run diarization first.")
diarized = load_diarization(session_dir)
audio_segs = merger.whisperx_to_audio_segments(diarized)
frames = load_frame_index(session_dir / "frames")
if selected_frame_ids is not None:
frames = [f for f in frames if f["id"] in selected_frame_ids]
frame_segs = [
{"timestamp": f["timestamp"], "frame_path": str(f["path"])}
for f in frames
]
merged = merger.merge(audio_segs, frame_segs, name_map=name_map)
return output.write_outputs(session_dir, merged)
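
export() leans on load_frame_index from cht.session; judging from the usage above, each frame entry must expose at least id, timestamp, and path (shape inferred, values invented):

# Minimal per-frame shape export() relies on; real entries may carry more keys.
frame = {"id": "f_0001", "timestamp": 3.0, "path": "sessions/demo/frames/f_0001.png"}

# Cheap re-export with a narrowed frame selection and real speaker names;
# no whisperx run, since the diarization is read from summary/diarized.json.
export(
    session_dir,
    selected_frame_ids={"f_0001"},
    name_map={"SPEAKER_00": "Alice", "SPEAKER_01": "Bob"},
)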