add root readme

2026-05-07 13:04:40 -03:00
parent 946234eb9e
commit feb5ecd463
10 changed files with 919 additions and 6 deletions
--- a/cht/summary/diarize.py
+++ b/cht/summary/diarize.py
@@ -0,0 +1,102 @@
+"""WhisperX subprocess wrapper for offline diarized transcription.
+
+Runs whisperx CLI on a full-session WAV file, with min/max speakers pinned
+to the user-provided count. Streams the combined stdout/stderr output to a
+progress callback. Loads the resulting JSON and returns it.
+"""
+
+import json
+import logging
+import os
+import subprocess
+import threading
+from pathlib import Path
+
+from cht import config
+
+log = logging.getLogger(__name__)
+
+
+def _cudnn_lib_for(whisperx_bin: str) -> str | None:
+    """Find nvidia/cudnn/lib inside the venv that owns *whisperx_bin*.
+
+    whisperx ships with `nvidia-cudnn-cu12`; the runtime needs the .so files
+    on LD_LIBRARY_PATH or it dies with a missing-symbol error.
+
+    *whisperx_bin* may be a path or a bare command name. A bare name is looked
+    up on PATH first, so the venv is derived from the executable that would
+    actually be launched rather than from the current working directory.
+
+    Returns the cudnn lib directory as a string, or None when the venv root
+    does not exist or contains no matching directory.
+    """
+    import shutil
+
+    # shutil.which returns None for a non-executable or missing file; fall back
+    # to the raw value so an explicit path is still inspected.
+    located = shutil.which(whisperx_bin) or whisperx_bin
+    bin_path = Path(located).resolve()
+    venv_root = bin_path.parent.parent  # .../venv/bin/whisperx -> .../venv
+    if not venv_root.exists():
+        return None
+    # Sort so the pick is deterministic if several lib/python* dirs exist.
+    matches = sorted(
+        p
+        for p in venv_root.glob("lib/python*/site-packages/nvidia/cudnn/lib")
+        if p.is_dir()
+    )
+    return str(matches[0]) if matches else None
+
+
+def run_whisperx(
+    wav_path: Path,
+    output_dir: Path,
+    *,
+    num_speakers: int,
+    on_progress=None,
+) -> dict:
+    """Run whisperx diarization on `wav_path`. Returns parsed JSON.
+
+    Writes whisperx outputs into `output_dir`. Caller is responsible for
+    persisting the relevant artifact elsewhere if desired.
+
+    Args:
+        wav_path: Audio file handed to the whisperx CLI.
+        output_dir: Directory for whisperx outputs; created if missing.
+        num_speakers: Passed as both --min_speakers and --max_speakers.
+        on_progress: Optional callable invoked as ``on_progress(message, None)``
+            once before launch and then for every non-empty output line.
+            Output lines are delivered from a background thread.
+
+    Returns:
+        The parsed contents of ``<output_dir>/<wav_path stem>.json``.
+
+    Raises:
+        RuntimeError: If HF_TOKEN is not configured, whisperx exits with a
+            non-zero status, or the expected JSON file is missing.
+    """
+    from collections import deque
+
+    if not config.HF_TOKEN:
+        raise RuntimeError(
+            "HF_TOKEN environment variable is required for whisperx diarization."
+        )
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    cmd = [
+        config.WHISPERX_BIN,
+        str(wav_path),
+        "--model", config.WHISPERX_MODEL,
+        "--device", config.WHISPERX_DEVICE,
+        "--compute_type", config.WHISPERX_COMPUTE_TYPE,
+        "--diarize",
+        "--min_speakers", str(num_speakers),
+        "--max_speakers", str(num_speakers),
+        "--hf_token", config.HF_TOKEN,
+        "--output_format", "json",
+        "--output_dir", str(output_dir),
+    ]
+
+    env = os.environ.copy()
+    cudnn_path = config.WHISPERX_LD_LIBRARY_PATH or _cudnn_lib_for(config.WHISPERX_BIN)
+    if cudnn_path:
+        # Join only non-empty parts: a trailing separator would leave an empty
+        # entry, which the dynamic loader treats as the current directory.
+        parts = [cudnn_path, env.get("LD_LIBRARY_PATH", "")]
+        env["LD_LIBRARY_PATH"] = os.pathsep.join(p for p in parts if p)
+
+    # The token value is filtered out so it never reaches the log.
+    log.info("whisperx: %s", " ".join(c for c in cmd if c != config.HF_TOKEN))
+    if on_progress:
+        on_progress("whisperx: starting", None)
+
+    proc = subprocess.Popen(
+        cmd, env=env,
+        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
+        # errors="replace": undecodable bytes must not kill the drain thread.
+        text=True, bufsize=1, errors="replace",
+    )
+
+    # Keep the last few output lines so a failure can say what went wrong.
+    tail = deque(maxlen=20)
+    tail_lock = threading.Lock()
+
+    # Drain stderr/stdout combined; report progress lines.
+    def _drain():
+        for line in proc.stdout:
+            line = line.rstrip()
+            if not line:
+                continue
+            log.debug("[whisperx] %s", line)
+            with tail_lock:
+                tail.append(line)
+            if on_progress:
+                try:
+                    on_progress(line, None)
+                except Exception:
+                    # Keep draining: if this thread died the pipe could fill
+                    # up and block whisperx (and proc.wait()) indefinitely.
+                    log.exception("whisperx: progress callback failed")
+
+    t = threading.Thread(target=_drain, daemon=True, name="whisperx_drain")
+    t.start()
+    try:
+        proc.wait()
+    except BaseException:
+        # Interrupted (e.g. KeyboardInterrupt): do not leave whisperx running.
+        proc.kill()
+        proc.wait()
+        raise
+    finally:
+        # Bounded join: a lingering grandchild may still hold the pipe open.
+        t.join(timeout=2)
+        if not t.is_alive():
+            proc.stdout.close()
+
+    if proc.returncode != 0:
+        with tail_lock:
+            recent = "\n".join(tail)
+        # Defensive: never echo the token back in an exception message.
+        recent = recent.replace(config.HF_TOKEN, "***")
+        message = f"whisperx exited with status {proc.returncode}"
+        if recent:
+            message += f"; last output:\n{recent}"
+        raise RuntimeError(message)
+
+    out_json = output_dir / f"{wav_path.stem}.json"
+    if not out_json.exists():
+        raise RuntimeError(f"whisperx finished but {out_json.name} not found")
+    # Read explicitly as UTF-8 instead of relying on the locale default.
+    return json.loads(out_json.read_text(encoding="utf-8"))