add root readme

cht/summary/__init__.py (Normal file, 5 lines)
@@ -0,0 +1,5 @@
"""Post-session summarization pipeline.

Offline diarization (whisperx) + transcript/frame merger producing a clean
LLM-ready `<session>_enhanced.txt`.
"""

cht/summary/audio.py (Normal file, 98 lines)
@@ -0,0 +1,98 @@
"""Assemble a single WAV file covering the entire session audio.

Prefers the recording source (fMP4 or raw AAC) over the live-extracted
WAV chunks: a single decode pass gives whisperx contiguous audio with no
chunk-boundary artifacts. Chunks are a fallback when the recording source
is missing.
"""

import logging
import tempfile
from pathlib import Path

import ffmpeg

from cht.stream import ffmpeg as ff

log = logging.getLogger(__name__)


def assemble_session_wav(session_dir: Path, *, force: bool = False) -> Path:
    """Build `summary/full.wav` covering the whole session audio.

    Returns the cached path if already present and `force` is False.
    Raises FileNotFoundError if no usable audio source exists.
    """
    summary_dir = session_dir / "summary"
    summary_dir.mkdir(parents=True, exist_ok=True)
    out = summary_dir / "full.wav"
    if out.exists() and not force:
        log.info("assemble_session_wav: cached %s", out)
        return out

    stream_dir = session_dir / "stream"

    # 1. Rust transport: standalone audio.aac.
    aac = stream_dir / "audio.aac"
    if aac.exists() and aac.stat().st_size > 100:
        ff.extract_audio_chunk(aac, out)
        log.info("assemble_session_wav: from audio.aac → %s", out)
        return out

    # 2. fMP4 segments (Python transport). Single segment is the common case.
    segments = sorted(stream_dir.glob("recording_*.mp4")) if stream_dir.exists() else []
    if len(segments) == 1:
        ff.extract_audio_chunk(segments[0], out)
        log.info("assemble_session_wav: from %s → %s", segments[0].name, out)
        return out
    if len(segments) > 1:
        _concat_segments_audio(segments, out)
        log.info("assemble_session_wav: concatenated %d segments → %s", len(segments), out)
        return out

    # 3. Fallback: concat the live audio chunks. Last resort — chunk seams may
    #    introduce minor artifacts; whisperx still works but precision can suffer.
    audio_dir = session_dir / "audio"
    chunks = sorted(audio_dir.glob("chunk_*.wav")) if audio_dir.exists() else []
    if chunks:
        log.warning("assemble_session_wav: no recording source, falling back to %d chunks", len(chunks))
        _concat_chunks(chunks, out)
        return out

    raise FileNotFoundError(f"No audio source found in {session_dir}")


def _concat_segments_audio(segments: list[Path], out: Path) -> None:
    """Decode + concatenate audio tracks from multiple fMP4 segments into 16kHz mono WAV."""
    inputs = [ffmpeg.input(str(p)) for p in segments]
    audio_streams = [s.audio for s in inputs]
    node = (
        ffmpeg.concat(*audio_streams, v=0, a=1)
        .output(str(out), acodec="pcm_s16le", ac=1, ar=16000)
        .overwrite_output()
        .global_args("-hide_banner", "-loglevel", "warning")
    )
    log.info("concat_segments_audio: %s", " ".join(node.compile()))
    node.run(capture_stdout=True, capture_stderr=True)


def _concat_chunks(chunks: list[Path], out: Path) -> None:
    """Concat already-PCM 16kHz mono WAV files via the concat demuxer (no re-decode)."""
    with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
        listfile = Path(f.name)
        for c in chunks:
            f.write(f"file '{c.resolve()}'\n")
    try:
        node = (
            ffmpeg.input(str(listfile), format="concat", safe=0)
            .output(str(out), c="copy")
            .overwrite_output()
            .global_args("-hide_banner", "-loglevel", "warning")
        )
        log.info("concat_chunks: %s", " ".join(node.compile()))
        node.run(capture_stdout=True, capture_stderr=True)
    finally:
        try:
            listfile.unlink()
        except OSError:
            pass
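
Usage sketch for the assembler above (not part of this commit); the session path is illustrative and `cht` is assumed to be importable:

    import logging
    from pathlib import Path

    from cht.summary.audio import assemble_session_wav

    logging.basicConfig(level=logging.INFO)

    session = Path("sessions/demo")  # hypothetical session dir containing stream/ and audio/
    try:
        wav = assemble_session_wav(session)   # writes/reuses sessions/demo/summary/full.wav
    except FileNotFoundError:
        print("no recording source or audio chunks found")
    else:
        print("session WAV ready:", wav)
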

cht/summary/diarize.py (Normal file, 102 lines)
@@ -0,0 +1,102 @@
"""WhisperX subprocess wrapper for offline diarized transcription.

Runs whisperx CLI on a full-session WAV file, with min/max speakers pinned
to the user-provided count. Streams stderr to a progress callback. Loads the
resulting JSON and returns it.
"""

import json
import logging
import os
import subprocess
import threading
from pathlib import Path

from cht import config

log = logging.getLogger(__name__)


def _cudnn_lib_for(whisperx_bin: str) -> str | None:
    """Find nvidia/cudnn/lib inside the venv that owns *whisperx_bin*.

    whisperx ships with `nvidia-cudnn-cu12`; the runtime needs the .so files
    on LD_LIBRARY_PATH or it dies with a missing-symbol error.
    """
    bin_path = Path(whisperx_bin).resolve()
    venv_root = bin_path.parent.parent  # .../venv
    if not venv_root.exists():
        return None
    matches = list(venv_root.glob("lib/python*/site-packages/nvidia/cudnn/lib"))
    return str(matches[0]) if matches else None


def run_whisperx(
    wav_path: Path,
    output_dir: Path,
    *,
    num_speakers: int,
    on_progress=None,
) -> dict:
    """Run whisperx diarization on `wav_path`. Returns parsed JSON.

    Writes whisperx outputs into `output_dir`. Caller is responsible for
    persisting the relevant artifact elsewhere if desired.
    """
    if not config.HF_TOKEN:
        raise RuntimeError(
            "HF_TOKEN environment variable is required for whisperx diarization."
        )
    output_dir.mkdir(parents=True, exist_ok=True)

    cmd = [
        config.WHISPERX_BIN,
        str(wav_path),
        "--model", config.WHISPERX_MODEL,
        "--device", config.WHISPERX_DEVICE,
        "--compute_type", config.WHISPERX_COMPUTE_TYPE,
        "--diarize",
        "--min_speakers", str(num_speakers),
        "--max_speakers", str(num_speakers),
        "--hf_token", config.HF_TOKEN,
        "--output_format", "json",
        "--output_dir", str(output_dir),
    ]

    env = os.environ.copy()
    cudnn_path = config.WHISPERX_LD_LIBRARY_PATH or _cudnn_lib_for(config.WHISPERX_BIN)
    if cudnn_path:
        env["LD_LIBRARY_PATH"] = cudnn_path + os.pathsep + env.get("LD_LIBRARY_PATH", "")

    log.info("whisperx: %s", " ".join(c for c in cmd if c != config.HF_TOKEN))
    if on_progress:
        on_progress("whisperx: starting", None)

    proc = subprocess.Popen(
        cmd, env=env,
        stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
        text=True, bufsize=1,
    )

    # Drain stderr/stdout combined; report progress lines.
    def _drain():
        for line in proc.stdout:
            line = line.rstrip()
            if not line:
                continue
            log.debug("[whisperx] %s", line)
            if on_progress:
                on_progress(line, None)

    t = threading.Thread(target=_drain, daemon=True, name="whisperx_drain")
    t.start()
    proc.wait()
    t.join(timeout=2)

    if proc.returncode != 0:
        raise RuntimeError(f"whisperx exited with status {proc.returncode}")

    out_json = output_dir / f"{wav_path.stem}.json"
    if not out_json.exists():
        raise RuntimeError(f"whisperx finished but {out_json.name} not found")
    return json.loads(out_json.read_text())
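
A hedged usage sketch for `run_whisperx` (paths are illustrative; requires `HF_TOKEN` and a whisperx CLI configured via `cht.config`):

    from pathlib import Path

    from cht.summary.diarize import run_whisperx

    def show(line: str, frac) -> None:
        # whisperx output lines are passed through verbatim; frac is unused here.
        print("[whisperx]", line)

    diarized = run_whisperx(
        Path("sessions/demo/summary/full.wav"),   # hypothetical assembled WAV
        Path("sessions/demo/summary"),
        num_speakers=2,
        on_progress=show,
    )
    print(len(diarized.get("segments", [])), "diarized segments")
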

cht/summary/merger.py (Normal file, 88 lines)
@@ -0,0 +1,88 @@
"""Interleave diarized audio segments with selected screen frames by timestamp.

Direct port of mts/meetus/transcript_merger.py:merge_transcripts (line 162).
"""

import logging

log = logging.getLogger(__name__)


def merge(audio_segments: list[dict], frame_segments: list[dict],
          name_map: dict[str, str] | None = None) -> list[dict]:
    """Combine and group by speaker; screen frames break speaker groups.

    `audio_segments`: each {timestamp, text, speaker?}.
    `frame_segments`: each {timestamp, frame_path}.
    `name_map`: optional SPEAKER_xx → real name remap, applied to outputs.
    Returns merged list sorted by timestamp.
    """
    name_map = name_map or {}

    audio = [{**s, "type": "audio"} for s in audio_segments]
    screen = [{**s, "type": "screen"} for s in frame_segments]
    all_segs = sorted(audio + screen, key=lambda x: x["timestamp"])

    grouped: list[dict] = []
    current = None

    def _label(speaker):
        if not speaker:
            return None
        return name_map.get(speaker, speaker)

    for seg in all_segs:
        if seg["type"] == "screen":
            if current is not None:
                grouped.append(current)
                current = None
            grouped.append(seg)
            continue

        speaker = _label(seg.get("speaker"))
        if current is None:
            current = {
                "timestamp": seg["timestamp"],
                "text": seg["text"],
                "speaker": speaker,
                "type": "audio",
            }
        elif speaker == current.get("speaker"):
            current["text"] += " " + seg["text"]
        else:
            grouped.append(current)
            current = {
                "timestamp": seg["timestamp"],
                "text": seg["text"],
                "speaker": speaker,
                "type": "audio",
            }

    if current is not None:
        grouped.append(current)
    return grouped


def whisperx_to_audio_segments(diarized: dict) -> list[dict]:
    """Convert whisperx JSON segments to the merger's audio format."""
    out = []
    for seg in diarized.get("segments", []):
        text = (seg.get("text") or "").strip()
        if not text:
            continue
        out.append({
            "timestamp": float(seg.get("start", 0.0)),
            "text": text,
            "speaker": seg.get("speaker"),
        })
    return out


def collect_speakers(diarized: dict) -> list[str]:
    """Distinct SPEAKER_xx labels found in the diarization, sorted."""
    seen = set()
    for seg in diarized.get("segments", []):
        sp = seg.get("speaker")
        if sp:
            seen.add(sp)
    return sorted(seen)
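
A small worked example of `merge` with made-up segments: the two consecutive SPEAKER_00 lines collapse into one turn labelled Alice, and the screen frame at 4.0 s breaks the grouping:

    from cht.summary.merger import merge

    audio_segs = [
        {"timestamp": 0.0, "text": "Hi everyone.", "speaker": "SPEAKER_00"},
        {"timestamp": 2.5, "text": "Let's start.", "speaker": "SPEAKER_00"},
        {"timestamp": 6.0, "text": "Sounds good.", "speaker": "SPEAKER_01"},
    ]
    frame_segs = [{"timestamp": 4.0, "frame_path": "frames/frame_0001.jpg"}]

    merged = merge(audio_segs, frame_segs, name_map={"SPEAKER_00": "Alice"})
    # -> [
    #   {"timestamp": 0.0, "type": "audio", "speaker": "Alice", "text": "Hi everyone. Let's start."},
    #   {"timestamp": 4.0, "type": "screen", "frame_path": "frames/frame_0001.jpg"},
    #   {"timestamp": 6.0, "type": "audio", "speaker": "SPEAKER_01", "text": "Sounds good."},
    # ]
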

cht/summary/output.py (Normal file, 75 lines)
@@ -0,0 +1,75 @@
"""Format merged segments as a sequential LLM-ready transcript.

Direct port of mts/meetus/transcript_merger.py:_format_detailed (line 249).
"""

import json
import logging
from pathlib import Path

log = logging.getLogger(__name__)


def format_detailed(merged_segments: list[dict], *, frames_relative_to: Path | None = None) -> str:
    """Render the interleaved transcript.

    If `frames_relative_to` is given, frame paths are rewritten relative to it.
    """
    lines = []
    lines.append("=" * 80)
    lines.append("ENHANCED MEETING TRANSCRIPT")
    lines.append("Audio transcript + Screen frames")
    lines.append("=" * 80)
    lines.append("")

    for seg in merged_segments:
        ts = _format_timestamp(seg["timestamp"])
        if seg["type"] == "audio":
            speaker = seg.get("speaker") or "SPEAKER"
            lines.append(f"[{ts}] {speaker}:")
            lines.append(f" {seg['text']}")
            lines.append("")
        else:
            lines.append(f"[{ts}] SCREEN CONTENT:")
            fp = seg.get("frame_path")
            if fp:
                if frames_relative_to is not None:
                    try:
                        fp = str(Path(fp).resolve().relative_to(frames_relative_to.resolve()))
                    except ValueError:
                        fp = str(fp)
                else:
                    fp = str(fp)
                lines.append(f" Frame: {fp}")
            lines.append("")

    return "\n".join(lines)


def _format_timestamp(seconds: float) -> str:
    seconds = int(seconds)
    h, rem = divmod(seconds, 3600)
    m, s = divmod(rem, 60)
    if h:
        return f"{h:02d}:{m:02d}:{s:02d}"
    return f"{m:02d}:{s:02d}"


def write_outputs(session_dir: Path, merged: list[dict], *, name: str | None = None) -> Path:
    """Write `<name>_enhanced.txt` and `merged.json` under `session_dir/summary`.

    Returns the path of the enhanced transcript.
    """
    summary_dir = session_dir / "summary"
    summary_dir.mkdir(parents=True, exist_ok=True)
    name = name or session_dir.name

    text = format_detailed(merged, frames_relative_to=session_dir)
    text_path = summary_dir / f"{name}_enhanced.txt"
    text_path.write_text(text)

    merged_path = summary_dir / "merged.json"
    merged_path.write_text(json.dumps(merged, indent=2, default=str))

    log.info("Wrote %s (%d entries)", text_path, len(merged))
    return text_path
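
A rendering sketch using the merged example above (timestamps under one hour render as MM:SS, per `_format_timestamp`); the segment values are made up:

    from cht.summary.output import format_detailed

    merged = [
        {"timestamp": 0.0, "type": "audio", "speaker": "Alice", "text": "Hi everyone. Let's start."},
        {"timestamp": 4.0, "type": "screen", "frame_path": "frames/frame_0001.jpg"},
        {"timestamp": 65.0, "type": "audio", "speaker": "SPEAKER_01", "text": "Sounds good."},
    ]
    print(format_detailed(merged))
    # After the banner, the body reads roughly:
    #   [00:00] Alice:
    #    Hi everyone. Let's start.
    #   [00:04] SCREEN CONTENT:
    #    Frame: frames/frame_0001.jpg
    #   [01:05] SPEAKER_01:
    #    Sounds good.
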

cht/summary/pipeline.py (Normal file, 72 lines)
@@ -0,0 +1,72 @@
"""End-to-end orchestrator for the summarization export.

Two operations:
  diarize(...) — heavy: assembles audio, runs whisperx, caches diarized.json.
  export(...)  — cheap: merges cached diarization with selected frames and
                 writes <session>_enhanced.txt. Re-run any time the user
                 tweaks frame selection or speaker names.
"""

import json
import logging
from pathlib import Path

from cht.session import load_frame_index
from cht.summary import audio, diarize, merger, output

log = logging.getLogger(__name__)


def diarized_path(session_dir: Path) -> Path:
    return session_dir / "summary" / "diarized.json"


def has_diarization(session_dir: Path) -> bool:
    return diarized_path(session_dir).exists()


def load_diarization(session_dir: Path) -> dict:
    return json.loads(diarized_path(session_dir).read_text())


def run_diarization(session_dir: Path, *, num_speakers: int, on_progress=None) -> dict:
    """Assemble audio, run whisperx, cache and return the JSON."""
    if on_progress:
        on_progress("assembling audio", 0.05)
    wav = audio.assemble_session_wav(session_dir)

    if on_progress:
        on_progress("running whisperx", 0.15)
    summary_dir = session_dir / "summary"
    diarized = diarize.run_whisperx(
        wav, summary_dir,
        num_speakers=num_speakers,
        on_progress=lambda line, _frac: on_progress(line, None) if on_progress else None,
    )

    diarized_path(session_dir).write_text(json.dumps(diarized, indent=2))
    if on_progress:
        on_progress("diarization done", 1.0)
    return diarized


def export(session_dir: Path,
           *,
           selected_frame_ids: set[str] | None = None,
           name_map: dict[str, str] | None = None) -> Path:
    """Merge cached diarization + selected frames and write enhanced.txt."""
    if not has_diarization(session_dir):
        raise RuntimeError("No diarization available — run diarization first.")
    diarized = load_diarization(session_dir)
    audio_segs = merger.whisperx_to_audio_segments(diarized)

    frames = load_frame_index(session_dir / "frames")
    if selected_frame_ids is not None:
        frames = [f for f in frames if f["id"] in selected_frame_ids]
    frame_segs = [
        {"timestamp": f["timestamp"], "frame_path": str(f["path"])}
        for f in frames
    ]

    merged = merger.merge(audio_segs, frame_segs, name_map=name_map)
    return output.write_outputs(session_dir, merged)
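
End-to-end sketch of the two-step flow (paths are illustrative; the frame-id values are hypothetical since `load_frame_index` is not part of this diff):

    from pathlib import Path

    from cht.summary import pipeline

    session = Path("sessions/demo")

    # Heavy step, run once: assemble audio, run whisperx, cache summary/diarized.json.
    if not pipeline.has_diarization(session):
        pipeline.run_diarization(session, num_speakers=2,
                                 on_progress=lambda msg, frac: print(msg, frac))

    # Cheap step, re-runnable whenever frame selection or speaker names change.
    enhanced = pipeline.export(
        session,
        selected_frame_ids={"frame_0001", "frame_0007"},   # hypothetical ids
        name_map={"SPEAKER_00": "Alice", "SPEAKER_01": "Bob"},
    )
    print("wrote", enhanced)   # sessions/demo/summary/demo_enhanced.txt
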