From d14390a649a5f139ed6cdb5a8b27cce7ade11232 Mon Sep 17 00:00:00 2001 From: buenosairesam Date: Fri, 3 Apr 2026 03:18:09 -0300 Subject: [PATCH] transcript updates --- cht/config.py | 4 ++- cht/stream/manager.py | 10 +++++++ cht/transcriber/engine.py | 59 +++++++++++++++++++++++++++++++++------ 3 files changed, 64 insertions(+), 9 deletions(-) diff --git a/cht/config.py b/cht/config.py index faeb614..53f9b57 100644 --- a/cht/config.py +++ b/cht/config.py @@ -26,5 +26,7 @@ AUDIO_SAFETY_MARGIN = 2 # seconds safety margin (matches scene detector) WAVEFORM_BUCKET_MS = 50 # milliseconds per waveform peak bucket # Transcription -WHISPER_MODEL = "small" # "small" for speed, "medium" for accuracy +WHISPER_MODEL = "medium" # "small" for speed, "medium" for accuracy WHISPER_DEVICE = "cuda" # "cuda" or "cpu" +TRANSCRIBE_MIN_CHUNK_S = 5 # minimum seconds of audio before transcribing +TRANSCRIBE_LINES_PER_GROUP = 3 # whisper segments grouped per transcript ID (1-5) diff --git a/cht/stream/manager.py b/cht/stream/manager.py index b2b9a15..eb192e3 100644 --- a/cht/stream/manager.py +++ b/cht/stream/manager.py @@ -39,6 +39,16 @@ def list_sessions(): return sessions +def delete_sessions(session_ids): + """Delete session directories by ID.""" + import shutil + for sid in session_ids: + path = SESSIONS_DIR / sid + if path.exists() and path.is_dir(): + shutil.rmtree(path) + log.info("Deleted session: %s", sid) + + class StreamManager: def __init__(self, session_id=None): if session_id is None: diff --git a/cht/transcriber/engine.py b/cht/transcriber/engine.py index 15fe35c..beb785f 100644 --- a/cht/transcriber/engine.py +++ b/cht/transcriber/engine.py @@ -59,32 +59,75 @@ class TranscriberEngine: return [] self._ensure_model() try: - kwargs = {"beam_size": 5, "vad_filter": True} + kwargs = { + "beam_size": 5, + "vad_filter": True, + "condition_on_previous_text": True, + } if self.language: kwargs["language"] = self.language + # Feed last transcript text as context for better continuity + if self._segments: + kwargs["initial_prompt"] = self._segments[-1].text segments_iter, info = self._model.transcribe(str(wav_path), **kwargs) except Exception as e: log.error("Whisper transcription failed: %s", e) return [] + # Group whisper segments: new T-ID every N lines or on silence gap (>1s) + from cht.config import TRANSCRIBE_LINES_PER_GROUP + lines_per_group = TRANSCRIBE_LINES_PER_GROUP + SILENCE_GAP_S = 1.0 + + raw_segs = [] + for seg in segments_iter: + text = seg.text.strip() + if text: + raw_segs.append((time_offset + seg.start, time_offset + seg.end, text)) + new_segments = [] with self._lock: if self._stopped: return [] - for seg in segments_iter: - text = seg.text.strip() - if not text: - continue + + group_start = None + group_end = None + group_lines = [] + prev_end = None + + def _flush(): + nonlocal group_start, group_end, group_lines + if not group_lines: + return tid = f"T{self._next_id:04d}" self._next_id += 1 entry = TranscriptSegment( id=tid, - start=time_offset + seg.start, - end=time_offset + seg.end, - text=text, + start=group_start, + end=group_end, + text=" ".join(group_lines), ) self._segments.append(entry) new_segments.append(entry) + group_lines = [] + group_start = None + group_end = None + + for start, end, text in raw_segs: + # Silence gap → flush current group + if prev_end is not None and start - prev_end > SILENCE_GAP_S: + _flush() + + if group_start is None: + group_start = start + group_end = end + group_lines.append(text) + prev_end = end + + if len(group_lines) >= lines_per_group: + _flush() + + _flush() return new_segments