transcript updates

This commit is contained in:
2026-04-03 03:18:09 -03:00
parent db3b94a6a1
commit d14390a649
3 changed files with 64 additions and 9 deletions

View File

@@ -26,5 +26,7 @@ AUDIO_SAFETY_MARGIN = 2 # seconds safety margin (matches scene detector)
WAVEFORM_BUCKET_MS = 50 # milliseconds per waveform peak bucket WAVEFORM_BUCKET_MS = 50 # milliseconds per waveform peak bucket
# Transcription # Transcription
WHISPER_MODEL = "small" # "small" for speed, "medium" for accuracy WHISPER_MODEL = "medium" # "small" for speed, "medium" for accuracy
WHISPER_DEVICE = "cuda" # "cuda" or "cpu" WHISPER_DEVICE = "cuda" # "cuda" or "cpu"
TRANSCRIBE_MIN_CHUNK_S = 5 # minimum seconds of audio before transcribing
TRANSCRIBE_LINES_PER_GROUP = 3 # whisper segments grouped per transcript ID (1-5)

View File

@@ -39,6 +39,16 @@ def list_sessions():
return sessions return sessions
def delete_sessions(session_ids):
    """Delete session directories by ID.

    Each ID is joined onto ``SESSIONS_DIR``; the resulting directory is
    removed recursively if it exists. IDs that do not name a direct child
    of ``SESSIONS_DIR`` (empty, ``"."``, ``".."``, nested or absolute
    paths) are skipped with a warning instead of being deleted.

    Args:
        session_ids: Iterable of session IDs (directory names).

    Raises:
        OSError: Propagated from ``shutil.rmtree`` if a removal fails.
    """
    import shutil

    for sid in session_ids:
        path = SESSIONS_DIR / sid
        # Guard against IDs that resolve to SESSIONS_DIR itself ("" or ".")
        # or escape it ("..", "a/b", "/abs"): pathlib keeps ".." components
        # and lets an absolute right-hand side replace the base, so without
        # this check rmtree could wipe every session or an unrelated tree.
        if path.parent != SESSIONS_DIR or path.name in ("", ".."):
            log.warning("Refusing to delete invalid session id: %r", sid)
            continue
        if path.exists() and path.is_dir():
            shutil.rmtree(path)
            log.info("Deleted session: %s", sid)
class StreamManager: class StreamManager:
def __init__(self, session_id=None): def __init__(self, session_id=None):
if session_id is None: if session_id is None:

View File

@@ -59,32 +59,75 @@ class TranscriberEngine:
return [] return []
self._ensure_model() self._ensure_model()
try: try:
kwargs = {"beam_size": 5, "vad_filter": True} kwargs = {
"beam_size": 5,
"vad_filter": True,
"condition_on_previous_text": True,
}
if self.language: if self.language:
kwargs["language"] = self.language kwargs["language"] = self.language
# Feed last transcript text as context for better continuity
if self._segments:
kwargs["initial_prompt"] = self._segments[-1].text
segments_iter, info = self._model.transcribe(str(wav_path), **kwargs) segments_iter, info = self._model.transcribe(str(wav_path), **kwargs)
except Exception as e: except Exception as e:
log.error("Whisper transcription failed: %s", e) log.error("Whisper transcription failed: %s", e)
return [] return []
# Group whisper segments: new T-ID every N lines or on silence gap (>1s)
from cht.config import TRANSCRIBE_LINES_PER_GROUP
lines_per_group = TRANSCRIBE_LINES_PER_GROUP
SILENCE_GAP_S = 1.0
raw_segs = []
for seg in segments_iter:
text = seg.text.strip()
if text:
raw_segs.append((time_offset + seg.start, time_offset + seg.end, text))
new_segments = [] new_segments = []
with self._lock: with self._lock:
if self._stopped: if self._stopped:
return [] return []
for seg in segments_iter:
text = seg.text.strip() group_start = None
if not text: group_end = None
continue group_lines = []
prev_end = None
def _flush():
nonlocal group_start, group_end, group_lines
if not group_lines:
return
tid = f"T{self._next_id:04d}" tid = f"T{self._next_id:04d}"
self._next_id += 1 self._next_id += 1
entry = TranscriptSegment( entry = TranscriptSegment(
id=tid, id=tid,
start=time_offset + seg.start, start=group_start,
end=time_offset + seg.end, end=group_end,
text=text, text=" ".join(group_lines),
) )
self._segments.append(entry) self._segments.append(entry)
new_segments.append(entry) new_segments.append(entry)
group_lines = []
group_start = None
group_end = None
for start, end, text in raw_segs:
# Silence gap → flush current group
if prev_end is not None and start - prev_end > SILENCE_GAP_S:
_flush()
if group_start is None:
group_start = start
group_end = end
group_lines.append(text)
prev_end = end
if len(group_lines) >= lines_per_group:
_flush()
_flush()
return new_segments return new_segments