Add WhisperX support

This commit is contained in:
Mariano Gabriel
2025-12-02 02:33:39 -03:00
parent 118ef04223
commit 7b919beda6
4 changed files with 155 additions and 38 deletions

View File

@@ -45,14 +45,15 @@ class TranscriptMerger:
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
# Handle different Whisper output formats
# Handle different Whisper/WhisperX output formats
segments = []
if isinstance(data, dict) and 'segments' in data:
# Standard Whisper JSON format
# Standard Whisper/WhisperX JSON format
segments = [
{
'timestamp': seg.get('start', 0),
'text': seg['text'].strip(),
'speaker': seg.get('speaker'), # WhisperX diarization
'type': 'audio'
}
for seg in data['segments']
@@ -63,6 +64,7 @@ class TranscriptMerger:
{
'timestamp': seg.get('start', seg.get('timestamp', 0)),
'text': seg['text'].strip(),
'speaker': seg.get('speaker'), # WhisperX diarization
'type': 'audio'
}
for seg in data
@@ -207,35 +209,28 @@ class TranscriptMerger:
lines = []
lines.append("=" * 80)
lines.append("ENHANCED MEETING TRANSCRIPT")
if self.embed_images:
lines.append("Audio transcript + Embedded frame images (base64)")
else:
lines.append("Audio transcript + Screen content")
lines.append("Audio transcript + Screen frames")
lines.append("=" * 80)
lines.append("")
total_image_bytes = 0
for seg in segments:
timestamp = self._format_timestamp(seg['timestamp'])
if seg['type'] == 'audio':
lines.append(f"[{timestamp}] SPEAKER:")
speaker = seg.get('speaker', 'SPEAKER')
lines.append(f"[{timestamp}] {speaker}:")
lines.append(f" {seg['text']}")
lines.append("")
else: # screen
lines.append(f"[{timestamp}] SCREEN CONTENT:")
# Embed image if requested
if self.embed_images and 'frame_path' in seg:
b64_img, img_size = self._encode_image_base64(seg['frame_path'])
total_image_bytes += img_size
if b64_img:
lines.append(f" IMAGE (base64, {img_size // 1024}KB):")
lines.append(f" <image>data:image/jpeg;base64,{b64_img}</image>")
lines.append("")
# Show frame path if available
if 'frame_path' in seg:
# Get just the filename relative to the enhanced transcript
frame_path = Path(seg['frame_path'])
relative_path = f"frames/{frame_path.name}"
lines.append(f" Frame: {relative_path}")
# Include text content if available (fallback or additional context)
if 'text' in seg and seg['text'].strip():
@@ -245,12 +240,6 @@ class TranscriptMerger:
lines.append("")
if self.embed_images and total_image_bytes > 0:
total_mb = total_image_bytes / (1024 * 1024)
lines.append("")
lines.append(f"Total embedded images size: {total_mb:.2f} MB")
logger.info(f"Embedded {len([s for s in segments if s['type'] == 'screen'])} images, total size: {total_mb:.2f} MB")
return "\n".join(lines)
def _format_compact(self, segments: List[Dict]) -> str:
@@ -259,7 +248,10 @@ class TranscriptMerger:
for seg in segments:
timestamp = self._format_timestamp(seg['timestamp'])
prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN"
if seg['type'] == 'audio':
prefix = seg.get('speaker', 'SPEAKER')
else:
prefix = "SCREEN"
text = seg['text'].replace('\n', ' ')[:200] # Truncate long screen text
lines.append(f"[{timestamp}] {prefix}: {text}")