Add WhisperX support
This commit is contained in:
@@ -45,14 +45,15 @@ class TranscriptMerger:
|
||||
with open(path, 'r', encoding='utf-8') as f:
|
||||
data = json.load(f)
|
||||
|
||||
# Handle different Whisper output formats
|
||||
# Handle different Whisper/WhisperX output formats
|
||||
segments = []
|
||||
if isinstance(data, dict) and 'segments' in data:
|
||||
# Standard Whisper JSON format
|
||||
# Standard Whisper/WhisperX JSON format
|
||||
segments = [
|
||||
{
|
||||
'timestamp': seg.get('start', 0),
|
||||
'text': seg['text'].strip(),
|
||||
'speaker': seg.get('speaker'), # WhisperX diarization
|
||||
'type': 'audio'
|
||||
}
|
||||
for seg in data['segments']
|
||||
@@ -63,6 +64,7 @@ class TranscriptMerger:
|
||||
{
|
||||
'timestamp': seg.get('start', seg.get('timestamp', 0)),
|
||||
'text': seg['text'].strip(),
|
||||
'speaker': seg.get('speaker'), # WhisperX diarization
|
||||
'type': 'audio'
|
||||
}
|
||||
for seg in data
|
||||
@@ -207,35 +209,28 @@ class TranscriptMerger:
|
||||
lines = []
|
||||
lines.append("=" * 80)
|
||||
lines.append("ENHANCED MEETING TRANSCRIPT")
|
||||
if self.embed_images:
|
||||
lines.append("Audio transcript + Embedded frame images (base64)")
|
||||
else:
|
||||
lines.append("Audio transcript + Screen content")
|
||||
lines.append("Audio transcript + Screen frames")
|
||||
lines.append("=" * 80)
|
||||
lines.append("")
|
||||
|
||||
total_image_bytes = 0
|
||||
|
||||
for seg in segments:
|
||||
timestamp = self._format_timestamp(seg['timestamp'])
|
||||
|
||||
if seg['type'] == 'audio':
|
||||
lines.append(f"[{timestamp}] SPEAKER:")
|
||||
speaker = seg.get('speaker', 'SPEAKER')
|
||||
lines.append(f"[{timestamp}] {speaker}:")
|
||||
lines.append(f" {seg['text']}")
|
||||
lines.append("")
|
||||
|
||||
else: # screen
|
||||
lines.append(f"[{timestamp}] SCREEN CONTENT:")
|
||||
|
||||
# Embed image if requested
|
||||
if self.embed_images and 'frame_path' in seg:
|
||||
b64_img, img_size = self._encode_image_base64(seg['frame_path'])
|
||||
total_image_bytes += img_size
|
||||
|
||||
if b64_img:
|
||||
lines.append(f" IMAGE (base64, {img_size // 1024}KB):")
|
||||
lines.append(f" <image>data:image/jpeg;base64,{b64_img}</image>")
|
||||
lines.append("")
|
||||
# Show frame path if available
|
||||
if 'frame_path' in seg:
|
||||
# Get just the filename relative to the enhanced transcript
|
||||
frame_path = Path(seg['frame_path'])
|
||||
relative_path = f"frames/{frame_path.name}"
|
||||
lines.append(f" Frame: {relative_path}")
|
||||
|
||||
# Include text content if available (fallback or additional context)
|
||||
if 'text' in seg and seg['text'].strip():
|
||||
@@ -245,12 +240,6 @@ class TranscriptMerger:
|
||||
|
||||
lines.append("")
|
||||
|
||||
if self.embed_images and total_image_bytes > 0:
|
||||
total_mb = total_image_bytes / (1024 * 1024)
|
||||
lines.append("")
|
||||
lines.append(f"Total embedded images size: {total_mb:.2f} MB")
|
||||
logger.info(f"Embedded {len([s for s in segments if s['type'] == 'screen'])} images, total size: {total_mb:.2f} MB")
|
||||
|
||||
return "\n".join(lines)
|
||||
|
||||
def _format_compact(self, segments: List[Dict]) -> str:
|
||||
@@ -259,7 +248,10 @@ class TranscriptMerger:
|
||||
|
||||
for seg in segments:
|
||||
timestamp = self._format_timestamp(seg['timestamp'])
|
||||
prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN"
|
||||
if seg['type'] == 'audio':
|
||||
prefix = seg.get('speaker', 'SPEAKER')
|
||||
else:
|
||||
prefix = "SCREEN"
|
||||
text = seg['text'].replace('\n', ' ')[:200] # Truncate long screen text
|
||||
lines.append(f"[{timestamp}] {prefix}: {text}")
|
||||
|
||||
|
||||
@@ -184,10 +184,10 @@ class ProcessingWorkflow:
|
||||
logger.info("STEP 0: Running Whisper Transcription")
|
||||
logger.info("=" * 80)
|
||||
|
||||
# Check if whisper is installed
|
||||
if not shutil.which("whisper"):
|
||||
logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
|
||||
raise RuntimeError("Whisper not installed")
|
||||
# Check if whisperx is installed
|
||||
if not shutil.which("whisperx"):
|
||||
logger.error("WhisperX is not installed. Install it with: pip install whisperx")
|
||||
raise RuntimeError("WhisperX not installed")
|
||||
|
||||
# Unload Ollama model to free GPU memory for Whisper (if using vision)
|
||||
if self.config.use_vision:
|
||||
@@ -199,16 +199,17 @@ class ProcessingWorkflow:
|
||||
except Exception as e:
|
||||
logger.warning(f"Could not unload Ollama model: {e}")
|
||||
|
||||
logger.info(f"Running Whisper transcription (model: {self.config.whisper_model})...")
|
||||
logger.info(f"Running WhisperX transcription with diarization (model: {self.config.whisper_model})...")
|
||||
logger.info("This may take a few minutes depending on video length...")
|
||||
|
||||
# Run whisper command
|
||||
# Run whisperx command with diarization
|
||||
cmd = [
|
||||
"whisper",
|
||||
"whisperx",
|
||||
str(self.config.video_path),
|
||||
"--model", self.config.whisper_model,
|
||||
"--output_format", "json",
|
||||
"--output_dir", str(self.output_mgr.output_dir)
|
||||
"--output_dir", str(self.output_mgr.output_dir),
|
||||
"--diarize",
|
||||
]
|
||||
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user