add whisperx support

This commit is contained in:
Mariano Gabriel
2025-12-03 06:48:45 -03:00
parent 7b919beda6
commit 7d7ec15ff7
4 changed files with 87 additions and 16 deletions

View File

@@ -4,6 +4,7 @@ Coordinates frame extraction, analysis, and transcript merging.
"""
from pathlib import Path
import logging
import os
import subprocess
import shutil
from typing import Dict, Any, Optional
@@ -32,6 +33,7 @@ class WorkflowConfig:
# Whisper options
self.run_whisper = kwargs.get('run_whisper', False)
self.whisper_model = kwargs.get('whisper_model', 'medium')
self.diarize = kwargs.get('diarize', False)
# Frame extraction
self.scene_detection = kwargs.get('scene_detection', False)
@@ -176,18 +178,27 @@ class ProcessingWorkflow:
if cached:
return str(cached)
# If no cache and not running whisper, use provided transcript path (if any)
if not self.config.run_whisper:
# If no cache and not running whisper/diarize, use provided transcript path (if any)
if not self.config.run_whisper and not self.config.diarize:
return self.config.transcript_path
logger.info("=" * 80)
logger.info("STEP 0: Running Whisper Transcription")
logger.info("=" * 80)
# Check if whisperx is installed
if not shutil.which("whisperx"):
logger.error("WhisperX is not installed. Install it with: pip install whisperx")
raise RuntimeError("WhisperX not installed")
# Determine which transcription tool to use
use_diarize = getattr(self.config, 'diarize', False)
if use_diarize:
if not shutil.which("whisperx"):
logger.error("WhisperX is not installed. Install it with: pip install whisperx")
raise RuntimeError("WhisperX not installed (required for --diarize)")
transcribe_cmd = "whisperx"
else:
if not shutil.which("whisper"):
logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
raise RuntimeError("Whisper not installed")
transcribe_cmd = "whisper"
# Unload Ollama model to free GPU memory for Whisper (if using vision)
if self.config.use_vision:
@@ -199,21 +210,34 @@ class ProcessingWorkflow:
except Exception as e:
logger.warning(f"Could not unload Ollama model: {e}")
logger.info(f"Running WhisperX transcription with diarization (model: {self.config.whisper_model})...")
if use_diarize:
logger.info(f"Running WhisperX transcription with diarization (model: {self.config.whisper_model})...")
else:
logger.info(f"Running Whisper transcription (model: {self.config.whisper_model})...")
logger.info("This may take a few minutes depending on video length...")
# Run whisperx command with diarization
# Build command
cmd = [
"whisperx",
transcribe_cmd,
str(self.config.video_path),
"--model", self.config.whisper_model,
"--output_format", "json",
"--output_dir", str(self.output_mgr.output_dir),
"--diarize",
]
if use_diarize:
cmd.append("--diarize")
try:
subprocess.run(cmd, check=True, capture_output=True, text=True)
# Set up environment with cuDNN library path for whisperx
env = os.environ.copy()
if use_diarize:
import site
site_packages = site.getsitepackages()[0]
cudnn_path = Path(site_packages) / "nvidia" / "cudnn" / "lib"
if cudnn_path.exists():
env["LD_LIBRARY_PATH"] = str(cudnn_path) + ":" + env.get("LD_LIBRARY_PATH", "")
subprocess.run(cmd, check=True, capture_output=True, text=True, env=env)
transcript_path = self.output_mgr.get_path(f"{self.config.video_path.stem}.json")