Files
mitus/meetus/workflow.py
Mariano Gabriel cd7b0aed07 refactor
2025-10-20 00:03:41 -03:00

317 lines
12 KiB
Python

"""
Orchestrate the video processing workflow.
Coordinates frame extraction, analysis, and transcript merging.
"""
from pathlib import Path
import logging
import subprocess
import shutil
from typing import Dict, Any, Optional
from .output_manager import OutputManager
from .cache_manager import CacheManager
from .frame_extractor import FrameExtractor
from .ocr_processor import OCRProcessor
from .vision_processor import VisionProcessor
from .transcript_merger import TranscriptMerger
logger = logging.getLogger(__name__)
class WorkflowConfig:
    """Holds all tunable settings for a single processing run.

    Values are taken from keyword arguments (typically parsed CLI
    options); anything not supplied falls back to a default. Only
    ``video`` is required.
    """

    def __init__(self, **kwargs):
        """Populate the configuration from keyword arguments."""
        opt = kwargs.get
        # --- video and filesystem locations ---
        self.video_path = Path(kwargs['video'])  # required; KeyError if absent
        self.transcript_path = opt('transcript')
        self.output_dir = opt('output_dir', 'output')
        self.custom_output = opt('output')
        # --- Whisper transcription ---
        self.run_whisper = opt('run_whisper', False)
        self.whisper_model = opt('whisper_model', 'base')
        # --- frame extraction ---
        self.scene_detection = opt('scene_detection', False)
        self.interval = opt('interval', 5)
        # --- frame analysis ---
        self.use_vision = opt('use_vision', False)
        self.vision_model = opt('vision_model', 'llava:13b')
        self.vision_context = opt('vision_context', 'meeting')
        self.ocr_engine = opt('ocr_engine', 'tesseract')
        # --- processing switches ---
        self.no_deduplicate = opt('no_deduplicate', False)
        self.no_cache = opt('no_cache', False)
        self.extract_only = opt('extract_only', False)
        self.format = opt('format', 'detailed')

    def to_dict(self) -> Dict[str, Any]:
        """Convert config to dictionary for manifest."""
        by_scene = self.scene_detection
        vision = self.use_vision
        whisper_section = {
            "enabled": self.run_whisper,
            "model": self.whisper_model,
        }
        extraction_section = {
            "method": "scene_detection" if by_scene else "interval",
            "interval_seconds": None if by_scene else self.interval,
        }
        analysis_section = {
            "method": "vision" if vision else "ocr",
            "vision_model": self.vision_model if vision else None,
            "vision_context": self.vision_context if vision else None,
            "ocr_engine": None if vision else self.ocr_engine,
            "deduplication": not self.no_deduplicate,
        }
        return {
            "whisper": whisper_section,
            "frame_extraction": extraction_section,
            "analysis": analysis_section,
            "output_format": self.format,
        }
class ProcessingWorkflow:
    """Orchestrate the complete video processing workflow.

    Pipeline (see run()):
        0. Optional Whisper audio transcription.
        1. Frame extraction (scene detection or fixed interval).
        2. Frame analysis (vision model or OCR).
        3. Merge screen content with the audio transcript.

    Intermediate results are looked up through CacheManager so repeated
    runs on the same video can skip completed steps (unless caching was
    disabled via the config).
    """

    def __init__(self, config: WorkflowConfig):
        """
        Initialize workflow.

        Args:
            config: Workflow configuration
        """
        self.config = config
        # Owns the on-disk layout: output directory and frames subdirectory.
        self.output_mgr = OutputManager(
            config.video_path,
            config.output_dir,
            use_cache=not config.no_cache
        )
        # Caches are keyed by the video filename stem; disabled entirely
        # when the config requested no caching.
        self.cache_mgr = CacheManager(
            self.output_mgr.output_dir,
            self.output_mgr.frames_dir,
            config.video_path.stem,
            use_cache=not config.no_cache
        )

    def run(self) -> Dict[str, Any]:
        """
        Run the complete processing workflow.

        Returns:
            Dictionary with output paths and status (see _build_result).

        Raises:
            RuntimeError: if frame extraction yields no frames, or if
                Whisper was requested but is not installed / produced
                no output file.
            subprocess.CalledProcessError: if the whisper command fails.
        """
        # Banner summarizing the effective configuration.
        logger.info("=" * 80)
        logger.info("MEETING PROCESSOR")
        logger.info("=" * 80)
        logger.info(f"Video: {self.config.video_path.name}")
        logger.info(f"Analysis: {'Vision Model' if self.config.use_vision else f'OCR ({self.config.ocr_engine})'}")
        if self.config.use_vision:
            logger.info(f"Vision Model: {self.config.vision_model}")
            logger.info(f"Context: {self.config.vision_context}")
        logger.info(f"Frame extraction: {'Scene detection' if self.config.scene_detection else f'Every {self.config.interval}s'}")
        logger.info(f"Caching: {'Disabled' if self.config.no_cache else 'Enabled'}")
        logger.info("=" * 80)
        # Step 0: Whisper transcription (or pass through a provided transcript)
        transcript_path = self._run_whisper()
        # Step 1: Extract frames
        frames_info = self._extract_frames()
        if not frames_info:
            logger.error("No frames extracted")
            raise RuntimeError("Frame extraction failed")
        # Step 2: Analyze frames
        screen_segments = self._analyze_frames(frames_info)
        if self.config.extract_only:
            # NOTE(review): extract-only mode returns before the manifest
            # is saved below — confirm that is intentional.
            logger.info("Done! (extract-only mode)")
            return self._build_result(transcript_path, screen_segments)
        # Step 3: Merge with transcript
        enhanced_transcript = self._merge_transcripts(transcript_path, screen_segments)
        # Save manifest recording the configuration used for this run
        self.output_mgr.save_manifest(self.config.to_dict())
        # Build final result
        return self._build_result(transcript_path, screen_segments, enhanced_transcript)

    def _run_whisper(self) -> Optional[str]:
        """Run Whisper transcription if requested.

        Returns:
            Path (as str) to the generated transcript JSON; the
            caller-supplied transcript path when Whisper was not
            requested (may be None).

        Raises:
            RuntimeError: if the whisper CLI is not on PATH, or it
                completed without producing the expected output file.
            subprocess.CalledProcessError: if the whisper command fails.
        """
        if not self.config.run_whisper:
            # Fall back to a user-supplied transcript (possibly None).
            return self.config.transcript_path
        # Check cache: reuse a previous transcription of this video.
        cached = self.cache_mgr.get_whisper_cache()
        if cached:
            return str(cached)
        logger.info("=" * 80)
        logger.info("STEP 0: Running Whisper Transcription")
        logger.info("=" * 80)
        # Check if whisper is installed (the CLI must be on PATH)
        if not shutil.which("whisper"):
            logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
            raise RuntimeError("Whisper not installed")
        logger.info(f"Running Whisper transcription (model: {self.config.whisper_model})...")
        logger.info("This may take a few minutes depending on video length...")
        # Run whisper command; JSON output lands in the workflow output dir.
        cmd = [
            "whisper",
            str(self.config.video_path),
            "--model", self.config.whisper_model,
            "--output_format", "json",
            "--output_dir", str(self.output_mgr.output_dir)
        ]
        try:
            subprocess.run(cmd, check=True, capture_output=True, text=True)
            # Whisper names its output after the input file's stem.
            transcript_path = self.output_mgr.get_path(f"{self.config.video_path.stem}.json")
            if transcript_path.exists():
                logger.info(f"✓ Whisper transcription completed: {transcript_path.name}")
                logger.info("")
                return str(transcript_path)
            else:
                logger.error("Whisper completed but transcript file not found")
                raise RuntimeError("Whisper output missing")
        except subprocess.CalledProcessError as e:
            # stderr was captured above, so surface it before re-raising.
            logger.error(f"Whisper failed: {e.stderr}")
            raise

    def _extract_frames(self):
        """Extract frames from video.

        Returns:
            Frame metadata from the cache when available, otherwise the
            value returned by FrameExtractor (presumably a list of frame
            records — confirm against FrameExtractor).
        """
        logger.info("Step 1: Extracting frames from video...")
        # Check cache from a previous run first
        cached_frames = self.cache_mgr.get_frames_cache()
        if cached_frames:
            return cached_frames
        # Extract frames into the managed frames directory
        extractor = FrameExtractor(str(self.config.video_path), str(self.output_mgr.frames_dir))
        if self.config.scene_detection:
            frames_info = extractor.extract_scene_changes()
        else:
            frames_info = extractor.extract_by_interval(self.config.interval)
        logger.info(f"✓ Extracted {len(frames_info)} frames")
        # NOTE(review): no explicit cache save here — presumably the files
        # written to frames_dir serve as the cache; verify in CacheManager.
        return frames_info

    def _analyze_frames(self, frames_info):
        """Analyze frames with vision or OCR.

        Args:
            frames_info: frame metadata produced by _extract_frames().

        Returns:
            Screen-content segments (cached result when available).
        """
        analysis_type = 'vision' if self.config.use_vision else 'ocr'
        # Check cache for a previous analysis of the same kind
        cached_analysis = self.cache_mgr.get_analysis_cache(analysis_type)
        if cached_analysis:
            return cached_analysis
        # Dispatch to the configured analyzer
        if self.config.use_vision:
            return self._run_vision_analysis(frames_info)
        else:
            return self._run_ocr_analysis(frames_info)

    def _run_vision_analysis(self, frames_info):
        """Run vision analysis on frames.

        Args:
            frames_info: frame metadata produced by _extract_frames().

        Returns:
            Screen-content segments produced by VisionProcessor.

        Raises:
            ImportError: propagated when the vision backend's optional
                dependencies are missing.
        """
        logger.info("Step 2: Running vision analysis on extracted frames...")
        try:
            vision = VisionProcessor(model=self.config.vision_model)
            screen_segments = vision.process_frames(
                frames_info,
                context=self.config.vision_context,
                deduplicate=not self.config.no_deduplicate
            )
            logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
            # Cache results so later runs can skip this step
            self.cache_mgr.save_analysis('vision', screen_segments)
            return screen_segments
        except ImportError as e:
            # Missing optional dependency: log the original error, re-raise.
            logger.error(f"{e}")
            raise

    def _run_ocr_analysis(self, frames_info):
        """Run OCR analysis on frames.

        Args:
            frames_info: frame metadata produced by _extract_frames().

        Returns:
            Screen-content segments produced by OCRProcessor.

        Raises:
            ImportError: propagated when the OCR engine's dependencies
                are missing (an install hint is logged first).
        """
        logger.info("Step 2: Running OCR on extracted frames...")
        try:
            ocr = OCRProcessor(engine=self.config.ocr_engine)
            screen_segments = ocr.process_frames(
                frames_info,
                deduplicate=not self.config.no_deduplicate
            )
            logger.info(f"✓ Processed {len(screen_segments)} frames with OCR")
            # Cache results so later runs can skip this step
            self.cache_mgr.save_analysis('ocr', screen_segments)
            return screen_segments
        except ImportError as e:
            # Missing engine: log an install hint before re-raising.
            logger.error(f"{e}")
            logger.error(f"To install {self.config.ocr_engine}:")
            logger.error(f"  pip install {self.config.ocr_engine}")
            raise

    def _merge_transcripts(self, transcript_path, screen_segments):
        """Merge audio and screen transcripts.

        Args:
            transcript_path: path to a Whisper JSON transcript, or None.
            screen_segments: analyzed screen-content segments.

        Returns:
            Path of the saved enhanced transcript (str when a custom
            output was given, otherwise whatever OutputManager.get_path
            returns).
        """
        merger = TranscriptMerger()
        # Load audio transcript if available; a missing file degrades to
        # screen-content-only output instead of failing.
        audio_segments = []
        if transcript_path:
            logger.info("Step 3: Merging with Whisper transcript...")
            transcript_file = Path(transcript_path)
            if not transcript_file.exists():
                logger.warning(f"Transcript not found: {transcript_path}")
                logger.info("Proceeding with screen content only...")
            else:
                audio_segments = merger.load_whisper_transcript(str(transcript_file))
                logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
        else:
            logger.info("No transcript provided, using screen content only...")
        # Merge and format for the configured output style
        merged = merger.merge_transcripts(audio_segments, screen_segments)
        formatted = merger.format_for_claude(merged, format_style=self.config.format)
        # Save output: an explicit custom output path wins over the default
        if self.config.custom_output:
            output_path = self.config.custom_output
        else:
            output_path = self.output_mgr.get_path(f"{self.config.video_path.stem}_enhanced.txt")
        merger.save_transcript(formatted, str(output_path))
        logger.info("=" * 80)
        logger.info("✓ PROCESSING COMPLETE!")
        logger.info("=" * 80)
        logger.info(f"Output directory: {self.output_mgr.output_dir}")
        logger.info(f"Enhanced transcript: {Path(output_path).name}")
        logger.info("")
        return output_path

    def _build_result(self, transcript_path=None, screen_segments=None, enhanced_transcript=None):
        """Build result dictionary.

        Args:
            transcript_path: path to the audio transcript, or None.
            screen_segments: analyzed segments (used only for the count).
            enhanced_transcript: path of the merged output, or None in
                extract-only mode.

        Returns:
            Mapping of output locations and counts for the caller.
        """
        return {
            "output_dir": str(self.output_mgr.output_dir),
            "transcript": transcript_path,
            # NOTE(review): bare filename here, unlike "manifest" below
            # which is a full path — confirm consumers expect this.
            "analysis": f"{self.config.video_path.stem}_{'vision' if self.config.use_vision else 'ocr'}.json",
            "frames_count": len(screen_segments) if screen_segments else 0,
            "enhanced_transcript": enhanced_transcript,
            "manifest": str(self.output_mgr.get_path("manifest.json"))
        }