refactor

2025-10-20 00:03:41 -03:00
parent a999bc9093
commit cd7b0aed07
11 changed files with 776 additions and 312 deletions
--- a/meetus/workflow.py
+++ b/meetus/workflow.py
@@ -0,0 +1,316 @@
+"""
+Orchestrate the video processing workflow.
+Coordinates frame extraction, analysis, and transcript merging.
+"""
+from pathlib import Path
+import logging
+import subprocess
+import shutil
+from typing import Dict, Any, Optional
+
+from .output_manager import OutputManager
+from .cache_manager import CacheManager
+from .frame_extractor import FrameExtractor
+from .ocr_processor import OCRProcessor
+from .vision_processor import VisionProcessor
+from .transcript_merger import TranscriptMerger
+
+logger = logging.getLogger(__name__)
+
+
+class WorkflowConfig:
+    """Configuration for the processing workflow."""
+
+    def __init__(self, **kwargs):
+        """Initialize configuration from keyword arguments."""
+        # Video and paths
+        self.video_path = Path(kwargs['video'])
+        self.transcript_path = kwargs.get('transcript')
+        self.output_dir = kwargs.get('output_dir', 'output')
+        self.custom_output = kwargs.get('output')
+
+        # Whisper options
+        self.run_whisper = kwargs.get('run_whisper', False)
+        self.whisper_model = kwargs.get('whisper_model', 'base')
+
+        # Frame extraction
+        self.scene_detection = kwargs.get('scene_detection', False)
+        self.interval = kwargs.get('interval', 5)
+
+        # Analysis options
+        self.use_vision = kwargs.get('use_vision', False)
+        self.vision_model = kwargs.get('vision_model', 'llava:13b')
+        self.vision_context = kwargs.get('vision_context', 'meeting')
+        self.ocr_engine = kwargs.get('ocr_engine', 'tesseract')
+
+        # Processing options
+        self.no_deduplicate = kwargs.get('no_deduplicate', False)
+        self.no_cache = kwargs.get('no_cache', False)
+        self.extract_only = kwargs.get('extract_only', False)
+        self.format = kwargs.get('format', 'detailed')
+
+    def to_dict(self) -> Dict[str, Any]:
+        """Convert config to dictionary for manifest."""
+        return {
+            "whisper": {
+                "enabled": self.run_whisper,
+                "model": self.whisper_model
+            },
+            "frame_extraction": {
+                "method": "scene_detection" if self.scene_detection else "interval",
+                "interval_seconds": self.interval if not self.scene_detection else None
+            },
+            "analysis": {
+                "method": "vision" if self.use_vision else "ocr",
+                "vision_model": self.vision_model if self.use_vision else None,
+                "vision_context": self.vision_context if self.use_vision else None,
+                "ocr_engine": self.ocr_engine if not self.use_vision else None,
+                "deduplication": not self.no_deduplicate
+            },
+            "output_format": self.format
+        }
+
+
+class ProcessingWorkflow:
+    """Orchestrate the complete video processing workflow."""
+
+    def __init__(self, config: WorkflowConfig):
+        """
+        Initialize workflow.
+
+        Args:
+            config: Workflow configuration
+        """
+        self.config = config
+        self.output_mgr = OutputManager(
+            config.video_path,
+            config.output_dir,
+            use_cache=not config.no_cache
+        )
+        self.cache_mgr = CacheManager(
+            self.output_mgr.output_dir,
+            self.output_mgr.frames_dir,
+            config.video_path.stem,
+            use_cache=not config.no_cache
+        )
+
+    def run(self) -> Dict[str, Any]:
+        """
+        Run the complete processing workflow.
+
+        Returns:
+            Dictionary with output paths and status
+        """
+        logger.info("=" * 80)
+        logger.info("MEETING PROCESSOR")
+        logger.info("=" * 80)
+        logger.info(f"Video: {self.config.video_path.name}")
+        logger.info(f"Analysis: {'Vision Model' if self.config.use_vision else f'OCR ({self.config.ocr_engine})'}")
+        if self.config.use_vision:
+            logger.info(f"Vision Model: {self.config.vision_model}")
+            logger.info(f"Context: {self.config.vision_context}")
+        logger.info(f"Frame extraction: {'Scene detection' if self.config.scene_detection else f'Every {self.config.interval}s'}")
+        logger.info(f"Caching: {'Disabled' if self.config.no_cache else 'Enabled'}")
+        logger.info("=" * 80)
+
+        # Step 0: Whisper transcription
+        transcript_path = self._run_whisper()
+
+        # Step 1: Extract frames
+        frames_info = self._extract_frames()
+
+        if not frames_info:
+            logger.error("No frames extracted")
+            raise RuntimeError("Frame extraction failed")
+
+        # Step 2: Analyze frames
+        screen_segments = self._analyze_frames(frames_info)
+
+        if self.config.extract_only:
+            logger.info("Done! (extract-only mode)")
+            return self._build_result(transcript_path, screen_segments)
+
+        # Step 3: Merge with transcript
+        enhanced_transcript = self._merge_transcripts(transcript_path, screen_segments)
+
+        # Save manifest
+        self.output_mgr.save_manifest(self.config.to_dict())
+
+        # Build final result
+        return self._build_result(transcript_path, screen_segments, enhanced_transcript)
+
+    def _run_whisper(self) -> Optional[str]:
+        """Run Whisper transcription if requested."""
+        if not self.config.run_whisper:
+            return self.config.transcript_path
+
+        # Check cache
+        cached = self.cache_mgr.get_whisper_cache()
+        if cached:
+            return str(cached)
+
+        logger.info("=" * 80)
+        logger.info("STEP 0: Running Whisper Transcription")
+        logger.info("=" * 80)
+
+        # Check if whisper is installed
+        if not shutil.which("whisper"):
+            logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
+            raise RuntimeError("Whisper not installed")
+
+        logger.info(f"Running Whisper transcription (model: {self.config.whisper_model})...")
+        logger.info("This may take a few minutes depending on video length...")
+
+        # Run whisper command
+        cmd = [
+            "whisper",
+            str(self.config.video_path),
+            "--model", self.config.whisper_model,
+            "--output_format", "json",
+            "--output_dir", str(self.output_mgr.output_dir)
+        ]
+
+        try:
+            subprocess.run(cmd, check=True, capture_output=True, text=True)
+
+            transcript_path = self.output_mgr.get_path(f"{self.config.video_path.stem}.json")
+
+            if transcript_path.exists():
+                logger.info(f"✓ Whisper transcription completed: {transcript_path.name}")
+                logger.info("")
+                return str(transcript_path)
+            else:
+                logger.error("Whisper completed but transcript file not found")
+                raise RuntimeError("Whisper output missing")
+
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Whisper failed: {e.stderr}")
+            raise
+
+    def _extract_frames(self):
+        """Extract frames from video."""
+        logger.info("Step 1: Extracting frames from video...")
+
+        # Check cache
+        cached_frames = self.cache_mgr.get_frames_cache()
+        if cached_frames:
+            return cached_frames
+
+        # Extract frames
+        extractor = FrameExtractor(str(self.config.video_path), str(self.output_mgr.frames_dir))
+
+        if self.config.scene_detection:
+            frames_info = extractor.extract_scene_changes()
+        else:
+            frames_info = extractor.extract_by_interval(self.config.interval)
+
+        logger.info(f"✓ Extracted {len(frames_info)} frames")
+        return frames_info
+
+    def _analyze_frames(self, frames_info):
+        """Analyze frames with vision or OCR."""
+        analysis_type = 'vision' if self.config.use_vision else 'ocr'
+
+        # Check cache
+        cached_analysis = self.cache_mgr.get_analysis_cache(analysis_type)
+        if cached_analysis:
+            return cached_analysis
+
+        if self.config.use_vision:
+            return self._run_vision_analysis(frames_info)
+        else:
+            return self._run_ocr_analysis(frames_info)
+
+    def _run_vision_analysis(self, frames_info):
+        """Run vision analysis on frames."""
+        logger.info("Step 2: Running vision analysis on extracted frames...")
+
+        try:
+            vision = VisionProcessor(model=self.config.vision_model)
+            screen_segments = vision.process_frames(
+                frames_info,
+                context=self.config.vision_context,
+                deduplicate=not self.config.no_deduplicate
+            )
+            logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
+
+            # Cache results
+            self.cache_mgr.save_analysis('vision', screen_segments)
+            return screen_segments
+
+        except ImportError as e:
+            logger.error(f"{e}")
+            raise
+
+    def _run_ocr_analysis(self, frames_info):
+        """Run OCR analysis on frames."""
+        logger.info("Step 2: Running OCR on extracted frames...")
+
+        try:
+            ocr = OCRProcessor(engine=self.config.ocr_engine)
+            screen_segments = ocr.process_frames(
+                frames_info,
+                deduplicate=not self.config.no_deduplicate
+            )
+            logger.info(f"✓ Processed {len(screen_segments)} frames with OCR")
+
+            # Cache results
+            self.cache_mgr.save_analysis('ocr', screen_segments)
+            return screen_segments
+
+        except ImportError as e:
+            logger.error(f"{e}")
+            logger.error(f"To install {self.config.ocr_engine}:")
+            logger.error(f"  pip install {self.config.ocr_engine}")
+            raise
+
+    def _merge_transcripts(self, transcript_path, screen_segments):
+        """Merge audio and screen transcripts."""
+        merger = TranscriptMerger()
+
+        # Load audio transcript if available
+        audio_segments = []
+        if transcript_path:
+            logger.info("Step 3: Merging with Whisper transcript...")
+            transcript_file = Path(transcript_path)
+
+            if not transcript_file.exists():
+                logger.warning(f"Transcript not found: {transcript_path}")
+                logger.info("Proceeding with screen content only...")
+            else:
+                audio_segments = merger.load_whisper_transcript(str(transcript_file))
+                logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
+        else:
+            logger.info("No transcript provided, using screen content only...")
+
+        # Merge and format
+        merged = merger.merge_transcripts(audio_segments, screen_segments)
+        formatted = merger.format_for_claude(merged, format_style=self.config.format)
+
+        # Save output
+        if self.config.custom_output:
+            output_path = self.config.custom_output
+        else:
+            output_path = self.output_mgr.get_path(f"{self.config.video_path.stem}_enhanced.txt")
+
+        merger.save_transcript(formatted, str(output_path))
+
+        logger.info("=" * 80)
+        logger.info("✓ PROCESSING COMPLETE!")
+        logger.info("=" * 80)
+        logger.info(f"Output directory: {self.output_mgr.output_dir}")
+        logger.info(f"Enhanced transcript: {Path(output_path).name}")
+        logger.info("")
+
+        return output_path
+
+    def _build_result(self, transcript_path=None, screen_segments=None, enhanced_transcript=None):
+        """Build result dictionary."""
+        return {
+            "output_dir": str(self.output_mgr.output_dir),
+            "transcript": transcript_path,
+            "analysis": f"{self.config.video_path.stem}_{'vision' if self.config.use_vision else 'ocr'}.json",
+            "frames_count": len(screen_segments) if screen_segments else 0,
+            "enhanced_transcript": enhanced_transcript,
+            "manifest": str(self.output_mgr.get_path("manifest.json"))
+        }