embed images

This commit is contained in:
Mariano Gabriel
2025-10-28 08:02:45 -03:00
parent b1e1daf278
commit 118ef04223
12 changed files with 1016 additions and 61 deletions

View File

@@ -40,10 +40,21 @@ class WorkflowConfig:
# Analysis options
self.use_vision = kwargs.get('use_vision', False)
self.use_hybrid = kwargs.get('use_hybrid', False)
self.hybrid_llm_cleanup = kwargs.get('hybrid_llm_cleanup', False)
self.hybrid_llm_model = kwargs.get('hybrid_llm_model', 'llama3.2:3b')
self.vision_model = kwargs.get('vision_model', 'llava:13b')
self.vision_context = kwargs.get('vision_context', 'meeting')
self.ocr_engine = kwargs.get('ocr_engine', 'tesseract')
# Validation: can't use both vision and hybrid
if self.use_vision and self.use_hybrid:
raise ValueError("Cannot use both --use-vision and --use-hybrid. Choose one.")
# Validation: LLM cleanup requires hybrid mode
if self.hybrid_llm_cleanup and not self.use_hybrid:
raise ValueError("--hybrid-llm-cleanup requires --use-hybrid")
# Processing options
self.no_deduplicate = kwargs.get('no_deduplicate', False)
self.no_cache = kwargs.get('no_cache', False)
@@ -52,6 +63,8 @@ class WorkflowConfig:
self.skip_cache_analysis = kwargs.get('skip_cache_analysis', False)
self.extract_only = kwargs.get('extract_only', False)
self.format = kwargs.get('format', 'detailed')
self.embed_images = kwargs.get('embed_images', False)
self.embed_quality = kwargs.get('embed_quality', 80)
def to_dict(self) -> Dict[str, Any]:
"""Convert config to dictionary for manifest."""
@@ -66,10 +79,10 @@ class WorkflowConfig:
"scene_threshold": self.scene_threshold if self.scene_detection else None
},
"analysis": {
"method": "vision" if self.use_vision else "ocr",
"method": "vision" if self.use_vision else ("hybrid" if self.use_hybrid else "ocr"),
"vision_model": self.vision_model if self.use_vision else None,
"vision_context": self.vision_context if self.use_vision else None,
"ocr_engine": self.ocr_engine if not self.use_vision else None,
"ocr_engine": self.ocr_engine if (not self.use_vision) else None,
"deduplication": not self.no_deduplicate
},
"output_format": self.format
@@ -113,10 +126,19 @@ class ProcessingWorkflow:
logger.info("MEETING PROCESSOR")
logger.info("=" * 80)
logger.info(f"Video: {self.config.video_path.name}")
logger.info(f"Analysis: {'Vision Model' if self.config.use_vision else f'OCR ({self.config.ocr_engine})'}")
# Determine analysis method
if self.config.use_vision:
logger.info(f"Vision Model: {self.config.vision_model}")
analysis_method = f"Vision Model ({self.config.vision_model})"
logger.info(f"Analysis: {analysis_method}")
logger.info(f"Context: {self.config.vision_context}")
elif self.config.use_hybrid:
analysis_method = f"Hybrid (OpenCV + {self.config.ocr_engine})"
logger.info(f"Analysis: {analysis_method}")
else:
analysis_method = f"OCR ({self.config.ocr_engine})"
logger.info(f"Analysis: {analysis_method}")
logger.info(f"Frame extraction: {'Scene detection' if self.config.scene_detection else f'Every {self.config.interval}s'}")
logger.info(f"Caching: {'Disabled' if self.config.no_cache else 'Enabled'}")
logger.info("=" * 80)
@@ -148,15 +170,16 @@ class ProcessingWorkflow:
return self._build_result(transcript_path, screen_segments, enhanced_transcript)
def _run_whisper(self) -> Optional[str]:
"""Run Whisper transcription if requested."""
if not self.config.run_whisper:
return self.config.transcript_path
# Check cache
"""Run Whisper transcription if requested, or use cached/provided transcript."""
# First, check cache (regardless of run_whisper flag)
cached = self.cache_mgr.get_whisper_cache()
if cached:
return str(cached)
# If no cache and not running whisper, use provided transcript path (if any)
if not self.config.run_whisper:
return self.config.transcript_path
logger.info("=" * 80)
logger.info("STEP 0: Running Whisper Transcription")
logger.info("=" * 80)
@@ -195,6 +218,25 @@ class ProcessingWorkflow:
if transcript_path.exists():
logger.info(f"✓ Whisper transcription completed: {transcript_path.name}")
# Debug: Show transcript preview
try:
import json
with open(transcript_path, 'r', encoding='utf-8') as f:
whisper_data = json.load(f)
if 'segments' in whisper_data:
logger.debug(f"Whisper produced {len(whisper_data['segments'])} segments")
if whisper_data['segments']:
logger.debug(f"First segment: {whisper_data['segments'][0]}")
logger.debug(f"Last segment: {whisper_data['segments'][-1]}")
if 'text' in whisper_data:
text_preview = whisper_data['text'][:200] + "..." if len(whisper_data.get('text', '')) > 200 else whisper_data.get('text', '')
logger.debug(f"Transcript preview: {text_preview}")
except Exception as e:
logger.debug(f"Could not parse whisper output for debug: {e}")
logger.info("")
return str(transcript_path)
else:
@@ -216,12 +258,24 @@ class ProcessingWorkflow:
# Clean up old frames if regenerating
if self.config.skip_cache_frames and self.output_mgr.frames_dir.exists():
logger.info("Cleaning up old frames...")
for old_frame in self.output_mgr.frames_dir.glob("*.jpg"):
old_frame.unlink()
old_frames = list(self.output_mgr.frames_dir.glob("*.jpg"))
if old_frames:
logger.info(f"Cleaning up {len(old_frames)} old frames...")
for old_frame in old_frames:
old_frame.unlink()
logger.info("✓ Cleanup complete")
# Extract frames
extractor = FrameExtractor(str(self.config.video_path), str(self.output_mgr.frames_dir))
# Extract frames (use embed quality so saved files match embedded images)
if self.config.scene_detection:
logger.info(f"Extracting frames with scene detection (threshold={self.config.scene_threshold})...")
else:
logger.info(f"Extracting frames every {self.config.interval}s...")
extractor = FrameExtractor(
str(self.config.video_path),
str(self.output_mgr.frames_dir),
quality=self.config.embed_quality
)
if self.config.scene_detection:
frames_info = extractor.extract_scene_changes(threshold=self.config.scene_threshold)
@@ -232,8 +286,29 @@ class ProcessingWorkflow:
return frames_info
def _analyze_frames(self, frames_info):
"""Analyze frames with vision or OCR."""
analysis_type = 'vision' if self.config.use_vision else 'ocr'
"""Analyze frames with vision, hybrid, or OCR."""
# Skip analysis if just embedding images
if self.config.embed_images:
logger.info("Step 2: Skipping analysis (images will be embedded)")
# Create minimal segments with just frame paths and timestamps
screen_segments = [
{
'timestamp': timestamp,
'text': '', # No text extraction needed
'frame_path': frame_path
}
for frame_path, timestamp in frames_info
]
logger.info(f"✓ Prepared {len(screen_segments)} frames for embedding")
return screen_segments
# Determine analysis type
if self.config.use_vision:
analysis_type = 'vision'
elif self.config.use_hybrid:
analysis_type = 'hybrid'
else:
analysis_type = 'ocr'
# Check cache
cached_analysis = self.cache_mgr.get_analysis_cache(analysis_type)
@@ -242,6 +317,8 @@ class ProcessingWorkflow:
if self.config.use_vision:
return self._run_vision_analysis(frames_info)
elif self.config.use_hybrid:
return self._run_hybrid_analysis(frames_info)
else:
return self._run_ocr_analysis(frames_info)
@@ -272,6 +349,13 @@ class ProcessingWorkflow:
)
logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
# Debug: Show sample analysis results
if screen_segments:
logger.debug(f"First analysis result: timestamp={screen_segments[0].get('timestamp')}, text_length={len(screen_segments[0].get('text', ''))}")
logger.debug(f"First analysis text preview: {screen_segments[0].get('text', '')[:200]}...")
if len(screen_segments) > 1:
logger.debug(f"Last analysis result: timestamp={screen_segments[-1].get('timestamp')}, text_length={len(screen_segments[-1].get('text', ''))}")
# Cache results
self.cache_mgr.save_analysis('vision', screen_segments)
return screen_segments
@@ -285,6 +369,42 @@ class ProcessingWorkflow:
cached = self.cache_mgr.get_whisper_cache()
return str(cached) if cached else None
def _run_hybrid_analysis(self, frames_info):
    """Run hybrid analysis (OpenCV text detection + OCR) over extracted frames.

    Builds a ``HybridProcessor`` configured from the workflow config
    (OCR engine, optional LLM cleanup pass and its model), processes all
    frames, logs a summary plus first/last-segment debug previews, and
    caches the result under the ``'hybrid'`` analysis key.

    Args:
        frames_info: Extracted-frame records handed straight to
            ``HybridProcessor.process_frames`` — presumably (frame_path,
            timestamp) pairs as produced by frame extraction; TODO confirm
            against HybridProcessor's signature.

    Returns:
        The sequence of screen segments returned by the processor; the
        debug logging below reads ``.get('timestamp')`` and
        ``.get('text')`` from each, so elements are dict-like.

    Raises:
        ImportError: When the hybrid_processor module (or a dependency of
            it) is unavailable; the error is logged and re-raised.
    """
    # Announce which pipeline variant runs so the step banner reflects
    # whether the optional LLM cleanup pass is enabled.
    if self.config.hybrid_llm_cleanup:
        logger.info("Step 2: Running hybrid analysis (OpenCV + OCR + LLM cleanup)...")
    else:
        logger.info("Step 2: Running hybrid analysis (OpenCV text detection + OCR)...")
    try:
        # Imported lazily: hybrid mode is optional, and a missing
        # dependency should only fail when hybrid analysis is requested.
        from .hybrid_processor import HybridProcessor

        hybrid = HybridProcessor(
            ocr_engine=self.config.ocr_engine,
            use_llm_cleanup=self.config.hybrid_llm_cleanup,
            llm_model=self.config.hybrid_llm_model
        )
        # Deduplication defaults to on; the no_deduplicate flag inverts it.
        screen_segments = hybrid.process_frames(
            frames_info,
            deduplicate=not self.config.no_deduplicate
        )
        logger.info(f"✓ Processed {len(screen_segments)} frames with hybrid analysis")
        # Debug: preview first and last segments to sanity-check
        # timestamps and the amount of text extracted.
        if screen_segments:
            logger.debug(f"First hybrid result: timestamp={screen_segments[0].get('timestamp')}, text_length={len(screen_segments[0].get('text', ''))}")
            logger.debug(f"First hybrid text preview: {screen_segments[0].get('text', '')[:200]}...")
            if len(screen_segments) > 1:
                logger.debug(f"Last hybrid result: timestamp={screen_segments[-1].get('timestamp')}, text_length={len(screen_segments[-1].get('text', ''))}")
        # Cache results so subsequent runs can skip this analysis step.
        self.cache_mgr.save_analysis('hybrid', screen_segments)
        return screen_segments
    except ImportError as e:
        # Hybrid mode was explicitly requested, so a missing dependency
        # must surface loudly rather than silently fall back.
        logger.error(f"{e}")
        raise
def _run_ocr_analysis(self, frames_info):
"""Run OCR analysis on frames."""
logger.info("Step 2: Running OCR on extracted frames...")
@@ -297,6 +417,13 @@ class ProcessingWorkflow:
)
logger.info(f"✓ Processed {len(screen_segments)} frames with OCR")
# Debug: Show sample OCR results
if screen_segments:
logger.debug(f"First OCR result: timestamp={screen_segments[0].get('timestamp')}, text_length={len(screen_segments[0].get('text', ''))}")
logger.debug(f"First OCR text preview: {screen_segments[0].get('text', '')[:200]}...")
if len(screen_segments) > 1:
logger.debug(f"Last OCR result: timestamp={screen_segments[-1].get('timestamp')}, text_length={len(screen_segments[-1].get('text', ''))}")
# Cache results
self.cache_mgr.save_analysis('ocr', screen_segments)
return screen_segments
@@ -309,7 +436,10 @@ class ProcessingWorkflow:
def _merge_transcripts(self, transcript_path, screen_segments):
"""Merge audio and screen transcripts."""
merger = TranscriptMerger()
merger = TranscriptMerger(
embed_images=self.config.embed_images,
embed_quality=self.config.embed_quality
)
# Load audio transcript if available
audio_segments = []
@@ -350,10 +480,18 @@ class ProcessingWorkflow:
def _build_result(self, transcript_path=None, screen_segments=None, enhanced_transcript=None):
"""Build result dictionary."""
# Determine analysis filename
if self.config.use_vision:
analysis_type = 'vision'
elif self.config.use_hybrid:
analysis_type = 'hybrid'
else:
analysis_type = 'ocr'
return {
"output_dir": str(self.output_mgr.output_dir),
"transcript": transcript_path,
"analysis": f"{self.config.video_path.stem}_{'vision' if self.config.use_vision else 'ocr'}.json",
"analysis": f"{self.config.video_path.stem}_{analysis_type}.json",
"frames_count": len(screen_segments) if screen_segments else 0,
"enhanced_transcript": enhanced_transcript,
"manifest": str(self.output_mgr.get_path("manifest.json"))