add vision processor

2025-10-19 22:58:28 -03:00
parent ae89564373
commit a999bc9093
4 changed files with 511 additions and 107 deletions
--- a/meetus/vision_processor.py
+++ b/meetus/vision_processor.py
@@ -0,0 +1,192 @@
+"""
+Vision-based frame analysis using local vision-language models via Ollama.
+Better than OCR for understanding dashboards, code, and console output.
+"""
+from typing import List, Tuple, Dict, Optional
+from pathlib import Path
+import logging
+from difflib import SequenceMatcher
+
+logger = logging.getLogger(__name__)
+
+
+class VisionProcessor:
+    """Process frames using local vision models via Ollama."""
+
+    def __init__(self, model: str = "llava:13b"):
+        """
+        Create a processor bound to a single Ollama vision model.
+
+        The client handle starts out unset and is filled in by
+        ``_init_client()``, which is invoked before the constructor returns.
+
+        Args:
+            model: Ollama vision model to use
+                (llava:13b, llava:7b, llava-llama3, bakllava)
+        """
+        self.model, self._client = model, None
+        self._init_client()
+
+    def _init_client(self):
+        """
+        Import the ollama package and make sure the configured model is usable.
+
+        The imported module is stored on ``self._client``. The locally
+        installed models are then listed and ``self.model`` is pulled when it
+        is not among them. Each listing entry is read through its ``model``
+        field with a fallback to ``name``, for both dict-style and
+        attribute-style entries, and a model given without a tag is also
+        compared as ``<name>:latest``.
+
+        Availability checking is best effort: any error raised while listing
+        or pulling is logged and the model is used anyway.
+
+        Raises:
+            ImportError: If the ollama Python package is not installed.
+        """
+        # Keep the try body down to the import so that only a missing package
+        # is reported as "not installed".
+        try:
+            import ollama
+        except ImportError as exc:
+            raise ImportError(
+                "ollama package not installed. Run: pip install ollama\n"
+                "Also install Ollama: https://ollama.ai/download"
+            ) from exc
+
+        self._client = ollama
+
+        # A bare model name has to be compared against its ":latest" tag as
+        # well, otherwise it would be pulled again on every start.
+        # NOTE(review): assumes the listing reports names with an explicit tag
+        # - confirm against the installed Ollama version.
+        if ":" in self.model:
+            wanted = {self.model}
+        else:
+            wanted = {self.model, f"{self.model}:latest"}
+
+        try:
+            listing = self._client.list()
+            if isinstance(listing, dict):
+                entries = listing.get('models', [])
+            else:
+                entries = getattr(listing, 'models', None) or []
+
+            # The tag is looked up under both 'model' and 'name' so that
+            # either response layout is understood.
+            available_models = set()
+            for entry in entries:
+                for field in ('model', 'name'):
+                    if isinstance(entry, dict):
+                        value = entry.get(field)
+                    else:
+                        value = getattr(entry, field, None)
+                    if value:
+                        available_models.add(value)
+
+            if wanted & available_models:
+                logger.info(f"Using local vision model: {self.model}")
+            else:
+                logger.warning(f"Model {self.model} not found locally.")
+                logger.info(f"Pulling {self.model}... (this may take a few minutes)")
+                self._client.pull(self.model)
+                logger.info(f"✓ Model {self.model} downloaded")
+
+        except Exception as e:
+            # Deliberately non-fatal: the server may be starting up or the
+            # listing format may be unexpected; the first chat call will
+            # surface a real problem.
+            logger.warning(f"Could not verify model availability: {e}")
+            logger.info("Attempting to use model anyway...")
+
+    def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
+        """
+        Ask the vision model to describe one frame.
+
+        A prompt template is selected by ``context``; any context without a
+        template of its own uses the "meeting" template. The prompt and the
+        image path are sent to the Ollama chat API as a single user message.
+
+        Args:
+            image_path: Path to image file
+            context: Context hint for analysis (meeting, dashboard, code, console)
+
+        Returns:
+            The model's reply with surrounding whitespace stripped, or an
+            empty string when the request or the response handling raised.
+        """
+        meeting_prompt = """Analyze this screen capture from a meeting recording. Extract:
+1. Any visible text (titles, labels, headings)
+2. Key metrics, numbers, or data points shown
+3. Dashboard panels or visualizations (describe what they show)
+4. Code snippets (preserve formatting and context)
+5. Console/terminal output (commands and results)
+6. Application names or UI elements
+
+Focus on information that would help someone understand what was being discussed.
+Be concise but include all important details. If there's code, preserve it exactly."""
+
+        dashboard_prompt = """Analyze this dashboard/monitoring panel. Extract:
+1. Panel titles and metrics names
+2. Current values and units
+3. Trends (up/down/stable)
+4. Alerts or warnings
+5. Time ranges shown
+6. Any anomalies or notable patterns
+
+Format as structured data."""
+
+        code_prompt = """Analyze this code screenshot. Extract:
+1. Programming language
+2. File name or path (if visible)
+3. Code content (preserve exact formatting)
+4. Comments
+5. Function/class names
+6. Any error messages or warnings
+
+Preserve code exactly as shown."""
+
+        console_prompt = """Analyze this console/terminal output. Extract:
+1. Commands executed
+2. Output/results
+3. Error messages
+4. Warnings or status messages
+5. File paths or URLs
+
+Preserve formatting and structure."""
+
+        templates = {
+            "meeting": meeting_prompt,
+            "dashboard": dashboard_prompt,
+            "code": code_prompt,
+            "console": console_prompt,
+        }
+
+        # Unknown context hints share the general-purpose meeting prompt.
+        if context in templates:
+            prompt = templates[context]
+        else:
+            prompt = meeting_prompt
+
+        # One user turn carrying both the instructions and the frame.
+        user_message = {
+            'role': 'user',
+            'content': prompt,
+            'images': [image_path]
+        }
+
+        try:
+            reply = self._client.chat(model=self.model, messages=[user_message])
+            return reply['message']['content'].strip()
+        except Exception as e:
+            # Best effort: a failed frame is reported as "no content".
+            logger.error(f"Vision model error for {image_path}: {e}")
+            return ""
+
+    def process_frames(
+        self,
+        frames_info: List[Tuple[str, float]],
+        context: str = "meeting",
+        deduplicate: bool = True,
+        similarity_threshold: float = 0.85
+    ) -> List[Dict]:
+        """
+        Run vision analysis over a sequence of frames.
+
+        Frames are analysed in the order given. A frame whose analysis comes
+        back empty is dropped. With ``deduplicate`` enabled, a frame is also
+        dropped when its text scores above ``similarity_threshold`` against
+        the text of the most recently kept frame.
+
+        Args:
+            frames_info: List of (frame_path, timestamp) tuples
+            context: Context hint for analysis
+            deduplicate: Whether to remove similar consecutive analyses
+            similarity_threshold: Threshold for considering analyses as duplicates (0-1)
+
+        Returns:
+            List of dicts with 'timestamp', 'text', and 'frame_path'
+        """
+        results = []
+        last_kept_text = ""
+        total = len(frames_info)
+
+        logger.info(f"Starting vision analysis of {total} frames...")
+
+        for idx, frame in enumerate(frames_info, start=1):
+            frame_path, timestamp = frame
+            logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")
+
+            description = self.analyze_frame(frame_path, context)
+            if not description:
+                logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
+                continue
+
+            if deduplicate:
+                # Compared against the last frame that was kept, not merely
+                # the previous frame in the input.
+                similarity = self._text_similarity(last_kept_text, description)
+                if similarity > similarity_threshold:
+                    logger.debug(f"Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})")
+                    continue
+
+            results.append(dict(timestamp=timestamp, text=description, frame_path=frame_path))
+            last_kept_text = description
+
+        logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
+        return results
+
+    def _text_similarity(self, text1: str, text2: str) -> float:
+        """
+        Calculate similarity between two texts.
+
+        The score is ``difflib.SequenceMatcher.ratio()`` computed over the
+        characters of both strings, with the "autojunk" heuristic switched
+        off. With the heuristic on, once the second text reaches 200
+        characters its frequently occurring characters can no longer anchor a
+        match, so two long, near-identical descriptions may score close to 0
+        and slip past deduplication.
+
+        Args:
+            text1: First text.
+            text2: Second text.
+
+        Returns:
+            Similarity score between 0 and 1 (1.0 when both texts are empty).
+        """
+        # autojunk=False gives up the heuristic's speed-up on long inputs in
+        # exchange for a ratio that does not depend on character frequency.
+        return SequenceMatcher(None, text1, text2, autojunk=False).ratio()