Files
mitus/meetus/vision_processor.py
2025-10-19 22:58:28 -03:00

193 lines
6.2 KiB
Python

"""
Vision-based frame analysis using local vision-language models via Ollama.
Better than OCR for understanding dashboards, code, and console output.
"""
from typing import List, Tuple, Dict, Optional
from pathlib import Path
import logging
from difflib import SequenceMatcher
logger = logging.getLogger(__name__)
class VisionProcessor:
    """Process frames using local vision models via Ollama."""

    # Context-specific prompts, keyed by the `context` hint accepted by
    # analyze_frame().  Hoisted to class level so the dict is built once
    # rather than on every call.
    _PROMPTS: Dict[str, str] = {
        "meeting": """Analyze this screen capture from a meeting recording. Extract:
1. Any visible text (titles, labels, headings)
2. Key metrics, numbers, or data points shown
3. Dashboard panels or visualizations (describe what they show)
4. Code snippets (preserve formatting and context)
5. Console/terminal output (commands and results)
6. Application names or UI elements
Focus on information that would help someone understand what was being discussed.
Be concise but include all important details. If there's code, preserve it exactly.""",
        "dashboard": """Analyze this dashboard/monitoring panel. Extract:
1. Panel titles and metrics names
2. Current values and units
3. Trends (up/down/stable)
4. Alerts or warnings
5. Time ranges shown
6. Any anomalies or notable patterns
Format as structured data.""",
        "code": """Analyze this code screenshot. Extract:
1. Programming language
2. File name or path (if visible)
3. Code content (preserve exact formatting)
4. Comments
5. Function/class names
6. Any error messages or warnings
Preserve code exactly as shown.""",
        "console": """Analyze this console/terminal output. Extract:
1. Commands executed
2. Output/results
3. Error messages
4. Warnings or status messages
5. File paths or URLs
Preserve formatting and structure."""
    }

    def __init__(self, model: str = "llava:13b"):
        """
        Initialize vision processor.

        Args:
            model: Ollama vision model to use (llava:13b, llava:7b,
                llava-llama3, bakllava)

        Raises:
            ImportError: if the ``ollama`` package is not installed.
        """
        self.model = model
        self._client = None
        self._init_client()

    @staticmethod
    def _model_names(listing) -> List[str]:
        """Extract model names from an ``ollama.list()`` response.

        Handles both the legacy dict response (entries keyed by ``'name'``)
        and the typed response of ollama>=0.4 (objects exposing ``.model``),
        so the availability check below works across library versions.
        """
        if isinstance(listing, dict):
            entries = listing.get('models', [])
        else:
            entries = getattr(listing, 'models', [])
        names: List[str] = []
        for entry in entries:
            if isinstance(entry, dict):
                name = entry.get('name') or entry.get('model')
            else:
                name = getattr(entry, 'model', None) or getattr(entry, 'name', None)
            if name:
                names.append(name)
        return names

    def _init_client(self):
        """Initialize the Ollama client and ensure the model is available.

        Pulls the model on demand if it is not present locally.  The
        availability check is best-effort: if the daemon is unreachable or
        the response shape is unexpected, we log and proceed, letting
        ``chat()`` surface any real error later.

        Raises:
            ImportError: if the ``ollama`` package is not installed.
        """
        try:
            import ollama
        except ImportError:
            raise ImportError(
                "ollama package not installed. Run: pip install ollama\n"
                "Also install Ollama: https://ollama.ai/download"
            )
        self._client = ollama
        # Check if model is available
        try:
            available_models = self._model_names(self._client.list())
            if self.model not in available_models:
                logger.warning(f"Model {self.model} not found locally.")
                logger.info(f"Pulling {self.model}... (this may take a few minutes)")
                self._client.pull(self.model)
                logger.info(f"✓ Model {self.model} downloaded")
            else:
                logger.info(f"Using local vision model: {self.model}")
        except Exception as e:
            logger.warning(f"Could not verify model availability: {e}")
            logger.info("Attempting to use model anyway...")

    def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
        """
        Analyze a single frame using local vision model.

        Args:
            image_path: Path to image file
            context: Context hint for analysis (meeting, dashboard, code,
                console); unknown hints fall back to the "meeting" prompt.

        Returns:
            Analyzed content description, or "" if the model call failed.
        """
        prompt = self._PROMPTS.get(context, self._PROMPTS["meeting"])
        try:
            # Use Ollama's chat API with vision
            response = self._client.chat(
                model=self.model,
                messages=[
                    {
                        'role': 'user',
                        'content': prompt,
                        'images': [image_path]
                    }
                ]
            )
            # Extract text from response
            text = response['message']['content']
            return text.strip()
        except Exception as e:
            # Per-frame failures are logged and swallowed so a single bad
            # frame does not abort a whole batch in process_frames().
            logger.error(f"Vision model error for {image_path}: {e}")
            return ""

    def process_frames(
        self,
        frames_info: List[Tuple[str, float]],
        context: str = "meeting",
        deduplicate: bool = True,
        similarity_threshold: float = 0.85
    ) -> List[Dict]:
        """
        Process multiple frames with vision analysis.

        Args:
            frames_info: List of (frame_path, timestamp) tuples
            context: Context hint for analysis
            deduplicate: Whether to remove similar consecutive analyses
            similarity_threshold: Threshold for considering analyses as
                duplicates (0-1)

        Returns:
            List of dicts with 'timestamp', 'text', and 'frame_path'
        """
        results: List[Dict] = []
        prev_text = ""
        total = len(frames_info)
        logger.info(f"Starting vision analysis of {total} frames...")
        for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
            logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")
            text = self.analyze_frame(frame_path, context)
            if not text:
                logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
                continue
            # Deduplicate similar consecutive frames.  prev_text is only
            # advanced on accepted frames, so a run of near-duplicates is
            # all compared against the last *kept* analysis.
            if deduplicate:
                similarity = self._text_similarity(prev_text, text)
                if similarity > similarity_threshold:
                    logger.debug(f"Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})")
                    continue
            results.append({
                'timestamp': timestamp,
                'text': text,
                'frame_path': frame_path
            })
            prev_text = text
        logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
        return results

    def _text_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate similarity between two texts.

        Returns:
            Similarity score between 0 and 1 (difflib ratio; two empty
            strings compare as 1.0).
        """
        return SequenceMatcher(None, text1, text2).ratio()