Files
mitus/meetus/vision_processor.py
Mariano Gabriel cd7b0aed07 refactor
2025-10-20 00:03:41 -03:00

181 lines
5.9 KiB
Python

"""
Vision-based frame analysis using local vision-language models via Ollama.
Better than OCR for understanding dashboards, code, and console output.
"""
from typing import List, Tuple, Dict, Optional
from pathlib import Path
import logging
from difflib import SequenceMatcher
import os
logger = logging.getLogger(__name__)
class VisionProcessor:
    """Process frames using local vision models via Ollama.

    Better suited than plain OCR for dashboards, code, and console
    output, since a vision-language model can describe layout and
    intent rather than only transcribe text.
    """

    # Fallback prompt used when no prompt file exists for a context.
    DEFAULT_PROMPT = "Analyze this image and describe what you see in detail."

    def __init__(self, model: str = "llava:13b", prompts_dir: Optional[str] = None):
        """
        Initialize vision processor.

        Args:
            model: Ollama vision model to use
                (llava:13b, llava:7b, llava-llama3, bakllava)
            prompts_dir: Directory containing prompt files
                (default: ``prompts/`` next to this module)

        Raises:
            ImportError: If the ``ollama`` package is not installed.
        """
        self.model = model
        self._client = None
        # Explicit argument wins; otherwise resolve prompts/ relative
        # to this file so the package works regardless of the CWD.
        if prompts_dir:
            self.prompts_dir = Path(prompts_dir)
        else:
            self.prompts_dir = Path(__file__).parent / "prompts"
        self._init_client()

    def _init_client(self):
        """Initialize the Ollama client and best-effort ensure the model exists.

        Pulls the model if it is not available locally. Any failure while
        checking availability is logged and ignored, so a reachable daemon
        can still be used even if the listing call misbehaves.

        Raises:
            ImportError: If the ``ollama`` package is not installed.
        """
        try:
            import ollama
        except ImportError:
            raise ImportError(
                "ollama package not installed. Run: pip install ollama\n"
                "Also install Ollama: https://ollama.ai/download"
            )
        self._client = ollama
        try:
            listing = self._client.list()
            # ollama-python < 0.4 returns plain dicts ({'models': [{'name': ...}]});
            # >= 0.4 returns typed response objects whose entries expose `.model`.
            # Accept both shapes so the availability check works either way.
            if isinstance(listing, dict):
                raw_models = listing.get('models', [])
            else:
                raw_models = getattr(listing, 'models', [])
            available_models = []
            for entry in raw_models:
                if isinstance(entry, dict):
                    name = entry.get('name') or entry.get('model')
                else:
                    name = getattr(entry, 'model', None)
                if name:
                    available_models.append(name)
            if self.model not in available_models:
                logger.warning("Model %s not found locally.", self.model)
                logger.info("Pulling %s... (this may take a few minutes)", self.model)
                self._client.pull(self.model)
                logger.info("✓ Model %s downloaded", self.model)
            else:
                logger.info("Using local vision model: %s", self.model)
        except Exception as e:
            # Best-effort only: do not fail construction over a listing error.
            logger.warning("Could not verify model availability: %s", e)
            logger.info("Attempting to use model anyway...")

    def _load_prompt(self, context: str) -> str:
        """
        Load the analysis prompt for a context from ``prompts_dir``.

        Args:
            context: Context name (meeting, dashboard, code, console)

        Returns:
            Prompt text, or a generic default if the file is missing
            or unreadable.
        """
        prompt_file = self.prompts_dir / f"{context}.txt"
        try:
            # EAFP: one read instead of exists()+open(), and no TOCTOU race.
            return prompt_file.read_text(encoding='utf-8').strip()
        except OSError:
            logger.warning("Prompt file not found: %s, using default", prompt_file)
            return self.DEFAULT_PROMPT

    def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
        """
        Analyze a single frame using the local vision model.

        Args:
            image_path: Path to image file
            context: Context hint for analysis (meeting, dashboard, code, console)

        Returns:
            Analyzed content description, or "" if the model call failed.
        """
        prompt = self._load_prompt(context)
        try:
            # Ollama chat API with an image attachment on the user message.
            response = self._client.chat(
                model=self.model,
                messages=[
                    {
                        'role': 'user',
                        'content': prompt,
                        'images': [image_path],
                    }
                ],
            )
            text = response['message']['content']
            return text.strip()
        except Exception as e:
            # Best effort per frame: one bad frame must not abort the batch.
            logger.error("Vision model error for %s: %s", image_path, e)
            return ""

    def process_frames(
        self,
        frames_info: List[Tuple[str, float]],
        context: str = "meeting",
        deduplicate: bool = True,
        similarity_threshold: float = 0.85,
    ) -> List[Dict]:
        """
        Process multiple frames with vision analysis.

        Args:
            frames_info: List of (frame_path, timestamp) tuples
            context: Context hint for analysis
            deduplicate: Whether to remove similar consecutive analyses
            similarity_threshold: Threshold above which an analysis is
                considered a duplicate of the previously kept one (0-1)

        Returns:
            List of dicts with 'timestamp', 'text', and 'frame_path'
        """
        results = []
        prev_text = ""
        total = len(frames_info)
        logger.info("Starting vision analysis of %d frames...", total)
        for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
            logger.info("Analyzing frame %d/%d at %.2fs...", idx, total, timestamp)
            text = self.analyze_frame(frame_path, context)
            if not text:
                logger.warning("No content extracted from frame at %.2fs", timestamp)
                continue
            # Compare against the last *kept* analysis, so long runs of
            # near-identical consecutive frames collapse to one entry.
            if deduplicate:
                similarity = self._text_similarity(prev_text, text)
                if similarity > similarity_threshold:
                    logger.debug(
                        "Skipping duplicate frame at %.2fs (similarity: %.2f)",
                        timestamp, similarity,
                    )
                    continue
            results.append({
                'timestamp': timestamp,
                'text': text,
                'frame_path': frame_path,
            })
            prev_text = text
        logger.info(
            "Extracted content from %d frames (deduplication: %s)",
            len(results), deduplicate,
        )
        return results

    def _text_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate similarity between two texts via difflib's ratio.

        Returns:
            Similarity score between 0 and 1
        """
        return SequenceMatcher(None, text1, text2).ratio()