"""
Vision-based frame analysis using local vision-language models via Ollama.

Better than OCR for understanding dashboards, code, and console output.
"""
|
|
import logging
import os

from difflib import SequenceMatcher
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
class VisionProcessor:
    """Process frames using local vision models via Ollama.

    Frames are analyzed with a vision-language model (e.g. LLaVA) using
    context-specific prompts loaded from disk, optionally enriched with the
    audio transcript surrounding each frame's timestamp.
    """

    def __init__(self, model: str = "llava:13b", prompts_dir: Optional[str] = None):
        """
        Initialize vision processor.

        Args:
            model: Ollama vision model to use (llava:13b, llava:7b, llava-llama3, bakllava)
            prompts_dir: Directory containing prompt files (default: meetus/prompts/)

        Raises:
            ImportError: If the ``ollama`` package is not installed.
        """
        self.model = model
        self._client = None

        # Prompt files live next to this module unless explicitly overridden.
        if prompts_dir:
            self.prompts_dir = Path(prompts_dir)
        else:
            # Default to meetus/prompts/ relative to this file
            self.prompts_dir = Path(__file__).parent / "prompts"

        self._init_client()

    def _init_client(self):
        """Initialize the Ollama client and ensure the model is available locally."""
        try:
            import ollama
            self._client = ollama

            # Best-effort check that the requested model exists locally and
            # pull it if missing. Failures here are non-fatal: the later chat
            # call may still succeed, so we only log and continue.
            try:
                models = self._client.list()
                # NOTE(review): assumes list() returns a dict with a 'models'
                # list of dicts keyed by 'name' — confirm against the
                # installed ollama client version, which has changed this shape.
                available_models = [m['name'] for m in models.get('models', [])]

                if self.model not in available_models:
                    logger.warning(f"Model {self.model} not found locally.")
                    logger.info(f"Pulling {self.model}... (this may take a few minutes)")
                    self._client.pull(self.model)
                    logger.info(f"✓ Model {self.model} downloaded")
                else:
                    logger.info(f"Using local vision model: {self.model}")

            except Exception as e:
                logger.warning(f"Could not verify model availability: {e}")
                logger.info("Attempting to use model anyway...")

        except ImportError:
            raise ImportError(
                "ollama package not installed. Run: pip install ollama\n"
                "Also install Ollama: https://ollama.ai/download"
            )

    def _load_prompt(self, context: str) -> str:
        """
        Load the analysis prompt for a given context from disk.

        Args:
            context: Context name (meeting, dashboard, code, console)

        Returns:
            Prompt text, or a generic default prompt if no file exists.
        """
        prompt_file = self.prompts_dir / f"{context}.txt"

        if prompt_file.exists():
            return prompt_file.read_text(encoding='utf-8').strip()

        # Fall back to a generic prompt rather than failing the analysis.
        logger.warning(f"Prompt file not found: {prompt_file}, using default")
        return "Analyze this image and describe what you see in detail."

    def analyze_frame(self, image_path: str, context: str = "meeting", audio_context: str = "") -> str:
        """
        Analyze a single frame using the local vision model.

        Args:
            image_path: Path to image file
            context: Context hint for analysis (meeting, dashboard, code, console)
            audio_context: Optional audio transcript around this timestamp for context

        Returns:
            Analyzed content description, or "" if the model call failed.
        """
        prompt = self._load_prompt(context)

        # Prepend the surrounding transcript so the model can relate what is
        # on screen to what is being discussed.
        if audio_context:
            prompt = f"Audio context (what's being discussed around this time):\n{audio_context}\n\n{prompt}"

        try:
            # Ollama's chat API accepts image paths alongside the text prompt.
            response = self._client.chat(
                model=self.model,
                messages=[
                    {
                        'role': 'user',
                        'content': prompt,
                        'images': [image_path]
                    }
                ]
            )

            text = response['message']['content']
            return text.strip()

        except Exception as e:
            # Return "" so one bad frame does not abort a whole batch run.
            logger.error(f"Vision model error for {image_path}: {e}")
            return ""

    def process_frames(
        self,
        frames_info: List[Tuple[str, float]],
        context: str = "meeting",
        deduplicate: bool = True,
        similarity_threshold: float = 0.85,
        audio_segments: Optional[List[Dict]] = None
    ) -> List[Dict]:
        """
        Process multiple frames with vision analysis.

        Args:
            frames_info: List of (frame_path, timestamp) tuples
            context: Context hint for analysis
            deduplicate: Whether to remove similar consecutive analyses
            similarity_threshold: Threshold for considering analyses as duplicates (0-1)
            audio_segments: Optional list of dicts with 'timestamp' and 'text'
                keys, used to give the model conversational context per frame

        Returns:
            List of dicts with 'timestamp', 'text', and 'frame_path'
        """
        results = []
        prev_text = ""

        total = len(frames_info)
        logger.info(f"Starting vision analysis of {total} frames...")

        for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
            logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")

            # Audio transcript within ±30 seconds of this frame.
            audio_context = self._get_audio_context(timestamp, audio_segments, window=30)

            text = self.analyze_frame(frame_path, context, audio_context)

            if not text:
                logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
                continue

            logger.debug(f"Frame {idx} ({timestamp:.2f}s): Extracted {len(text)} chars")
            logger.debug(f"Content preview: {text[:150]}{'...' if len(text) > 150 else ''}")

            # Skip frames whose analysis is nearly identical to the last KEPT
            # frame (e.g. a static slide shown for several minutes).
            if deduplicate and prev_text:
                similarity = self._text_similarity(prev_text, text)
                logger.debug(f"Similarity to previous frame: {similarity:.2f} (threshold: {similarity_threshold})")
                if similarity > similarity_threshold:
                    logger.debug(f"⊘ Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})")
                    continue

            results.append({
                'timestamp': timestamp,
                'text': text,
                'frame_path': frame_path
            })

            # Only kept frames become the dedup baseline, so a long run of
            # near-duplicates is collapsed onto the first one.
            prev_text = text

        logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
        return results

    def _get_audio_context(self, timestamp: float, audio_segments: Optional[List[Dict]], window: int = 30) -> str:
        """
        Get the audio transcript around a given timestamp.

        Args:
            timestamp: Target timestamp in seconds
            audio_segments: List of audio segments with 'timestamp' and 'text' keys
            window: Time window in seconds (±window around timestamp)

        Returns:
            Concatenated audio text from the time window ("" if none).
        """
        if not audio_segments:
            return ""

        # Segments missing a 'timestamp' key default to 0 and are therefore
        # only matched near the very start of the recording.
        relevant = [seg for seg in audio_segments
                    if abs(seg.get('timestamp', 0) - timestamp) <= window]

        # "".join of an empty list is already "", so no special-casing needed.
        return " ".join(seg['text'] for seg in relevant)

    def _text_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate similarity between two texts.

        Returns:
            Similarity score between 0 and 1 (difflib ratio).
        """
        return SequenceMatcher(None, text1, text2).ratio()
|