""" Vision-based frame analysis using local vision-language models via Ollama. Better than OCR for understanding dashboards, code, and console output. """ from typing import List, Tuple, Dict, Optional from pathlib import Path import logging from difflib import SequenceMatcher import os logger = logging.getLogger(__name__) class VisionProcessor: """Process frames using local vision models via Ollama.""" def __init__(self, model: str = "llava:13b", prompts_dir: Optional[str] = None): """ Initialize vision processor. Args: model: Ollama vision model to use (llava:13b, llava:7b, llava-llama3, bakllava) prompts_dir: Directory containing prompt files (default: meetus/prompts/) """ self.model = model self._client = None # Set prompts directory if prompts_dir: self.prompts_dir = Path(prompts_dir) else: # Default to meetus/prompts/ relative to this file self.prompts_dir = Path(__file__).parent / "prompts" self._init_client() def _init_client(self): """Initialize Ollama client.""" try: import ollama self._client = ollama # Check if model is available try: models = self._client.list() available_models = [m['name'] for m in models.get('models', [])] if self.model not in available_models: logger.warning(f"Model {self.model} not found locally.") logger.info(f"Pulling {self.model}... (this may take a few minutes)") self._client.pull(self.model) logger.info(f"✓ Model {self.model} downloaded") else: logger.info(f"Using local vision model: {self.model}") except Exception as e: logger.warning(f"Could not verify model availability: {e}") logger.info("Attempting to use model anyway...") except ImportError: raise ImportError( "ollama package not installed. Run: pip install ollama\n" "Also install Ollama: https://ollama.ai/download" ) def _load_prompt(self, context: str) -> str: """ Load prompt from file. Args: context: Context name (meeting, dashboard, code, console) Returns: Prompt text """ prompt_file = self.prompts_dir / f"{context}.txt" if prompt_file.exists(): with open(prompt_file, 'r', encoding='utf-8') as f: return f.read().strip() else: # Fallback to default prompt logger.warning(f"Prompt file not found: {prompt_file}, using default") return "Analyze this image and describe what you see in detail." def analyze_frame(self, image_path: str, context: str = "meeting", audio_context: str = "") -> str: """ Analyze a single frame using local vision model. Args: image_path: Path to image file context: Context hint for analysis (meeting, dashboard, code, console) audio_context: Optional audio transcript around this timestamp for context Returns: Analyzed content description """ # Load prompt from file prompt = self._load_prompt(context) # Add audio context if available if audio_context: prompt = f"Audio context (what's being discussed around this time):\n{audio_context}\n\n{prompt}" try: # Use Ollama's chat API with vision response = self._client.chat( model=self.model, messages=[ { 'role': 'user', 'content': prompt, 'images': [image_path] } ] ) # Extract text from response text = response['message']['content'] return text.strip() except Exception as e: logger.error(f"Vision model error for {image_path}: {e}") return "" def process_frames( self, frames_info: List[Tuple[str, float]], context: str = "meeting", deduplicate: bool = True, similarity_threshold: float = 0.85, audio_segments: Optional[List[Dict]] = None ) -> List[Dict]: """ Process multiple frames with vision analysis. Args: frames_info: List of (frame_path, timestamp) tuples context: Context hint for analysis deduplicate: Whether to remove similar consecutive analyses similarity_threshold: Threshold for considering analyses as duplicates (0-1) Returns: List of dicts with 'timestamp', 'text', and 'frame_path' """ results = [] prev_text = "" total = len(frames_info) logger.info(f"Starting vision analysis of {total} frames...") for idx, (frame_path, timestamp) in enumerate(frames_info, 1): logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...") # Get audio context around this timestamp (±30 seconds) audio_context = self._get_audio_context(timestamp, audio_segments, window=30) text = self.analyze_frame(frame_path, context, audio_context) if not text: logger.warning(f"No content extracted from frame at {timestamp:.2f}s") continue # Debug: Show what was extracted logger.debug(f"Frame {idx} ({timestamp:.2f}s): Extracted {len(text)} chars") logger.debug(f"Content preview: {text[:150]}{'...' if len(text) > 150 else ''}") # Deduplicate similar consecutive frames if deduplicate and prev_text: similarity = self._text_similarity(prev_text, text) logger.debug(f"Similarity to previous frame: {similarity:.2f} (threshold: {similarity_threshold})") if similarity > similarity_threshold: logger.debug(f"⊘ Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})") continue results.append({ 'timestamp': timestamp, 'text': text, 'frame_path': frame_path }) prev_text = text logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})") return results def _get_audio_context(self, timestamp: float, audio_segments: Optional[List[Dict]], window: int = 30) -> str: """ Get audio transcript around a given timestamp. Args: timestamp: Target timestamp in seconds audio_segments: List of audio segments with 'timestamp' and 'text' keys window: Time window in seconds (±window around timestamp) Returns: Concatenated audio text from the time window """ if not audio_segments: return "" relevant = [seg for seg in audio_segments if abs(seg.get('timestamp', 0) - timestamp) <= window] if not relevant: return "" return " ".join([seg['text'] for seg in relevant]) def _text_similarity(self, text1: str, text2: str) -> float: """ Calculate similarity between two texts. Returns: Similarity score between 0 and 1 """ return SequenceMatcher(None, text1, text2).ratio()