Files
mitus/meetus/vision_processor.py
Mariano Gabriel cd7b0aed07 refactor
2025-10-20 00:03:41 -03:00

181 lines
5.9 KiB
Python

"""
Vision-based frame analysis using local vision-language models via Ollama.
Better than OCR for understanding dashboards, code, and console output.
"""
from typing import List, Tuple, Dict, Optional
from pathlib import Path
import logging
from difflib import SequenceMatcher
import os
logger = logging.getLogger(__name__)
class VisionProcessor:
    """Process frames using local vision models via Ollama.

    Better suited than plain OCR for dashboards, code, and console
    output, since a vision-language model can describe layout and
    intent rather than only transcribe text.
    """

    # Fallback prompt used when no prompt file exists for a context.
    DEFAULT_PROMPT = "Analyze this image and describe what you see in detail."

    def __init__(self, model: str = "llava:13b", prompts_dir: Optional[str] = None):
        """
        Initialize vision processor.

        Args:
            model: Ollama vision model to use
                (llava:13b, llava:7b, llava-llama3, bakllava)
            prompts_dir: Directory containing prompt files
                (default: ``prompts/`` next to this module)

        Raises:
            ImportError: If the ``ollama`` package is not installed.
        """
        self.model = model
        self._client = None
        # Explicit argument wins; otherwise resolve prompts/ relative
        # to this file so the package works regardless of the CWD.
        if prompts_dir:
            self.prompts_dir = Path(prompts_dir)
        else:
            self.prompts_dir = Path(__file__).parent / "prompts"
        self._init_client()

    def _init_client(self):
        """Initialize the Ollama client and best-effort ensure the model exists.

        Pulls the model if it is not available locally. Any failure while
        checking availability is logged and ignored, so a reachable daemon
        can still be used even if the listing call misbehaves.

        Raises:
            ImportError: If the ``ollama`` package is not installed.
        """
        try:
            import ollama
        except ImportError:
            raise ImportError(
                "ollama package not installed. Run: pip install ollama\n"
                "Also install Ollama: https://ollama.ai/download"
            )
        self._client = ollama
        try:
            listing = self._client.list()
            # ollama-python < 0.4 returns plain dicts ({'models': [{'name': ...}]});
            # >= 0.4 returns typed response objects whose entries expose `.model`.
            # Accept both shapes so the availability check works either way.
            if isinstance(listing, dict):
                raw_models = listing.get('models', [])
            else:
                raw_models = getattr(listing, 'models', [])
            available_models = []
            for entry in raw_models:
                if isinstance(entry, dict):
                    name = entry.get('name') or entry.get('model')
                else:
                    name = getattr(entry, 'model', None)
                if name:
                    available_models.append(name)
            if self.model not in available_models:
                logger.warning("Model %s not found locally.", self.model)
                logger.info("Pulling %s... (this may take a few minutes)", self.model)
                self._client.pull(self.model)
                logger.info("✓ Model %s downloaded", self.model)
            else:
                logger.info("Using local vision model: %s", self.model)
        except Exception as e:
            # Best-effort only: do not fail construction over a listing error.
            logger.warning("Could not verify model availability: %s", e)
            logger.info("Attempting to use model anyway...")

    def _load_prompt(self, context: str) -> str:
        """
        Load the analysis prompt for a context from ``prompts_dir``.

        Args:
            context: Context name (meeting, dashboard, code, console)

        Returns:
            Prompt text, or a generic default if the file is missing
            or unreadable.
        """
        prompt_file = self.prompts_dir / f"{context}.txt"
        try:
            # EAFP: one read instead of exists()+open(), and no TOCTOU race.
            return prompt_file.read_text(encoding='utf-8').strip()
        except OSError:
            logger.warning("Prompt file not found: %s, using default", prompt_file)
            return self.DEFAULT_PROMPT

    def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
        """
        Analyze a single frame using the local vision model.

        Args:
            image_path: Path to image file
            context: Context hint for analysis (meeting, dashboard, code, console)

        Returns:
            Analyzed content description, or "" if the model call failed.
        """
        prompt = self._load_prompt(context)
        try:
            # Ollama chat API with an image attachment on the user message.
            response = self._client.chat(
                model=self.model,
                messages=[
                    {
                        'role': 'user',
                        'content': prompt,
                        'images': [image_path],
                    }
                ],
            )
            text = response['message']['content']
            return text.strip()
        except Exception as e:
            # Best effort per frame: one bad frame must not abort the batch.
            logger.error("Vision model error for %s: %s", image_path, e)
            return ""

    def process_frames(
        self,
        frames_info: List[Tuple[str, float]],
        context: str = "meeting",
        deduplicate: bool = True,
        similarity_threshold: float = 0.85,
    ) -> List[Dict]:
        """
        Process multiple frames with vision analysis.

        Args:
            frames_info: List of (frame_path, timestamp) tuples
            context: Context hint for analysis
            deduplicate: Whether to remove similar consecutive analyses
            similarity_threshold: Threshold above which an analysis is
                considered a duplicate of the previously kept one (0-1)

        Returns:
            List of dicts with 'timestamp', 'text', and 'frame_path'
        """
        results = []
        prev_text = ""
        total = len(frames_info)
        logger.info("Starting vision analysis of %d frames...", total)
        for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
            logger.info("Analyzing frame %d/%d at %.2fs...", idx, total, timestamp)
            text = self.analyze_frame(frame_path, context)
            if not text:
                logger.warning("No content extracted from frame at %.2fs", timestamp)
                continue
            # Compare against the last *kept* analysis, so long runs of
            # near-identical consecutive frames collapse to one entry.
            if deduplicate:
                similarity = self._text_similarity(prev_text, text)
                if similarity > similarity_threshold:
                    logger.debug(
                        "Skipping duplicate frame at %.2fs (similarity: %.2f)",
                        timestamp, similarity,
                    )
                    continue
            results.append({
                'timestamp': timestamp,
                'text': text,
                'frame_path': frame_path,
            })
            prev_text = text
        logger.info(
            "Extracted content from %d frames (deduplication: %s)",
            len(results), deduplicate,
        )
        return results

    def _text_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate similarity between two texts via difflib's ratio.

        Returns:
            Similarity score between 0 and 1
        """
        return SequenceMatcher(None, text1, text2).ratio()