Files
mitus/meetus/vision_processor.py
2025-10-19 22:58:28 -03:00

193 lines
6.2 KiB
Python

"""
Vision-based frame analysis using local vision-language models via Ollama.
Better than OCR for understanding dashboards, code, and console output.
"""
from typing import List, Tuple, Dict, Optional
from pathlib import Path
import logging
from difflib import SequenceMatcher
logger = logging.getLogger(__name__)
class VisionProcessor:
    """Process frames using local vision models via Ollama."""

    # Context-specific prompts, keyed by the `context` hint accepted by
    # analyze_frame().  Hoisted to class level so the dict is built once
    # rather than on every call.
    _PROMPTS: Dict[str, str] = {
        "meeting": """Analyze this screen capture from a meeting recording. Extract:
1. Any visible text (titles, labels, headings)
2. Key metrics, numbers, or data points shown
3. Dashboard panels or visualizations (describe what they show)
4. Code snippets (preserve formatting and context)
5. Console/terminal output (commands and results)
6. Application names or UI elements
Focus on information that would help someone understand what was being discussed.
Be concise but include all important details. If there's code, preserve it exactly.""",
        "dashboard": """Analyze this dashboard/monitoring panel. Extract:
1. Panel titles and metrics names
2. Current values and units
3. Trends (up/down/stable)
4. Alerts or warnings
5. Time ranges shown
6. Any anomalies or notable patterns
Format as structured data.""",
        "code": """Analyze this code screenshot. Extract:
1. Programming language
2. File name or path (if visible)
3. Code content (preserve exact formatting)
4. Comments
5. Function/class names
6. Any error messages or warnings
Preserve code exactly as shown.""",
        "console": """Analyze this console/terminal output. Extract:
1. Commands executed
2. Output/results
3. Error messages
4. Warnings or status messages
5. File paths or URLs
Preserve formatting and structure."""
    }

    def __init__(self, model: str = "llava:13b"):
        """
        Initialize vision processor.

        Args:
            model: Ollama vision model to use (llava:13b, llava:7b,
                llava-llama3, bakllava)

        Raises:
            ImportError: if the ``ollama`` package is not installed.
        """
        self.model = model
        self._client = None
        self._init_client()

    @staticmethod
    def _model_names(listing) -> List[str]:
        """Extract model names from an ``ollama.list()`` response.

        Handles both the legacy dict response (entries keyed by ``'name'``)
        and the typed response of ollama>=0.4 (objects exposing ``.model``),
        so the availability check below works across library versions.
        """
        if isinstance(listing, dict):
            entries = listing.get('models', [])
        else:
            entries = getattr(listing, 'models', [])
        names: List[str] = []
        for entry in entries:
            if isinstance(entry, dict):
                name = entry.get('name') or entry.get('model')
            else:
                name = getattr(entry, 'model', None) or getattr(entry, 'name', None)
            if name:
                names.append(name)
        return names

    def _init_client(self):
        """Initialize the Ollama client and ensure the model is available.

        Pulls the model on demand if it is not present locally.  The
        availability check is best-effort: if the daemon is unreachable or
        the response shape is unexpected, we log and proceed, letting
        ``chat()`` surface any real error later.

        Raises:
            ImportError: if the ``ollama`` package is not installed.
        """
        try:
            import ollama
        except ImportError:
            raise ImportError(
                "ollama package not installed. Run: pip install ollama\n"
                "Also install Ollama: https://ollama.ai/download"
            )
        self._client = ollama
        # Check if model is available
        try:
            available_models = self._model_names(self._client.list())
            if self.model not in available_models:
                logger.warning(f"Model {self.model} not found locally.")
                logger.info(f"Pulling {self.model}... (this may take a few minutes)")
                self._client.pull(self.model)
                logger.info(f"✓ Model {self.model} downloaded")
            else:
                logger.info(f"Using local vision model: {self.model}")
        except Exception as e:
            logger.warning(f"Could not verify model availability: {e}")
            logger.info("Attempting to use model anyway...")

    def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
        """
        Analyze a single frame using local vision model.

        Args:
            image_path: Path to image file
            context: Context hint for analysis (meeting, dashboard, code,
                console); unknown hints fall back to the "meeting" prompt.

        Returns:
            Analyzed content description, or "" if the model call failed.
        """
        prompt = self._PROMPTS.get(context, self._PROMPTS["meeting"])
        try:
            # Use Ollama's chat API with vision
            response = self._client.chat(
                model=self.model,
                messages=[
                    {
                        'role': 'user',
                        'content': prompt,
                        'images': [image_path]
                    }
                ]
            )
            # Extract text from response
            text = response['message']['content']
            return text.strip()
        except Exception as e:
            # Per-frame failures are logged and swallowed so a single bad
            # frame does not abort a whole batch in process_frames().
            logger.error(f"Vision model error for {image_path}: {e}")
            return ""

    def process_frames(
        self,
        frames_info: List[Tuple[str, float]],
        context: str = "meeting",
        deduplicate: bool = True,
        similarity_threshold: float = 0.85
    ) -> List[Dict]:
        """
        Process multiple frames with vision analysis.

        Args:
            frames_info: List of (frame_path, timestamp) tuples
            context: Context hint for analysis
            deduplicate: Whether to remove similar consecutive analyses
            similarity_threshold: Threshold for considering analyses as
                duplicates (0-1)

        Returns:
            List of dicts with 'timestamp', 'text', and 'frame_path'
        """
        results: List[Dict] = []
        prev_text = ""
        total = len(frames_info)
        logger.info(f"Starting vision analysis of {total} frames...")
        for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
            logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")
            text = self.analyze_frame(frame_path, context)
            if not text:
                logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
                continue
            # Deduplicate similar consecutive frames.  prev_text is only
            # advanced on accepted frames, so a run of near-duplicates is
            # all compared against the last *kept* analysis.
            if deduplicate:
                similarity = self._text_similarity(prev_text, text)
                if similarity > similarity_threshold:
                    logger.debug(f"Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})")
                    continue
            results.append({
                'timestamp': timestamp,
                'text': text,
                'frame_path': frame_path
            })
            prev_text = text
        logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
        return results

    def _text_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate similarity between two texts.

        Returns:
            Similarity score between 0 and 1 (difflib ratio; two empty
            strings compare as 1.0).
        """
        return SequenceMatcher(None, text1, text2).ratio()