add vision processor
This commit is contained in:
192
meetus/vision_processor.py
Normal file
192
meetus/vision_processor.py
Normal file
@@ -0,0 +1,192 @@
|
||||
"""
|
||||
Vision-based frame analysis using local vision-language models via Ollama.
|
||||
Better than OCR for understanding dashboards, code, and console output.
|
||||
"""
|
||||
from typing import List, Tuple, Dict, Optional
|
||||
from pathlib import Path
|
||||
import logging
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class VisionProcessor:
    """Process frames using local vision models via Ollama.

    The processor sends each frame, together with a context-specific prompt,
    to an Ollama vision-language model and collects the textual analyses.
    """

    # Context used when the caller passes an unknown context hint.
    _DEFAULT_CONTEXT = "meeting"

    # Prompt templates keyed by context hint. Built once at class creation
    # instead of on every analyze_frame() call.
    _PROMPTS = {
        "meeting": (
            "Analyze this screen capture from a meeting recording. Extract:\n"
            "1. Any visible text (titles, labels, headings)\n"
            "2. Key metrics, numbers, or data points shown\n"
            "3. Dashboard panels or visualizations (describe what they show)\n"
            "4. Code snippets (preserve formatting and context)\n"
            "5. Console/terminal output (commands and results)\n"
            "6. Application names or UI elements\n"
            "\n"
            "Focus on information that would help someone understand what was being discussed.\n"
            "Be concise but include all important details. If there's code, preserve it exactly."
        ),
        "dashboard": (
            "Analyze this dashboard/monitoring panel. Extract:\n"
            "1. Panel titles and metrics names\n"
            "2. Current values and units\n"
            "3. Trends (up/down/stable)\n"
            "4. Alerts or warnings\n"
            "5. Time ranges shown\n"
            "6. Any anomalies or notable patterns\n"
            "\n"
            "Format as structured data."
        ),
        "code": (
            "Analyze this code screenshot. Extract:\n"
            "1. Programming language\n"
            "2. File name or path (if visible)\n"
            "3. Code content (preserve exact formatting)\n"
            "4. Comments\n"
            "5. Function/class names\n"
            "6. Any error messages or warnings\n"
            "\n"
            "Preserve code exactly as shown."
        ),
        "console": (
            "Analyze this console/terminal output. Extract:\n"
            "1. Commands executed\n"
            "2. Output/results\n"
            "3. Error messages\n"
            "4. Warnings or status messages\n"
            "5. File paths or URLs\n"
            "\n"
            "Preserve formatting and structure."
        ),
    }

    def __init__(self, model: str = "llava:13b"):
        """
        Initialize vision processor.

        Args:
            model: Ollama vision model to use (llava:13b, llava:7b, llava-llama3, bakllava)

        Raises:
            ImportError: If the ``ollama`` Python package is not installed.
        """
        self.model = model
        self._client = None
        self._init_client()

    @staticmethod
    def _listed_model_name(entry) -> Optional[str]:
        """Return the model identifier of one ``list()`` entry, or None.

        Both the ``'model'`` and the ``'name'`` key are tried: older
        ollama-python clients used ``'name'``, while newer releases are
        believed to expose ``'model'`` (NOTE(review): confirm against the
        installed client version).
        """
        for key in ("model", "name"):
            try:
                value = entry[key]
            except (KeyError, TypeError, IndexError):
                continue
            if value:
                return value
        return None

    def _init_client(self) -> None:
        """Initialize Ollama client and make sure the model is available.

        The model check is best-effort: any failure while listing or pulling
        is logged and the processor carries on, so a model that is in fact
        usable is never blocked by a failed check.

        Raises:
            ImportError: If the ``ollama`` Python package is not installed.
        """
        # Keep the try body minimal so only a missing package is reported
        # as "not installed".
        try:
            import ollama
        except ImportError as exc:
            raise ImportError(
                "ollama package not installed. Run: pip install ollama\n"
                "Also install Ollama: https://ollama.ai/download"
            ) from exc

        self._client = ollama

        # Check if model is available
        try:
            listing = self._client.list()
            available_models = {
                name
                for name in map(self._listed_model_name, listing.get('models', []) or [])
                if name
            }

            # An untagged name refers to the ":latest" tag in Ollama, so
            # accept that spelling as well to avoid a needless pull.
            wanted = {self.model}
            if ':' not in self.model:
                wanted.add(f"{self.model}:latest")

            if available_models.isdisjoint(wanted):
                logger.warning(f"Model {self.model} not found locally.")
                logger.info(f"Pulling {self.model}... (this may take a few minutes)")
                self._client.pull(self.model)
                logger.info(f"✓ Model {self.model} downloaded")
            else:
                logger.info(f"Using local vision model: {self.model}")
        except Exception as e:
            # Deliberate best-effort: do not fail construction because the
            # availability check itself failed.
            logger.warning(f"Could not verify model availability: {e}")
            logger.info("Attempting to use model anyway...")

    def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
        """
        Analyze a single frame using local vision model.

        Args:
            image_path: Path to image file
            context: Context hint for analysis (meeting, dashboard, code, console).
                Unknown values fall back to the "meeting" prompt.

        Returns:
            Analyzed content description with surrounding whitespace removed,
            or an empty string if the model call fails for any reason.
        """
        prompt = self._PROMPTS.get(context, self._PROMPTS[self._DEFAULT_CONTEXT])

        try:
            # Use Ollama's chat API with vision
            response = self._client.chat(
                model=self.model,
                messages=[
                    {
                        'role': 'user',
                        'content': prompt,
                        'images': [image_path],
                    }
                ],
            )

            # Extract text from response
            return response['message']['content'].strip()
        except Exception as e:
            # One bad frame must not abort a whole batch; callers treat an
            # empty string as "no content".
            logger.error(f"Vision model error for {image_path}: {e}")
            return ""

    def process_frames(
        self,
        frames_info: List[Tuple[str, float]],
        context: str = "meeting",
        deduplicate: bool = True,
        similarity_threshold: float = 0.85
    ) -> List[Dict]:
        """
        Process multiple frames with vision analysis.

        Frames for which no text is produced are skipped. When deduplicating,
        each analysis is compared against the most recently *kept* analysis,
        not against skipped ones.

        Args:
            frames_info: List of (frame_path, timestamp) tuples
            context: Context hint for analysis
            deduplicate: Whether to remove similar consecutive analyses
            similarity_threshold: Threshold for considering analyses as duplicates (0-1)

        Returns:
            List of dicts with 'timestamp', 'text', and 'frame_path'
        """
        results = []
        prev_text = ""

        total = len(frames_info)
        logger.info(f"Starting vision analysis of {total} frames...")

        for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
            logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")

            text = self.analyze_frame(frame_path, context)

            if not text:
                logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
                continue

            # Deduplicate similar consecutive frames
            if deduplicate:
                similarity = self._text_similarity(prev_text, text)
                if similarity > similarity_threshold:
                    logger.debug(f"Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})")
                    continue

            results.append({
                'timestamp': timestamp,
                'text': text,
                'frame_path': frame_path,
            })

            prev_text = text

        logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
        return results

    def _text_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate similarity between two texts.

        ``autojunk`` is disabled on purpose. With the default heuristic,
        SequenceMatcher stops using any character that makes up more than
        about 1% of a 200+ character text as a match anchor. In prose that
        is nearly every letter, so near-identical long texts could score
        close to 0 and slip past deduplication. The exact comparison is
        slower, but the texts compared here are single model responses.

        Returns:
            Similarity score between 0 and 1
        """
        return SequenceMatcher(None, text1, text2, autojunk=False).ratio()
|
||||
Reference in New Issue
Block a user