"""
Vision-based frame analysis using local vision-language models via Ollama.

Better than OCR for understanding dashboards, code, and console output.
"""
|
|
import logging
import os

from difflib import SequenceMatcher
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Module-level logger, named after this module per stdlib convention.
logger = logging.getLogger(__name__)
class VisionProcessor:
    """Process frames using local vision models via Ollama.

    Frames are analyzed with a vision-language model (e.g. LLaVA) using
    context-specific prompts loaded from disk, optionally enriched with the
    audio transcript surrounding each frame's timestamp.
    """

    def __init__(self, model: str = "llava:13b", prompts_dir: Optional[str] = None):
        """
        Initialize vision processor.

        Args:
            model: Ollama vision model to use (llava:13b, llava:7b, llava-llama3, bakllava)
            prompts_dir: Directory containing prompt files (default: meetus/prompts/)

        Raises:
            ImportError: If the ``ollama`` package is not installed.
        """
        self.model = model
        self._client = None

        # Prompt files live next to this module unless explicitly overridden.
        if prompts_dir:
            self.prompts_dir = Path(prompts_dir)
        else:
            # Default to meetus/prompts/ relative to this file
            self.prompts_dir = Path(__file__).parent / "prompts"

        self._init_client()

    def _init_client(self):
        """Initialize the Ollama client and ensure the model is available locally."""
        try:
            import ollama
            self._client = ollama

            # Best-effort check that the requested model exists locally and
            # pull it if missing. Failures here are non-fatal: the later chat
            # call may still succeed, so we only log and continue.
            try:
                models = self._client.list()
                # NOTE(review): assumes list() returns a dict with a 'models'
                # list of dicts keyed by 'name' — confirm against the
                # installed ollama client version, which has changed this shape.
                available_models = [m['name'] for m in models.get('models', [])]

                if self.model not in available_models:
                    logger.warning(f"Model {self.model} not found locally.")
                    logger.info(f"Pulling {self.model}... (this may take a few minutes)")
                    self._client.pull(self.model)
                    logger.info(f"✓ Model {self.model} downloaded")
                else:
                    logger.info(f"Using local vision model: {self.model}")

            except Exception as e:
                logger.warning(f"Could not verify model availability: {e}")
                logger.info("Attempting to use model anyway...")

        except ImportError:
            raise ImportError(
                "ollama package not installed. Run: pip install ollama\n"
                "Also install Ollama: https://ollama.ai/download"
            )

    def _load_prompt(self, context: str) -> str:
        """
        Load the analysis prompt for a given context from disk.

        Args:
            context: Context name (meeting, dashboard, code, console)

        Returns:
            Prompt text, or a generic default prompt if no file exists.
        """
        prompt_file = self.prompts_dir / f"{context}.txt"

        if prompt_file.exists():
            return prompt_file.read_text(encoding='utf-8').strip()

        # Fall back to a generic prompt rather than failing the analysis.
        logger.warning(f"Prompt file not found: {prompt_file}, using default")
        return "Analyze this image and describe what you see in detail."

    def analyze_frame(self, image_path: str, context: str = "meeting", audio_context: str = "") -> str:
        """
        Analyze a single frame using the local vision model.

        Args:
            image_path: Path to image file
            context: Context hint for analysis (meeting, dashboard, code, console)
            audio_context: Optional audio transcript around this timestamp for context

        Returns:
            Analyzed content description, or "" if the model call failed.
        """
        prompt = self._load_prompt(context)

        # Prepend the surrounding transcript so the model can relate what is
        # on screen to what is being discussed.
        if audio_context:
            prompt = f"Audio context (what's being discussed around this time):\n{audio_context}\n\n{prompt}"

        try:
            # Ollama's chat API accepts image paths alongside the text prompt.
            response = self._client.chat(
                model=self.model,
                messages=[
                    {
                        'role': 'user',
                        'content': prompt,
                        'images': [image_path]
                    }
                ]
            )

            text = response['message']['content']
            return text.strip()

        except Exception as e:
            # Return "" so one bad frame does not abort a whole batch run.
            logger.error(f"Vision model error for {image_path}: {e}")
            return ""

    def process_frames(
        self,
        frames_info: List[Tuple[str, float]],
        context: str = "meeting",
        deduplicate: bool = True,
        similarity_threshold: float = 0.85,
        audio_segments: Optional[List[Dict]] = None
    ) -> List[Dict]:
        """
        Process multiple frames with vision analysis.

        Args:
            frames_info: List of (frame_path, timestamp) tuples
            context: Context hint for analysis
            deduplicate: Whether to remove similar consecutive analyses
            similarity_threshold: Threshold for considering analyses as duplicates (0-1)
            audio_segments: Optional list of dicts with 'timestamp' and 'text'
                keys, used to give the model conversational context per frame

        Returns:
            List of dicts with 'timestamp', 'text', and 'frame_path'
        """
        results = []
        prev_text = ""

        total = len(frames_info)
        logger.info(f"Starting vision analysis of {total} frames...")

        for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
            logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")

            # Audio transcript within ±30 seconds of this frame.
            audio_context = self._get_audio_context(timestamp, audio_segments, window=30)

            text = self.analyze_frame(frame_path, context, audio_context)

            if not text:
                logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
                continue

            logger.debug(f"Frame {idx} ({timestamp:.2f}s): Extracted {len(text)} chars")
            logger.debug(f"Content preview: {text[:150]}{'...' if len(text) > 150 else ''}")

            # Skip frames whose analysis is nearly identical to the last KEPT
            # frame (e.g. a static slide shown for several minutes).
            if deduplicate and prev_text:
                similarity = self._text_similarity(prev_text, text)
                logger.debug(f"Similarity to previous frame: {similarity:.2f} (threshold: {similarity_threshold})")
                if similarity > similarity_threshold:
                    logger.debug(f"⊘ Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})")
                    continue

            results.append({
                'timestamp': timestamp,
                'text': text,
                'frame_path': frame_path
            })

            # Only kept frames become the dedup baseline, so a long run of
            # near-duplicates is collapsed onto the first one.
            prev_text = text

        logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
        return results

    def _get_audio_context(self, timestamp: float, audio_segments: Optional[List[Dict]], window: int = 30) -> str:
        """
        Get the audio transcript around a given timestamp.

        Args:
            timestamp: Target timestamp in seconds
            audio_segments: List of audio segments with 'timestamp' and 'text' keys
            window: Time window in seconds (±window around timestamp)

        Returns:
            Concatenated audio text from the time window ("" if none).
        """
        if not audio_segments:
            return ""

        # Segments missing a 'timestamp' key default to 0 and are therefore
        # only matched near the very start of the recording.
        relevant = [seg for seg in audio_segments
                    if abs(seg.get('timestamp', 0) - timestamp) <= window]

        # "".join of an empty list is already "", so no special-casing needed.
        return " ".join(seg['text'] for seg in relevant)

    def _text_similarity(self, text1: str, text2: str) -> float:
        """
        Calculate similarity between two texts.

        Returns:
            Similarity score between 0 and 1 (difflib ratio).
        """
        return SequenceMatcher(None, text1, text2).ratio()
|