This commit is contained in:
Mariano Gabriel
2025-10-20 00:03:41 -03:00
parent a999bc9093
commit cd7b0aed07
11 changed files with 776 additions and 312 deletions

View File

@@ -6,6 +6,7 @@ from typing import List, Tuple, Dict, Optional
from pathlib import Path
import logging
from difflib import SequenceMatcher
import os
logger = logging.getLogger(__name__)
@@ -13,15 +14,24 @@ logger = logging.getLogger(__name__)
class VisionProcessor:
"""Process frames using local vision models via Ollama."""
def __init__(self, model: str = "llava:13b"):
def __init__(self, model: str = "llava:13b", prompts_dir: Optional[str] = None):
    """
    Create a vision processor and set up its client.

    Args:
        model: Ollama vision model to use (llava:13b, llava:7b, llava-llama3, bakllava)
        prompts_dir: Directory containing prompt files. Any falsy value
            (None or an empty string) selects the "prompts" directory
            located next to this module.
    """
    self.model = model
    self._client = None

    # An explicit directory wins; otherwise fall back to the "prompts"
    # folder that sits beside this source file.
    self.prompts_dir = (
        Path(prompts_dir) if prompts_dir else Path(__file__).parent / "prompts"
    )

    # Client setup is delegated to _init_client().
    self._init_client()
def _init_client(self):
@@ -53,6 +63,26 @@ class VisionProcessor:
"Also install Ollama: https://ollama.ai/download"
)
def _load_prompt(self, context: str) -> str:
"""
Load prompt from file.
Args:
context: Context name (meeting, dashboard, code, console)
Returns:
Prompt text
"""
prompt_file = self.prompts_dir / f"{context}.txt"
if prompt_file.exists():
with open(prompt_file, 'r', encoding='utf-8') as f:
return f.read().strip()
else:
# Fallback to default prompt
logger.warning(f"Prompt file not found: {prompt_file}, using default")
return "Analyze this image and describe what you see in detail."
def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
"""
Analyze a single frame using local vision model.
@@ -64,50 +94,8 @@ class VisionProcessor:
Returns:
Analyzed content description
"""
# Context-specific prompts
prompts = {
"meeting": """Analyze this screen capture from a meeting recording. Extract:
1. Any visible text (titles, labels, headings)
2. Key metrics, numbers, or data points shown
3. Dashboard panels or visualizations (describe what they show)
4. Code snippets (preserve formatting and context)
5. Console/terminal output (commands and results)
6. Application names or UI elements
Focus on information that would help someone understand what was being discussed.
Be concise but include all important details. If there's code, preserve it exactly.""",
"dashboard": """Analyze this dashboard/monitoring panel. Extract:
1. Panel titles and metrics names
2. Current values and units
3. Trends (up/down/stable)
4. Alerts or warnings
5. Time ranges shown
6. Any anomalies or notable patterns
Format as structured data.""",
"code": """Analyze this code screenshot. Extract:
1. Programming language
2. File name or path (if visible)
3. Code content (preserve exact formatting)
4. Comments
5. Function/class names
6. Any error messages or warnings
Preserve code exactly as shown.""",
"console": """Analyze this console/terminal output. Extract:
1. Commands executed
2. Output/results
3. Error messages
4. Warnings or status messages
5. File paths or URLs
Preserve formatting and structure."""
}
prompt = prompts.get(context, prompts["meeting"])
# Load prompt from file
prompt = self._load_prompt(context)
try:
# Use Ollama's chat API with vision