This commit is contained in:
Mariano Gabriel
2025-10-20 00:03:41 -03:00
parent a999bc9093
commit cd7b0aed07
11 changed files with 776 additions and 312 deletions

137
meetus/cache_manager.py Normal file
View File

@@ -0,0 +1,137 @@
"""
Manage caching for frames, transcripts, and analysis results.
"""
from pathlib import Path
import json
import logging
from typing import List, Tuple, Dict, Optional
logger = logging.getLogger(__name__)
class CacheManager:
    """Manage caching of intermediate processing results.

    Three kinds of artefacts are looked up, all named after ``video_name``:

    * Whisper transcript: ``<output_dir>/<video_name>.json``
    * Extracted frames: ``<frames_dir>/frame_*.jpg``
    * Analysis results: ``<output_dir>/<video_name>_<analysis_type>.json``

    When ``use_cache`` is false every ``get_*`` method reports a miss, while
    ``save_analysis`` and ``cache_exists`` keep working as usual.
    """

    def __init__(self, output_dir: Path, frames_dir: Path, video_name: str, use_cache: bool = True):
        """
        Initialize cache manager.

        Args:
            output_dir: Output directory for cached files
            frames_dir: Directory for cached frames
            video_name: Name of the video (stem)
            use_cache: Whether to use caching
        """
        self.output_dir = output_dir
        self.frames_dir = frames_dir
        self.video_name = video_name
        self.use_cache = use_cache

    def _analysis_path(self, analysis_type: str) -> Path:
        """Return the cache file path used for the given analysis type."""
        return self.output_dir / f"{self.video_name}_{analysis_type}.json"

    @staticmethod
    def _timestamp_from_name(frame_path: Path) -> float:
        """Parse the trailing ``<seconds>s`` token of a frame filename.

        For ``frame_00001_12.34s.jpg`` this yields ``12.34``. Names whose last
        underscore-separated token is not a number fall back to ``0.0``.
        """
        token = frame_path.stem.split('_')[-1].rstrip('s')
        try:
            return float(token)
        except ValueError:
            # Only a malformed number is expected here; anything else
            # (e.g. KeyboardInterrupt) must keep propagating.
            return 0.0

    def get_whisper_cache(self) -> Optional[Path]:
        """
        Check for cached Whisper transcript.

        Returns:
            Path to cached transcript or None
        """
        if not self.use_cache:
            return None

        cache_path = self.output_dir / f"{self.video_name}.json"
        if cache_path.exists():
            logger.info(f"✓ Found cached Whisper transcript: {cache_path.name}")
            return cache_path
        return None

    def get_frames_cache(self) -> Optional[List[Tuple[str, float]]]:
        """
        Check for cached frames.

        Returns:
            List of (frame_path, timestamp) tuples sorted by path, or None
            when caching is disabled or no frames are present
        """
        if not self.use_cache or not self.frames_dir.exists():
            return None

        existing_frames = sorted(self.frames_dir.glob("frame_*.jpg"))
        if not existing_frames:
            return None

        logger.info(f"✓ Found {len(existing_frames)} cached frames in {self.frames_dir.name}/")

        # Rebuild frames_info from the filenames alone
        return [
            (str(frame_path), self._timestamp_from_name(frame_path))
            for frame_path in existing_frames
        ]

    def get_analysis_cache(self, analysis_type: str) -> Optional[List[Dict]]:
        """
        Check for cached analysis results.

        A cache file that cannot be decoded, or that does not hold a JSON
        list, is treated as a miss so the caller recomputes the analysis
        instead of aborting the whole run.

        Args:
            analysis_type: 'vision' or 'ocr'

        Returns:
            List of analysis results or None
        """
        if not self.use_cache:
            return None

        cache_path = self._analysis_path(analysis_type)
        if not cache_path.exists():
            return None

        logger.info(f"✓ Found cached {analysis_type} analysis: {cache_path.name}")
        try:
            with open(cache_path, 'r', encoding='utf-8') as f:
                results = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            logger.warning(f"Ignoring unreadable {analysis_type} cache {cache_path.name}: {exc}")
            return None

        if not isinstance(results, list):
            logger.warning(f"Ignoring malformed {analysis_type} cache {cache_path.name}: expected a list")
            return None

        logger.info(f"✓ Loaded {len(results)} analyzed frames from cache")
        return results

    def save_analysis(self, analysis_type: str, results: List[Dict]):
        """
        Save analysis results to cache.

        Args:
            analysis_type: 'vision' or 'ocr'
            results: Analysis results to save
        """
        cache_path = self._analysis_path(analysis_type)
        with open(cache_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)
        logger.info(f"✓ Saved {analysis_type} analysis to: {cache_path.name}")

    def cache_exists(self, analysis_type: Optional[str] = None) -> Dict[str, bool]:
        """
        Check what caches exist.

        The ``use_cache`` flag is not consulted; this reports what is on disk.

        Args:
            analysis_type: Optional specific analysis type to check. When
                omitted, both 'vision' and 'ocr' are reported.

        Returns:
            Dictionary of cache status
        """
        has_frames = self.frames_dir.exists() and any(self.frames_dir.glob("frame_*.jpg"))
        status = {
            "whisper": (self.output_dir / f"{self.video_name}.json").exists(),
            "frames": has_frames,
        }

        kinds = [analysis_type] if analysis_type else ["vision", "ocr"]
        for kind in kinds:
            status[kind] = self._analysis_path(kind).exists()
        return status

135
meetus/output_manager.py Normal file
View File

@@ -0,0 +1,135 @@
"""
Manage output directories and manifest files.
Creates timestamped folders for each video and tracks processing options.
"""
from pathlib import Path
from datetime import datetime
import json
import logging
from typing import Dict, Any, Optional
logger = logging.getLogger(__name__)
class OutputManager:
    """Manage output directories and manifest files for video processing.

    Each video gets a directory named ``<YYYYmmdd_HHMMSS>-<video stem>`` under
    the base output directory, containing a ``frames`` sub-directory and,
    once saved, a ``manifest.json``.
    """

    # Format of the timestamp prefix used in output directory names.
    _TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S"
    _TIMESTAMP_LENGTH = len("YYYYmmdd_HHMMSS")

    def __init__(self, video_path: Path, base_output_dir: str = "output", use_cache: bool = True):
        """
        Initialize output manager.

        Creates the output directory and its ``frames`` sub-directory if they
        do not exist yet.

        Args:
            video_path: Path to the video file being processed
            base_output_dir: Base directory for all outputs
            use_cache: Whether to use existing directories if found
        """
        self.video_path = video_path
        self.base_output_dir = Path(base_output_dir)
        self.use_cache = use_cache

        # Find or create output directory
        self.output_dir = self._get_or_create_output_dir()
        self.frames_dir = self.output_dir / "frames"
        self.frames_dir.mkdir(exist_ok=True)
        logger.info(f"Output directory: {self.output_dir}")

    @classmethod
    def _is_output_dir_for(cls, dir_name: str, video_name: str) -> bool:
        """Tell whether ``dir_name`` is exactly ``<timestamp>-<video_name>``.

        A plain suffix check is not enough: the directory of a video called
        ``my-demo`` also ends with ``-demo`` and would be picked up for a
        video called ``demo``.
        """
        suffix = f"-{video_name}"
        if not dir_name.endswith(suffix):
            return False

        prefix = dir_name[:-len(suffix)]
        if len(prefix) != cls._TIMESTAMP_LENGTH:
            return False
        try:
            datetime.strptime(prefix, cls._TIMESTAMP_FORMAT)
        except ValueError:
            return False
        return True

    def _get_or_create_output_dir(self) -> Path:
        """
        Get existing output directory or create a new timestamped one.

        Returns:
            Path to output directory
        """
        video_name = self.video_path.stem

        # Look for existing directories if caching is enabled
        if self.use_cache and self.base_output_dir.exists():
            existing_dirs = sorted(
                (
                    d for d in self.base_output_dir.iterdir()
                    if d.is_dir() and self._is_output_dir_for(d.name, video_name)
                ),
                reverse=True,  # Timestamp prefix sorts chronologically: most recent first
            )
            if existing_dirs:
                logger.info(f"Found existing output: {existing_dirs[0].name}")
                return existing_dirs[0]

        # Create new timestamped directory
        timestamp = datetime.now().strftime(self._TIMESTAMP_FORMAT)
        dir_name = f"{timestamp}-{video_name}"
        output_dir = self.base_output_dir / dir_name
        output_dir.mkdir(parents=True, exist_ok=True)
        logger.info(f"Created new output directory: {dir_name}")
        return output_dir

    def get_path(self, filename: str) -> Path:
        """Get full path for a file in the output directory."""
        return self.output_dir / filename

    def get_frames_path(self, filename: str) -> Path:
        """Get full path for a file in the frames directory."""
        return self.frames_dir / filename

    def save_manifest(self, config: Dict[str, Any]):
        """
        Save processing configuration to manifest.json.

        The Whisper and analysis output names are derived from ``config``.
        Both the flat form (``run_whisper`` / ``use_vision`` keys) and the
        nested form (``whisper.enabled`` / ``analysis.method == "vision"``)
        are understood.

        Args:
            config: Dictionary of processing options
        """
        manifest_path = self.output_dir / "manifest.json"
        stem = self.video_path.stem

        whisper_section = config.get("whisper")
        analysis_section = config.get("analysis")
        ran_whisper = bool(config.get("run_whisper")) or (
            isinstance(whisper_section, dict) and bool(whisper_section.get("enabled"))
        )
        used_vision = bool(config.get("use_vision")) or (
            isinstance(analysis_section, dict) and analysis_section.get("method") == "vision"
        )

        manifest = {
            "video": {
                "name": self.video_path.name,
                "path": str(self.video_path.absolute()),
            },
            "processed_at": datetime.now().isoformat(),
            "configuration": config,
            "outputs": {
                "frames": str(self.frames_dir.relative_to(self.output_dir)),
                "enhanced_transcript": f"{stem}_enhanced.txt",
                "whisper_transcript": f"{stem}.json" if ran_whisper else None,
                "analysis": f"{stem}_{'vision' if used_vision else 'ocr'}.json",
            },
        }

        with open(manifest_path, 'w', encoding='utf-8') as f:
            json.dump(manifest, f, indent=2, ensure_ascii=False)
        logger.info(f"Saved manifest: {manifest_path}")

    def load_manifest(self) -> Optional[Dict[str, Any]]:
        """
        Load existing manifest if it exists.

        Returns:
            Manifest dictionary or None
        """
        manifest_path = self.output_dir / "manifest.json"
        if manifest_path.exists():
            with open(manifest_path, 'r', encoding='utf-8') as f:
                return json.load(f)
        return None

    def list_outputs(self) -> Dict[str, Any]:
        """
        List all output files in the directory.

        Returns:
            Dictionary with the output directory, an existence flag per
            known output file, and the number of ``*.jpg`` frames
        """
        video_name = self.video_path.stem
        return {
            "output_dir": str(self.output_dir),
            "manifest": (self.output_dir / "manifest.json").exists(),
            "enhanced_transcript": (self.output_dir / f"{video_name}_enhanced.txt").exists(),
            "whisper_transcript": (self.output_dir / f"{video_name}.json").exists(),
            "vision_analysis": (self.output_dir / f"{video_name}_vision.json").exists(),
            "ocr_analysis": (self.output_dir / f"{video_name}_ocr.json").exists(),
            "frames": len(list(self.frames_dir.glob("*.jpg"))) if self.frames_dir.exists() else 0,
        }

9
meetus/prompts/code.txt Normal file
View File

@@ -0,0 +1,9 @@
Analyze this code screenshot. Extract:
1. Programming language
2. File name or path (if visible)
3. Code content (preserve exact formatting)
4. Comments
5. Function/class names
6. Any error messages or warnings
Preserve code exactly as shown.

View File

@@ -0,0 +1,8 @@
Analyze this console/terminal output. Extract:
1. Commands executed
2. Output/results
3. Error messages
4. Warnings or status messages
5. File paths or URLs
Preserve formatting and structure.

View File

@@ -0,0 +1,9 @@
Analyze this dashboard/monitoring panel. Extract:
1. Panel titles and metric names
2. Current values and units
3. Trends (up/down/stable)
4. Alerts or warnings
5. Time ranges shown
6. Any anomalies or notable patterns
Format as structured data.

View File

@@ -0,0 +1,10 @@
Analyze this screen capture from a meeting recording. Extract:
1. Any visible text (titles, labels, headings)
2. Key metrics, numbers, or data points shown
3. Dashboard panels or visualizations (describe what they show)
4. Code snippets (preserve formatting and context)
5. Console/terminal output (commands and results)
6. Application names or UI elements
Focus on information that would help someone understand what was being discussed.
Be concise but include all important details. If there's code, preserve it exactly.

View File

@@ -6,6 +6,7 @@ from typing import List, Tuple, Dict, Optional
from pathlib import Path
import logging
from difflib import SequenceMatcher
import os
logger = logging.getLogger(__name__)
@@ -13,15 +14,24 @@ logger = logging.getLogger(__name__)
class VisionProcessor:
"""Process frames using local vision models via Ollama."""
def __init__(self, model: str = "llava:13b"):
def __init__(self, model: str = "llava:13b", prompts_dir: Optional[str] = None):
"""
Initialize vision processor.
Args:
model: Ollama vision model to use (llava:13b, llava:7b, llava-llama3, bakllava)
prompts_dir: Directory containing prompt files (default: meetus/prompts/)
"""
self.model = model
self._client = None
# Set prompts directory
if prompts_dir:
self.prompts_dir = Path(prompts_dir)
else:
# Default to meetus/prompts/ relative to this file
self.prompts_dir = Path(__file__).parent / "prompts"
self._init_client()
def _init_client(self):
@@ -53,6 +63,26 @@ class VisionProcessor:
"Also install Ollama: https://ollama.ai/download"
)
def _load_prompt(self, context: str) -> str:
"""
Load prompt from file.
Args:
context: Context name (meeting, dashboard, code, console)
Returns:
Prompt text
"""
prompt_file = self.prompts_dir / f"{context}.txt"
if prompt_file.exists():
with open(prompt_file, 'r', encoding='utf-8') as f:
return f.read().strip()
else:
# Fallback to default prompt
logger.warning(f"Prompt file not found: {prompt_file}, using default")
return "Analyze this image and describe what you see in detail."
def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
"""
Analyze a single frame using local vision model.
@@ -64,50 +94,8 @@ class VisionProcessor:
Returns:
Analyzed content description
"""
# Context-specific prompts
prompts = {
"meeting": """Analyze this screen capture from a meeting recording. Extract:
1. Any visible text (titles, labels, headings)
2. Key metrics, numbers, or data points shown
3. Dashboard panels or visualizations (describe what they show)
4. Code snippets (preserve formatting and context)
5. Console/terminal output (commands and results)
6. Application names or UI elements
Focus on information that would help someone understand what was being discussed.
Be concise but include all important details. If there's code, preserve it exactly.""",
"dashboard": """Analyze this dashboard/monitoring panel. Extract:
1. Panel titles and metrics names
2. Current values and units
3. Trends (up/down/stable)
4. Alerts or warnings
5. Time ranges shown
6. Any anomalies or notable patterns
Format as structured data.""",
"code": """Analyze this code screenshot. Extract:
1. Programming language
2. File name or path (if visible)
3. Code content (preserve exact formatting)
4. Comments
5. Function/class names
6. Any error messages or warnings
Preserve code exactly as shown.""",
"console": """Analyze this console/terminal output. Extract:
1. Commands executed
2. Output/results
3. Error messages
4. Warnings or status messages
5. File paths or URLs
Preserve formatting and structure."""
}
prompt = prompts.get(context, prompts["meeting"])
# Load prompt from file
prompt = self._load_prompt(context)
try:
# Use Ollama's chat API with vision

316
meetus/workflow.py Normal file
View File

@@ -0,0 +1,316 @@
"""
Orchestrate the video processing workflow.
Coordinates frame extraction, analysis, and transcript merging.
"""
from pathlib import Path
import logging
import subprocess
import shutil
from typing import Dict, Any, Optional
from .output_manager import OutputManager
from .cache_manager import CacheManager
from .frame_extractor import FrameExtractor
from .ocr_processor import OCRProcessor
from .vision_processor import VisionProcessor
from .transcript_merger import TranscriptMerger
logger = logging.getLogger(__name__)
class WorkflowConfig:
    """Hold every option that controls one run of the processing workflow."""

    def __init__(self, **kwargs):
        """
        Populate the configuration from keyword arguments.

        Only ``video`` is required (a ``KeyError`` is raised without it);
        every other option falls back to the default shown below.
        """
        option = kwargs.get

        # Video and paths
        self.video_path = Path(kwargs['video'])
        self.transcript_path = option('transcript')
        self.output_dir = option('output_dir', 'output')
        self.custom_output = option('output')

        # Whisper options
        self.run_whisper = option('run_whisper', False)
        self.whisper_model = option('whisper_model', 'base')

        # Frame extraction
        self.scene_detection = option('scene_detection', False)
        self.interval = option('interval', 5)

        # Analysis options
        self.use_vision = option('use_vision', False)
        self.vision_model = option('vision_model', 'llava:13b')
        self.vision_context = option('vision_context', 'meeting')
        self.ocr_engine = option('ocr_engine', 'tesseract')

        # Processing options
        self.no_deduplicate = option('no_deduplicate', False)
        self.no_cache = option('no_cache', False)
        self.extract_only = option('extract_only', False)
        self.format = option('format', 'detailed')

    def to_dict(self) -> Dict[str, Any]:
        """
        Summarise the configuration as a nested dictionary for the manifest.

        Settings that do not apply to the selected extraction or analysis
        method are reported as ``None``.
        """
        whisper_section = {
            "enabled": self.run_whisper,
            "model": self.whisper_model,
        }

        if self.scene_detection:
            extraction_section = {"method": "scene_detection", "interval_seconds": None}
        else:
            extraction_section = {"method": "interval", "interval_seconds": self.interval}

        if self.use_vision:
            analysis_section = {
                "method": "vision",
                "vision_model": self.vision_model,
                "vision_context": self.vision_context,
                "ocr_engine": None,
            }
        else:
            analysis_section = {
                "method": "ocr",
                "vision_model": None,
                "vision_context": None,
                "ocr_engine": self.ocr_engine,
            }
        analysis_section["deduplication"] = not self.no_deduplicate

        return {
            "whisper": whisper_section,
            "frame_extraction": extraction_section,
            "analysis": analysis_section,
            "output_format": self.format,
        }
class ProcessingWorkflow:
    """Drive a video through transcription, frame extraction, analysis and merging."""

    def __init__(self, config: WorkflowConfig):
        """
        Set up the output and cache managers for one video.

        Args:
            config: Workflow configuration
        """
        caching_enabled = not config.no_cache

        self.config = config
        self.output_mgr = OutputManager(
            config.video_path,
            config.output_dir,
            use_cache=caching_enabled,
        )
        self.cache_mgr = CacheManager(
            self.output_mgr.output_dir,
            self.output_mgr.frames_dir,
            config.video_path.stem,
            use_cache=caching_enabled,
        )

    def run(self) -> Dict[str, Any]:
        """
        Execute all workflow steps in order.

        In extract-only mode the run stops after frame analysis, before the
        transcript merge and the manifest write.

        Returns:
            Dictionary with output paths and status

        Raises:
            RuntimeError: If no frames could be extracted
        """
        cfg = self.config
        rule = "=" * 80

        logger.info(rule)
        logger.info("MEETING PROCESSOR")
        logger.info(rule)
        logger.info(f"Video: {cfg.video_path.name}")
        analysis_label = 'Vision Model' if cfg.use_vision else f'OCR ({cfg.ocr_engine})'
        logger.info(f"Analysis: {analysis_label}")
        if cfg.use_vision:
            logger.info(f"Vision Model: {cfg.vision_model}")
            logger.info(f"Context: {cfg.vision_context}")
        extraction_label = 'Scene detection' if cfg.scene_detection else f'Every {cfg.interval}s'
        logger.info(f"Frame extraction: {extraction_label}")
        caching_label = 'Disabled' if cfg.no_cache else 'Enabled'
        logger.info(f"Caching: {caching_label}")
        logger.info(rule)

        # Step 0: Whisper transcription
        transcript = self._run_whisper()

        # Step 1: Extract frames
        frames = self._extract_frames()
        if not frames:
            logger.error("No frames extracted")
            raise RuntimeError("Frame extraction failed")

        # Step 2: Analyze frames
        segments = self._analyze_frames(frames)

        if cfg.extract_only:
            logger.info("Done! (extract-only mode)")
            return self._build_result(transcript, segments)

        # Step 3: Merge with transcript, then record what was done
        enhanced = self._merge_transcripts(transcript, segments)
        self.output_mgr.save_manifest(cfg.to_dict())
        return self._build_result(transcript, segments, enhanced)

    def _run_whisper(self) -> Optional[str]:
        """
        Produce the audio transcript path, running Whisper when requested.

        Returns:
            The configured transcript path when Whisper is not requested,
            otherwise the path of the cached or freshly generated transcript

        Raises:
            RuntimeError: If the whisper executable is not on PATH, or if it
                finishes without producing the expected JSON file
            subprocess.CalledProcessError: If the whisper command fails
        """
        cfg = self.config
        if not cfg.run_whisper:
            return cfg.transcript_path

        # Reuse an earlier transcript when one is cached
        cached = self.cache_mgr.get_whisper_cache()
        if cached:
            return str(cached)

        rule = "=" * 80
        logger.info(rule)
        logger.info("STEP 0: Running Whisper Transcription")
        logger.info(rule)

        # The transcription is delegated to the whisper command-line tool
        if not shutil.which("whisper"):
            logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
            raise RuntimeError("Whisper not installed")

        logger.info(f"Running Whisper transcription (model: {cfg.whisper_model})...")
        logger.info("This may take a few minutes depending on video length...")

        command = ["whisper", str(cfg.video_path)]
        command += ["--model", cfg.whisper_model]
        command += ["--output_format", "json"]
        command += ["--output_dir", str(self.output_mgr.output_dir)]

        try:
            subprocess.run(command, check=True, capture_output=True, text=True)
        except subprocess.CalledProcessError as err:
            logger.error(f"Whisper failed: {err.stderr}")
            raise

        produced = self.output_mgr.get_path(f"{cfg.video_path.stem}.json")
        if not produced.exists():
            logger.error("Whisper completed but transcript file not found")
            raise RuntimeError("Whisper output missing")

        logger.info(f"✓ Whisper transcription completed: {produced.name}")
        logger.info("")
        return str(produced)

    def _extract_frames(self):
        """Return cached frames if present, otherwise extract them from the video."""
        logger.info("Step 1: Extracting frames from video...")

        from_cache = self.cache_mgr.get_frames_cache()
        if from_cache:
            return from_cache

        cfg = self.config
        extractor = FrameExtractor(str(cfg.video_path), str(self.output_mgr.frames_dir))
        if cfg.scene_detection:
            extracted = extractor.extract_scene_changes()
        else:
            extracted = extractor.extract_by_interval(cfg.interval)

        logger.info(f"✓ Extracted {len(extracted)} frames")
        return extracted

    def _analyze_frames(self, frames_info):
        """Return cached analysis if present, otherwise run vision or OCR analysis."""
        use_vision = self.config.use_vision
        kind = 'vision' if use_vision else 'ocr'

        from_cache = self.cache_mgr.get_analysis_cache(kind)
        if from_cache:
            return from_cache

        analyze = self._run_vision_analysis if use_vision else self._run_ocr_analysis
        return analyze(frames_info)

    def _run_vision_analysis(self, frames_info):
        """Analyze frames with the configured vision model and cache the result."""
        logger.info("Step 2: Running vision analysis on extracted frames...")
        cfg = self.config

        try:
            processor = VisionProcessor(model=cfg.vision_model)
            segments = processor.process_frames(
                frames_info,
                context=cfg.vision_context,
                deduplicate=not cfg.no_deduplicate,
            )
            logger.info(f"✓ Analyzed {len(segments)} frames with vision model")

            # Keep the results for later runs
            self.cache_mgr.save_analysis('vision', segments)
            return segments
        except ImportError as err:
            logger.error(f"{err}")
            raise

    def _run_ocr_analysis(self, frames_info):
        """Run the configured OCR engine over the frames and cache the result."""
        logger.info("Step 2: Running OCR on extracted frames...")
        cfg = self.config

        try:
            processor = OCRProcessor(engine=cfg.ocr_engine)
            segments = processor.process_frames(
                frames_info,
                deduplicate=not cfg.no_deduplicate,
            )
            logger.info(f"✓ Processed {len(segments)} frames with OCR")

            # Keep the results for later runs
            self.cache_mgr.save_analysis('ocr', segments)
            return segments
        except ImportError as err:
            logger.error(f"{err}")
            logger.error(f"To install {cfg.ocr_engine}:")
            logger.error(f" pip install {cfg.ocr_engine}")
            raise

    def _merge_transcripts(self, transcript_path, screen_segments):
        """
        Combine audio and screen content and write the enhanced transcript.

        A missing or absent audio transcript is not an error: the output is
        then built from the screen segments alone.

        Returns:
            The path the enhanced transcript was written to (the custom
            output when configured, otherwise a file in the output directory)
        """
        merger = TranscriptMerger()
        audio_segments = []

        if not transcript_path:
            logger.info("No transcript provided, using screen content only...")
        else:
            logger.info("Step 3: Merging with Whisper transcript...")
            transcript_file = Path(transcript_path)
            if transcript_file.exists():
                audio_segments = merger.load_whisper_transcript(str(transcript_file))
                logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
            else:
                logger.warning(f"Transcript not found: {transcript_path}")
                logger.info("Proceeding with screen content only...")

        # Merge and format
        merged = merger.merge_transcripts(audio_segments, screen_segments)
        formatted = merger.format_for_claude(merged, format_style=self.config.format)

        # An explicit output location takes precedence over the default one
        destination = self.config.custom_output
        if not destination:
            destination = self.output_mgr.get_path(f"{self.config.video_path.stem}_enhanced.txt")
        merger.save_transcript(formatted, str(destination))

        rule = "=" * 80
        logger.info(rule)
        logger.info("✓ PROCESSING COMPLETE!")
        logger.info(rule)
        logger.info(f"Output directory: {self.output_mgr.output_dir}")
        logger.info(f"Enhanced transcript: {Path(destination).name}")
        logger.info("")

        return destination

    def _build_result(self, transcript_path=None, screen_segments=None, enhanced_transcript=None):
        """Assemble the summary dictionary returned by ``run``."""
        analysis_kind = 'vision' if self.config.use_vision else 'ocr'
        segment_count = len(screen_segments) if screen_segments else 0

        return {
            "output_dir": str(self.output_mgr.output_dir),
            "transcript": transcript_path,
            "analysis": f"{self.config.video_path.stem}_{analysis_kind}.json",
            "frames_count": segment_count,
            "enhanced_transcript": enhanced_transcript,
            "manifest": str(self.output_mgr.get_path("manifest.json")),
        }