refactor

2025-10-20 00:03:41 -03:00
parent a999bc9093
commit cd7b0aed07
11 changed files with 776 additions and 312 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,10 +2,11 @@
 samples/*
 !samples/.gitkeep
-# Output files
+# Output directories (timestamped folders for each video)
 output/*
 !output/.gitkeep
-# Extracted frames
+# Python cache
 frames/
 __pycache__
 *.pyc
 .pytest_cache/
--- a/README.md
+++ b/README.md
@@ -184,22 +184,53 @@ python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --verbo
 ## Output Files
-All output files are saved to the `output/` directory by default:
+Each video gets its own timestamped output directory:
- **`output/<video>_enhanced.txt`** - Enhanced transcript ready for AI summarization
+```
- **`output/<video>.json`** - Whisper transcript (if `--run-whisper` was used)
+output/
- **`output/<video>_vision.json`** - Vision analysis results with timestamps (if `--use-vision`)
+└── 20241019_143022-meeting/
- **`output/<video>_ocr.json`** - OCR results with timestamps (if using OCR)
+    ├── manifest.json                    # Processing configuration
- **`frames/`** - Extracted video frames (JPG files)
+    ├── meeting_enhanced.txt             # Enhanced transcript for AI
    ├── meeting.json                     # Whisper transcript
    ├── meeting_vision.json              # Vision analysis results
    └── frames/                          # Extracted video frames
        ├── frame_00001_5.00s.jpg
        ├── frame_00002_10.00s.jpg
        └── ...
 ```
 ### Manifest File
 Each processing run creates a `manifest.json` that tracks:
 - Video information (name, path)
 - Processing timestamp
 - Configuration used (Whisper model, vision settings, etc.)
 - Output file locations
 Example manifest:
 ```json
 {
  "video": {
    "name": "meeting.mkv",
    "path": "/full/path/to/meeting.mkv"
  },
  "processed_at": "2024-10-19T14:30:22",
  "configuration": {
    "whisper": {"enabled": true, "model": "base"},
    "analysis": {"method": "vision", "vision_model": "llava:13b", "vision_context": "code"}
  }
 }
 ```
 ### Caching Behavior
-The tool automatically caches intermediate results to speed up re-runs:
+The tool automatically reuses the most recent output directory for the same video:
- **Whisper transcript**: Cached as `output/<video>.json`
+- **First run**: Creates new timestamped directory (e.g., `20241019_143022-meeting/`)
- **Extracted frames**: Cached in `frames/<video>_*.jpg`
+- **Subsequent runs**: Reuses the same directory and cached results
- **Analysis results**: Cached as `output/<video>_vision.json` or `output/<video>_ocr.json`
+- **Cached items**: Whisper transcript, extracted frames, analysis results
 - **Force new run**: Use `--no-cache` to create a fresh directory
-Re-running with the same video will use cached results unless `--no-cache` is specified.
+This means you can instantly switch between OCR and vision analysis without re-extracting frames!
 ## Workflow for Meeting Analysis
@@ -310,6 +341,15 @@ Options:
 - **`--vision-context dashboard`**: Extracts metrics, trends, panel names
 - **`--vision-context console`**: Captures commands, output, error messages
 **Customizing Prompts:**
 Prompts are stored as editable text files in `meetus/prompts/`:
 - `meeting.txt` - General meeting analysis
 - `code.txt` - Code screenshot analysis
 - `dashboard.txt` - Dashboard/monitoring analysis
 - `console.txt` - Terminal/console analysis
 Just edit these files to customize how the vision model analyzes your frames!
 ### Scene Detection vs Interval
 - **Scene detection**: Better for presentations with distinct slides. More efficient.
 - **Interval extraction**: Better for continuous screen sharing (coding, browsing). More thorough.
@@ -384,16 +424,31 @@ sudo apt-get install tesseract-ocr  # Don't forget system package!
 ```
 meetus/
-├── meetus/                  # Main package
+├── meetus/                     # Main package
 │   ├── __init__.py
-│   ├── frame_extractor.py   # Video frame extraction
+│   ├── workflow.py             # Processing orchestrator
-│   ├── ocr_processor.py     # OCR processing
+│   ├── output_manager.py       # Output directory & manifest management
-│   └── transcript_merger.py # Transcript merging
+│   ├── cache_manager.py        # Caching logic
-├── process_meeting.py       # Main CLI script
+│   ├── frame_extractor.py      # Video frame extraction
-├── requirements.txt         # Python dependencies
+│   ├── vision_processor.py     # Vision model analysis (Ollama/LLaVA)
-└── README.md               # This file
+│   ├── ocr_processor.py        # OCR processing
 │   ├── transcript_merger.py    # Transcript merging
 │   └── prompts/                # Vision analysis prompts (editable!)
 │       ├── meeting.txt         # General meeting analysis
 │       ├── code.txt            # Code screenshot analysis
 │       ├── dashboard.txt       # Dashboard/monitoring analysis
 │       └── console.txt         # Terminal/console analysis
 ├── process_meeting.py          # Main CLI script (thin wrapper)
 ├── requirements.txt            # Python dependencies
 ├── output/                     # Timestamped output directories
 │   ├── .gitkeep
 │   └── YYYYMMDD_HHMMSS-video/  # Auto-generated per video
 ├── samples/                    # Sample videos (gitignored)
 └── README.md                   # This file
 ```
 The code is modular and easy to extend - each module has a single responsibility.
 ## License
 For personal use.
--- a/meetus/cache_manager.py
+++ b/meetus/cache_manager.py
@@ -0,0 +1,137 @@
 """
 Manage caching for frames, transcripts, and analysis results.
 """
 from pathlib import Path
 import json
 import logging
 from typing import List, Tuple, Dict, Optional
 logger = logging.getLogger(__name__)


 class CacheManager:
     """Look up and store cached intermediate results for one video.

     Cached artifacts live in ``output_dir`` (the Whisper transcript and the
     analysis JSON files, all named after ``video_name``) and in
     ``frames_dir`` (extracted ``frame_*.jpg`` images). When ``use_cache`` is
     False every ``get_*`` lookup reports a miss; ``save_analysis`` and
     ``cache_exists`` ignore the flag.
     """

     def __init__(self, output_dir: Path, frames_dir: Path, video_name: str, use_cache: bool = True):
         """
         Initialize cache manager.

         Args:
             output_dir: Output directory for cached files
             frames_dir: Directory for cached frames
             video_name: Name of the video (stem)
             use_cache: Whether to use caching
         """
         self.output_dir = output_dir
         self.frames_dir = frames_dir
         self.video_name = video_name
         self.use_cache = use_cache

     def _whisper_path(self) -> Path:
         """Return the location of the cached Whisper transcript."""
         return self.output_dir / f"{self.video_name}.json"

     def _analysis_path(self, analysis_type: str) -> Path:
         """Return the location of the cached analysis file for *analysis_type*."""
         return self.output_dir / f"{self.video_name}_{analysis_type}.json"

     @staticmethod
     def _parse_timestamp(frame_path: Path) -> float:
         """Read the timestamp from a name like ``frame_00001_12.34s.jpg``.

         Returns 0.0 when the last underscore-separated part of the stem is
         not a number.
         """
         try:
             return float(frame_path.stem.split('_')[-1].rstrip('s'))
         except ValueError:
             # Only a malformed number is expected here; anything else
             # (e.g. KeyboardInterrupt) must not be swallowed.
             return 0.0

     def get_whisper_cache(self) -> Optional[Path]:
         """
         Check for cached Whisper transcript.

         Returns:
             Path to cached transcript or None
         """
         if not self.use_cache:
             return None
         cache_path = self._whisper_path()
         if cache_path.exists():
             logger.info(f"✓ Found cached Whisper transcript: {cache_path.name}")
             return cache_path
         return None

     def get_frames_cache(self) -> Optional[List[Tuple[str, float]]]:
         """
         Check for cached frames.

         Returns:
             List of (frame_path, timestamp) tuples sorted by path, or None
             when caching is disabled or no ``frame_*.jpg`` file exists
         """
         if not self.use_cache or not self.frames_dir.exists():
             return None
         existing_frames = sorted(self.frames_dir.glob("frame_*.jpg"))
         if not existing_frames:
             return None
         logger.info(f"✓ Found {len(existing_frames)} cached frames in {self.frames_dir.name}/")
         # Build frames_info from existing files
         return [(str(frame_path), self._parse_timestamp(frame_path)) for frame_path in existing_frames]

     def get_analysis_cache(self, analysis_type: str) -> Optional[List[Dict]]:
         """
         Check for cached analysis results.

         Args:
             analysis_type: 'vision' or 'ocr'

         Returns:
             List of analysis results, or None when caching is disabled, the
             cache file is missing, or the file cannot be parsed
         """
         if not self.use_cache:
             return None
         cache_path = self._analysis_path(analysis_type)
         if not cache_path.exists():
             return None
         logger.info(f"✓ Found cached {analysis_type} analysis: {cache_path.name}")
         try:
             with open(cache_path, 'r', encoding='utf-8') as f:
                 results = json.load(f)
         except (json.JSONDecodeError, UnicodeDecodeError) as exc:
             # A truncated or corrupt file (e.g. from an interrupted run) is
             # treated as a cache miss so the analysis is simply recomputed.
             logger.warning(f"Ignoring unreadable {analysis_type} cache {cache_path.name}: {exc}")
             return None
         logger.info(f"✓ Loaded {len(results)} analyzed frames from cache")
         return results

     def save_analysis(self, analysis_type: str, results: List[Dict]):
         """
         Save analysis results to cache.

         Args:
             analysis_type: 'vision' or 'ocr'
             results: Analysis results to save
         """
         cache_path = self._analysis_path(analysis_type)
         with open(cache_path, 'w', encoding='utf-8') as f:
             json.dump(results, f, indent=2, ensure_ascii=False)
         logger.info(f"✓ Saved {analysis_type} analysis to: {cache_path.name}")

     def cache_exists(self, analysis_type: Optional[str] = None) -> Dict[str, bool]:
         """
         Check what caches exist.

         Args:
             analysis_type: Optional specific analysis type to check; when
                 omitted both 'vision' and 'ocr' are reported

         Returns:
             Dictionary of cache status
         """
         has_frames = self.frames_dir.exists() and any(self.frames_dir.glob("frame_*.jpg"))
         status = {
             "whisper": self._whisper_path().exists(),
             "frames": has_frames,
         }
         reported_types = [analysis_type] if analysis_type else ["vision", "ocr"]
         for kind in reported_types:
             status[kind] = self._analysis_path(kind).exists()
         return status
--- a/meetus/output_manager.py
+++ b/meetus/output_manager.py
@@ -0,0 +1,135 @@
 """
 Manage output directories and manifest files.
 Creates timestamped folders for each video and tracks processing options.
 """
 from pathlib import Path
 from datetime import datetime
 import json
 import logging
 from typing import Dict, Any, Optional
 logger = logging.getLogger(__name__)


 class OutputManager:
     """Manage the per-video output directory and its manifest file.

     Output directories are named ``<YYYYMMDD_HHMMSS>-<video stem>`` and live
     under ``base_output_dir``. With ``use_cache`` enabled the most recent
     existing directory for the same video is reused; otherwise a new
     timestamped directory is created.
     """

     # strftime/strptime format of the timestamp prefix in directory names.
     _TIMESTAMP_FORMAT = "%Y%m%d_%H%M%S"
     # Length of a timestamp rendered with _TIMESTAMP_FORMAT (8 + 1 + 6).
     _TIMESTAMP_LENGTH = 15

     def __init__(self, video_path: Path, base_output_dir: str = "output", use_cache: bool = True):
         """
         Initialize output manager.

         Creates the output directory (or picks an existing one) and its
         ``frames`` subdirectory as a side effect.

         Args:
             video_path: Path to the video file being processed
             base_output_dir: Base directory for all outputs
             use_cache: Whether to use existing directories if found
         """
         self.video_path = video_path
         self.base_output_dir = Path(base_output_dir)
         self.use_cache = use_cache
         # Find or create output directory
         self.output_dir = self._get_or_create_output_dir()
         self.frames_dir = self.output_dir / "frames"
         self.frames_dir.mkdir(exist_ok=True)
         logger.info(f"Output directory: {self.output_dir}")

     def _is_run_dir_for(self, dir_name: str, video_name: str) -> bool:
         """Tell whether *dir_name* is exactly ``<timestamp>-<video_name>``.

         The timestamp never contains a hyphen, so splitting on the first one
         separates it from the video stem even when the stem has hyphens.
         A plain ``endswith`` check would also accept other videos whose stem
         merely ends with ``-<video_name>``.
         """
         timestamp, separator, stem = dir_name.partition("-")
         if not separator or stem != video_name or len(timestamp) != self._TIMESTAMP_LENGTH:
             return False
         try:
             datetime.strptime(timestamp, self._TIMESTAMP_FORMAT)
         except ValueError:
             return False
         return True

     @staticmethod
     def _resolve_flags(config: Dict[str, Any]) -> tuple:
         """Return ``(whisper_enabled, vision_enabled)`` for *config*.

         Accepts both the flat keys (``run_whisper`` / ``use_vision``) and the
         nested layout (``whisper.enabled`` / ``analysis.method``).
         """
         whisper_section = config.get("whisper")
         analysis_section = config.get("analysis")
         whisper_enabled = bool(config.get("run_whisper")) or (
             isinstance(whisper_section, dict) and bool(whisper_section.get("enabled"))
         )
         vision_enabled = bool(config.get("use_vision")) or (
             isinstance(analysis_section, dict) and analysis_section.get("method") == "vision"
         )
         return whisper_enabled, vision_enabled

     def _get_or_create_output_dir(self) -> Path:
         """
         Get existing output directory or create a new timestamped one.

         Returns:
             Path to output directory
         """
         video_name = self.video_path.stem
         # Look for existing directories if caching is enabled
         if self.use_cache and self.base_output_dir.exists():
             existing_dirs = sorted(
                 (
                     d for d in self.base_output_dir.iterdir()
                     if d.is_dir() and self._is_run_dir_for(d.name, video_name)
                 ),
                 reverse=True,  # Timestamp prefix sorts lexicographically: most recent first
             )
             if existing_dirs:
                 logger.info(f"Found existing output: {existing_dirs[0].name}")
                 return existing_dirs[0]
         # Create new timestamped directory
         timestamp = datetime.now().strftime(self._TIMESTAMP_FORMAT)
         dir_name = f"{timestamp}-{video_name}"
         output_dir = self.base_output_dir / dir_name
         output_dir.mkdir(parents=True, exist_ok=True)
         logger.info(f"Created new output directory: {dir_name}")
         return output_dir

     def get_path(self, filename: str) -> Path:
         """Get full path for a file in the output directory."""
         return self.output_dir / filename

     def get_frames_path(self, filename: str) -> Path:
         """Get full path for a file in the frames directory."""
         return self.frames_dir / filename

     def save_manifest(self, config: Dict[str, Any]):
         """
         Save processing configuration to manifest.json.

         Args:
             config: Dictionary of processing options, stored verbatim under
                 ``configuration``. The Whisper and vision flags are also
                 read from it to fill in the ``outputs`` section.
         """
         manifest_path = self.output_dir / "manifest.json"
         stem = self.video_path.stem
         whisper_enabled, vision_enabled = self._resolve_flags(config)
         manifest = {
             "video": {
                 "name": self.video_path.name,
                 "path": str(self.video_path.absolute()),
             },
             "processed_at": datetime.now().isoformat(),
             "configuration": config,
             "outputs": {
                 "frames": str(self.frames_dir.relative_to(self.output_dir)),
                 "enhanced_transcript": f"{stem}_enhanced.txt",
                 "whisper_transcript": f"{stem}.json" if whisper_enabled else None,
                 "analysis": f"{stem}_{'vision' if vision_enabled else 'ocr'}.json"
             }
         }
         with open(manifest_path, 'w', encoding='utf-8') as f:
             json.dump(manifest, f, indent=2, ensure_ascii=False)
         logger.info(f"Saved manifest: {manifest_path}")

     def load_manifest(self) -> Optional[Dict[str, Any]]:
         """
         Load existing manifest if it exists.

         Returns:
             Manifest dictionary or None
         """
         manifest_path = self.output_dir / "manifest.json"
         if manifest_path.exists():
             with open(manifest_path, 'r', encoding='utf-8') as f:
                 return json.load(f)
         return None

     def list_outputs(self) -> Dict[str, Any]:
         """
         List all output files in the directory.

         Returns:
             Dictionary with the output directory, an existence flag per
             known output file, and the number of ``*.jpg`` frames
         """
         video_name = self.video_path.stem
         if self.frames_dir.exists():
             frame_count = sum(1 for _ in self.frames_dir.glob("*.jpg"))
         else:
             frame_count = 0
         return {
             "output_dir": str(self.output_dir),
             "manifest": (self.output_dir / "manifest.json").exists(),
             "enhanced_transcript": (self.output_dir / f"{video_name}_enhanced.txt").exists(),
             "whisper_transcript": (self.output_dir / f"{video_name}.json").exists(),
             "vision_analysis": (self.output_dir / f"{video_name}_vision.json").exists(),
             "ocr_analysis": (self.output_dir / f"{video_name}_ocr.json").exists(),
             "frames": frame_count
         }
--- a/meetus/prompts/code.txt
+++ b/meetus/prompts/code.txt
@@ -0,0 +1,9 @@
 Analyze this code screenshot. Extract:
 1. Programming language
 2. File name or path (if visible)
 3. Code content (preserve exact formatting)
 4. Comments
 5. Function/class names
 6. Any error messages or warnings
 Preserve code exactly as shown.
--- a/meetus/prompts/console.txt
+++ b/meetus/prompts/console.txt
@@ -0,0 +1,8 @@
 Analyze this console/terminal output. Extract:
 1. Commands executed
 2. Output/results
 3. Error messages
 4. Warnings or status messages
 5. File paths or URLs
 Preserve formatting and structure.
--- a/meetus/prompts/dashboard.txt
+++ b/meetus/prompts/dashboard.txt
@@ -0,0 +1,9 @@
 Analyze this dashboard/monitoring panel. Extract:
 1. Panel titles and metrics names
 2. Current values and units
 3. Trends (up/down/stable)
 4. Alerts or warnings
 5. Time ranges shown
 6. Any anomalies or notable patterns
 Format as structured data.
--- a/meetus/prompts/meeting.txt
+++ b/meetus/prompts/meeting.txt
@@ -0,0 +1,10 @@
 Analyze this screen capture from a meeting recording. Extract:
 1. Any visible text (titles, labels, headings)
 2. Key metrics, numbers, or data points shown
 3. Dashboard panels or visualizations (describe what they show)
 4. Code snippets (preserve formatting and context)
 5. Console/terminal output (commands and results)
 6. Application names or UI elements
 Focus on information that would help someone understand what was being discussed.
 Be concise but include all important details. If there's code, preserve it exactly.
--- a/meetus/vision_processor.py
+++ b/meetus/vision_processor.py
@@ -6,6 +6,7 @@ from typing import List, Tuple, Dict, Optional
 from pathlib import Path
 import logging
 from difflib import SequenceMatcher
 import os
 logger = logging.getLogger(__name__)
@@ -13,15 +14,24 @@ logger = logging.getLogger(__name__)
 class VisionProcessor:
    """Process frames using local vision models via Ollama."""
-    def __init__(self, model: str = "llava:13b"):
+    def __init__(self, model: str = "llava:13b", prompts_dir: Optional[str] = None):
        """
        Initialize vision processor.
        Args:
            model: Ollama vision model to use (llava:13b, llava:7b, llava-llama3, bakllava)
            prompts_dir: Directory containing prompt files (default: meetus/prompts/)
        """
        self.model = model
        self._client = None
        # Set prompts directory
        if prompts_dir:
            self.prompts_dir = Path(prompts_dir)
        else:
            # Default to meetus/prompts/ relative to this file
            self.prompts_dir = Path(__file__).parent / "prompts"
        self._init_client()
    def _init_client(self):
@@ -53,6 +63,26 @@ class VisionProcessor:
                "Also install Ollama: https://ollama.ai/download"
            )
    def _load_prompt(self, context: str) -> str:
        """
        Load prompt from file.
        Args:
            context: Context name (meeting, dashboard, code, console)
        Returns:
            Prompt text
        """
        prompt_file = self.prompts_dir / f"{context}.txt"
        if prompt_file.exists():
            with open(prompt_file, 'r', encoding='utf-8') as f:
                return f.read().strip()
        else:
            # Fallback to default prompt
            logger.warning(f"Prompt file not found: {prompt_file}, using default")
            return "Analyze this image and describe what you see in detail."
    def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
        """
        Analyze a single frame using local vision model.
@@ -64,50 +94,8 @@ class VisionProcessor:
        Returns:
            Analyzed content description
        """
-        # Context-specific prompts
+        # Load prompt from file
-        prompts = {
+        prompt = self._load_prompt(context)
            "meeting": """Analyze this screen capture from a meeting recording. Extract:
 1. Any visible text (titles, labels, headings)
 2. Key metrics, numbers, or data points shown
 3. Dashboard panels or visualizations (describe what they show)
 4. Code snippets (preserve formatting and context)
 5. Console/terminal output (commands and results)
 6. Application names or UI elements
 Focus on information that would help someone understand what was being discussed.
 Be concise but include all important details. If there's code, preserve it exactly.""",
            "dashboard": """Analyze this dashboard/monitoring panel. Extract:
 1. Panel titles and metrics names
 2. Current values and units
 3. Trends (up/down/stable)
 4. Alerts or warnings
 5. Time ranges shown
 6. Any anomalies or notable patterns
 Format as structured data.""",
            "code": """Analyze this code screenshot. Extract:
 1. Programming language
 2. File name or path (if visible)
 3. Code content (preserve exact formatting)
 4. Comments
 5. Function/class names
 6. Any error messages or warnings
 Preserve code exactly as shown.""",
            "console": """Analyze this console/terminal output. Extract:
 1. Commands executed
 2. Output/results
 3. Error messages
 4. Warnings or status messages
 5. File paths or URLs
 Preserve formatting and structure."""
        }
        prompt = prompts.get(context, prompts["meeting"])
        try:
            # Use Ollama's chat API with vision
--- a/meetus/workflow.py
+++ b/meetus/workflow.py
@@ -0,0 +1,316 @@
 """
 Orchestrate the video processing workflow.
 Coordinates frame extraction, analysis, and transcript merging.
 """
 from pathlib import Path
 import logging
 import subprocess
 import shutil
 from typing import Dict, Any, Optional
 from .output_manager import OutputManager
 from .cache_manager import CacheManager
 from .frame_extractor import FrameExtractor
 from .ocr_processor import OCRProcessor
 from .vision_processor import VisionProcessor
 from .transcript_merger import TranscriptMerger
 logger = logging.getLogger(__name__)
 class WorkflowConfig:
     """Hold every option that drives one processing run."""

     def __init__(self, **kwargs):
         """
         Populate the configuration from keyword arguments.

         ``video`` is required (a missing key raises KeyError); every other
         option falls back to the default shown below when it is not passed.
         """
         option = kwargs.get

         # Input video and output locations
         self.video_path = Path(kwargs['video'])
         self.transcript_path = option('transcript')
         self.output_dir = option('output_dir', 'output')
         self.custom_output = option('output')

         # Audio transcription (Whisper)
         self.run_whisper = option('run_whisper', False)
         self.whisper_model = option('whisper_model', 'base')

         # How frames are picked from the video
         self.scene_detection = option('scene_detection', False)
         self.interval = option('interval', 5)

         # Frame analysis: vision model or OCR
         self.use_vision = option('use_vision', False)
         self.vision_model = option('vision_model', 'llava:13b')
         self.vision_context = option('vision_context', 'meeting')
         self.ocr_engine = option('ocr_engine', 'tesseract')

         # Run behaviour and output formatting
         self.no_deduplicate = option('no_deduplicate', False)
         self.no_cache = option('no_cache', False)
         self.extract_only = option('extract_only', False)
         self.format = option('format', 'detailed')

     def to_dict(self) -> Dict[str, Any]:
         """
         Describe the configuration as a nested dictionary for the manifest.

         Settings that do not apply to the selected mode (the interval under
         scene detection, vision settings under OCR and vice versa) are
         reported as None.
         """
         if self.scene_detection:
             extraction = {"method": "scene_detection", "interval_seconds": None}
         else:
             extraction = {"method": "interval", "interval_seconds": self.interval}

         if self.use_vision:
             analysis = {
                 "method": "vision",
                 "vision_model": self.vision_model,
                 "vision_context": self.vision_context,
                 "ocr_engine": None,
             }
         else:
             analysis = {
                 "method": "ocr",
                 "vision_model": None,
                 "vision_context": None,
                 "ocr_engine": self.ocr_engine,
             }
         analysis["deduplication"] = not self.no_deduplicate

         whisper = {"enabled": self.run_whisper, "model": self.whisper_model}
         return {
             "whisper": whisper,
             "frame_extraction": extraction,
             "analysis": analysis,
             "output_format": self.format,
         }
 class ProcessingWorkflow:
     """Orchestrate the complete video processing workflow.

     Steps, in order: optional Whisper transcription, frame extraction,
     frame analysis (vision model or OCR), merging with the audio
     transcript, and writing the run manifest. Whisper output, frames and
     analysis results are looked up through the cache manager before being
     recomputed.
     """

     def __init__(self, config: "WorkflowConfig"):
         """
         Initialize workflow.

         Builds the output manager (which creates or reuses the output
         directory) and a cache manager pointing at that directory.

         Args:
             config: Workflow configuration
         """
         self.config = config
         use_cache = not config.no_cache
         self.output_mgr = OutputManager(
             config.video_path,
             config.output_dir,
             use_cache=use_cache
         )
         self.cache_mgr = CacheManager(
             self.output_mgr.output_dir,
             self.output_mgr.frames_dir,
             config.video_path.stem,
             use_cache=use_cache
         )

     def run(self) -> Dict[str, Any]:
         """
         Run the complete processing workflow.

         In extract-only mode the run stops after frame analysis; the
         manifest is still written so the returned ``manifest`` path exists.

         Returns:
             Dictionary with output paths and status

         Raises:
             RuntimeError: If no frames were extracted, or Whisper was
                 requested but is not installed or produced no transcript
             subprocess.CalledProcessError: If the Whisper command fails
             ImportError: Re-raised from the vision/OCR analysis step
         """
         logger.info("=" * 80)
         logger.info("MEETING PROCESSOR")
         logger.info("=" * 80)
         logger.info(f"Video: {self.config.video_path.name}")
         logger.info(f"Analysis: {'Vision Model' if self.config.use_vision else f'OCR ({self.config.ocr_engine})'}")
         if self.config.use_vision:
             logger.info(f"Vision Model: {self.config.vision_model}")
             logger.info(f"Context: {self.config.vision_context}")
         logger.info(f"Frame extraction: {'Scene detection' if self.config.scene_detection else f'Every {self.config.interval}s'}")
         logger.info(f"Caching: {'Disabled' if self.config.no_cache else 'Enabled'}")
         logger.info("=" * 80)

         # Step 0: Whisper transcription
         transcript_path = self._run_whisper()

         # Step 1: Extract frames
         frames_info = self._extract_frames()
         if not frames_info:
             logger.error("No frames extracted")
             raise RuntimeError("Frame extraction failed")

         # Step 2: Analyze frames
         screen_segments = self._analyze_frames(frames_info)

         if self.config.extract_only:
             # Record the configuration for extract-only runs too, so the
             # "manifest" path in the result refers to a file that exists.
             self.output_mgr.save_manifest(self.config.to_dict())
             logger.info("Done! (extract-only mode)")
             return self._build_result(transcript_path, screen_segments)

         # Step 3: Merge with transcript
         enhanced_transcript = self._merge_transcripts(transcript_path, screen_segments)

         # Save manifest
         self.output_mgr.save_manifest(self.config.to_dict())

         # Build final result
         return self._build_result(transcript_path, screen_segments, enhanced_transcript)

     def _run_whisper(self) -> Optional[str]:
         """
         Run Whisper transcription if requested.

         Returns:
             The user-supplied transcript path when Whisper is not requested,
             otherwise the path of the cached or freshly generated transcript

         Raises:
             RuntimeError: If the ``whisper`` executable is not on PATH or
                 the expected transcript file is missing after the run
             subprocess.CalledProcessError: If the Whisper command fails
         """
         if not self.config.run_whisper:
             return self.config.transcript_path

         # Check cache
         cached = self.cache_mgr.get_whisper_cache()
         if cached:
             return str(cached)

         logger.info("=" * 80)
         logger.info("STEP 0: Running Whisper Transcription")
         logger.info("=" * 80)

         # Check if whisper is installed
         if not shutil.which("whisper"):
             logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
             raise RuntimeError("Whisper not installed")

         logger.info(f"Running Whisper transcription (model: {self.config.whisper_model})...")
         logger.info("This may take a few minutes depending on video length...")

         # Run whisper command (argument list, no shell involved)
         cmd = [
             "whisper",
             str(self.config.video_path),
             "--model", self.config.whisper_model,
             "--output_format", "json",
             "--output_dir", str(self.output_mgr.output_dir)
         ]
         try:
             subprocess.run(cmd, check=True, capture_output=True, text=True)
             # The transcript is expected as <video stem>.json in the output dir
             transcript_path = self.output_mgr.get_path(f"{self.config.video_path.stem}.json")
             if transcript_path.exists():
                 logger.info(f"✓ Whisper transcription completed: {transcript_path.name}")
                 logger.info("")
                 return str(transcript_path)
             else:
                 logger.error("Whisper completed but transcript file not found")
                 raise RuntimeError("Whisper output missing")
         except subprocess.CalledProcessError as e:
             logger.error(f"Whisper failed: {e.stderr}")
             raise

     def _extract_frames(self):
         """
         Extract frames from video, reusing cached frames when available.

         Returns:
             The frame list from the cache manager or the frame extractor
         """
         logger.info("Step 1: Extracting frames from video...")

         # Check cache
         cached_frames = self.cache_mgr.get_frames_cache()
         if cached_frames:
             return cached_frames

         # Extract frames
         extractor = FrameExtractor(str(self.config.video_path), str(self.output_mgr.frames_dir))
         if self.config.scene_detection:
             frames_info = extractor.extract_scene_changes()
         else:
             frames_info = extractor.extract_by_interval(self.config.interval)
         logger.info(f"✓ Extracted {len(frames_info)} frames")
         return frames_info

     def _analyze_frames(self, frames_info):
         """
         Analyze frames with vision or OCR, reusing cached results if present.

         Args:
             frames_info: Frames to analyze, as returned by ``_extract_frames``

         Returns:
             Analysis results for the selected method
         """
         analysis_type = 'vision' if self.config.use_vision else 'ocr'

         # Check cache
         cached_analysis = self.cache_mgr.get_analysis_cache(analysis_type)
         if cached_analysis:
             return cached_analysis

         if self.config.use_vision:
             return self._run_vision_analysis(frames_info)
         else:
             return self._run_ocr_analysis(frames_info)

     def _run_vision_analysis(self, frames_info):
         """Run vision analysis on frames and cache the results."""
         logger.info("Step 2: Running vision analysis on extracted frames...")
         try:
             vision = VisionProcessor(model=self.config.vision_model)
             screen_segments = vision.process_frames(
                 frames_info,
                 context=self.config.vision_context,
                 deduplicate=not self.config.no_deduplicate
             )
             logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
             # Cache results
             self.cache_mgr.save_analysis('vision', screen_segments)
             return screen_segments
         except ImportError as e:
             logger.error(f"{e}")
             raise

     def _run_ocr_analysis(self, frames_info):
         """Run OCR analysis on frames and cache the results."""
         logger.info("Step 2: Running OCR on extracted frames...")
         try:
             ocr = OCRProcessor(engine=self.config.ocr_engine)
             screen_segments = ocr.process_frames(
                 frames_info,
                 deduplicate=not self.config.no_deduplicate
             )
             logger.info(f"✓ Processed {len(screen_segments)} frames with OCR")
             # Cache results
             self.cache_mgr.save_analysis('ocr', screen_segments)
             return screen_segments
         except ImportError as e:
             logger.error(f"{e}")
             logger.error(f"To install {self.config.ocr_engine}:")
             logger.error(f"  pip install {self.config.ocr_engine}")
             raise

     def _merge_transcripts(self, transcript_path, screen_segments):
         """
         Merge audio and screen transcripts and save the enhanced transcript.

         Args:
             transcript_path: Path to the audio transcript, or None
             screen_segments: Analysis results for the frames

         Returns:
             Where the enhanced transcript was saved: the custom output
             location if one was configured, otherwise a path inside the
             output directory
         """
         merger = TranscriptMerger()

         # Load audio transcript if available
         audio_segments = []
         if transcript_path:
             logger.info("Step 3: Merging with Whisper transcript...")
             transcript_file = Path(transcript_path)
             if not transcript_file.exists():
                 # A missing transcript is not fatal: fall back to screen content
                 logger.warning(f"Transcript not found: {transcript_path}")
                 logger.info("Proceeding with screen content only...")
             else:
                 audio_segments = merger.load_whisper_transcript(str(transcript_file))
                 logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
         else:
             logger.info("No transcript provided, using screen content only...")

         # Merge and format
         merged = merger.merge_transcripts(audio_segments, screen_segments)
         formatted = merger.format_for_claude(merged, format_style=self.config.format)

         # Save output
         if self.config.custom_output:
             output_path = self.config.custom_output
         else:
             output_path = self.output_mgr.get_path(f"{self.config.video_path.stem}_enhanced.txt")
         merger.save_transcript(formatted, str(output_path))

         logger.info("=" * 80)
         logger.info("✓ PROCESSING COMPLETE!")
         logger.info("=" * 80)
         logger.info(f"Output directory: {self.output_mgr.output_dir}")
         logger.info(f"Enhanced transcript: {Path(output_path).name}")
         logger.info("")
         return output_path

     def _build_result(self, transcript_path=None, screen_segments=None, enhanced_transcript=None):
         """
         Build result dictionary.

         Args:
             transcript_path: Audio transcript path, passed through unchanged
             screen_segments: Analysis results; only their count is reported
             enhanced_transcript: Enhanced transcript location, passed
                 through unchanged

         Returns:
             Dictionary with keys ``output_dir``, ``transcript``,
             ``analysis`` (analysis file name), ``frames_count``,
             ``enhanced_transcript`` and ``manifest``
         """
         return {
             "output_dir": str(self.output_mgr.output_dir),
             "transcript": transcript_path,
             "analysis": f"{self.config.video_path.stem}_{'vision' if self.config.use_vision else 'ocr'}.json",
             "frames_count": len(screen_segments) if screen_segments else 0,
             "enhanced_transcript": enhanced_transcript,
             "manifest": str(self.output_mgr.get_path("manifest.json"))
         }
--- a/process_meeting.py
+++ b/process_meeting.py
@@ -1,34 +1,19 @@
 #!/usr/bin/env python3
 """
 Process meeting recordings to extract audio + screen content.
-Combines Whisper transcripts with OCR from screen shares.
+Combines Whisper transcripts with vision analysis or OCR from screen shares.
 """
 import argparse
 from pathlib import Path
 import sys
 import json
 import logging
 import subprocess
 import shutil
-from meetus.frame_extractor import FrameExtractor
+from meetus.workflow import WorkflowConfig, ProcessingWorkflow
 from meetus.ocr_processor import OCRProcessor
 from meetus.vision_processor import VisionProcessor
 from meetus.transcript_merger import TranscriptMerger
 logger = logging.getLogger(__name__)
 def setup_logging(verbose: bool = False):
-    """
+    """Configure logging for the application."""
    Configure logging for the application.
    Args:
        verbose: If True, set DEBUG level, otherwise INFO
    """
    level = logging.DEBUG if verbose else logging.INFO
    # Configure root logger
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(levelname)s - %(message)s',
@@ -41,58 +26,6 @@ def setup_logging(verbose: bool = False):
    logging.getLogger('paddleocr').setLevel(logging.WARNING)
 def run_whisper(video_path: Path, model: str = "base", output_dir: str = "output") -> Path:
    """
    Transcribe a video by invoking the `whisper` command-line tool.
    The process exits with status 1 when the `whisper` executable cannot be
    found, when the command returns a non-zero status, or when the expected
    JSON file is missing after a successful run.
    Args:
        video_path: Path to video file
        model: Whisper model to use (tiny, base, small, medium, large)
        output_dir: Directory to save output
    Returns:
        Path to generated JSON transcript
    """
    # Guard: nothing can be done without the whisper executable on PATH
    whisper_executable = shutil.which("whisper")
    if not whisper_executable:
        logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
        sys.exit(1)
    logger.info(f"Running Whisper transcription (model: {model})...")
    logger.info("This may take a few minutes depending on video length...")
    whisper_command = ["whisper", str(video_path)]
    whisper_command += ["--model", model]
    whisper_command += ["--output_format", "json"]
    whisper_command += ["--output_dir", output_dir]
    try:
        # Output is captured so stderr is available for the failure message
        subprocess.run(whisper_command, check=True, capture_output=True, text=True)
    except subprocess.CalledProcessError as failure:
        logger.error(f"Whisper failed: {failure.stderr}")
        sys.exit(1)
    # Whisper outputs to <output_dir>/<video_stem>.json
    expected_json = Path(output_dir) / f"{video_path.stem}.json"
    if not expected_json.exists():
        logger.error("Whisper completed but transcript file not found")
        sys.exit(1)
    logger.info(f"✓ Whisper transcription completed: {expected_json}")
    return expected_json
 def main():
    parser = argparse.ArgumentParser(
        description="Extract screen content from meeting recordings and merge with transcripts",
@@ -119,23 +52,23 @@ Examples:
        """
    )
    # Required arguments
    parser.add_argument(
        'video',
        help='Path to video file'
    )
    # Whisper options
    parser.add_argument(
        '--transcript', '-t',
        help='Path to Whisper transcript (JSON or TXT)',
        default=None
    )
    parser.add_argument(
        '--run-whisper',
        action='store_true',
        help='Run Whisper transcription before processing'
    )
    parser.add_argument(
        '--whisper-model',
        choices=['tiny', 'base', 'small', 'medium', 'large'],
@@ -143,56 +76,48 @@ Examples:
        default='base'
    )
    # Output options
    parser.add_argument(
        '--output', '-o',
-        help='Output file for enhanced transcript (default: output/<video>_enhanced.txt)',
+        help='Output file for enhanced transcript (default: auto-generated in output directory)',
        default=None
    )
    parser.add_argument(
        '--output-dir',
-        help='Directory for output files (default: output/)',
+        help='Base directory for outputs (default: output/)',
        default='output'
    )
-    parser.add_argument(
+    # Frame extraction options
        '--frames-dir',
        help='Directory to save extracted frames (default: frames/)',
        default='frames'
    )
    parser.add_argument(
        '--interval',
        type=int,
        help='Extract frame every N seconds (default: 5)',
        default=5
    )
    parser.add_argument(
        '--scene-detection',
        action='store_true',
        help='Use scene detection instead of interval extraction'
    )
    # Analysis options
    parser.add_argument(
        '--ocr-engine',
        choices=['tesseract', 'easyocr', 'paddleocr'],
        help='OCR engine to use (default: tesseract)',
        default='tesseract'
    )
    parser.add_argument(
        '--use-vision',
        action='store_true',
        help='Use local vision model (Ollama) instead of OCR for better context understanding'
    )
    parser.add_argument(
        '--vision-model',
        help='Vision model to use with Ollama (default: llava:13b)',
        default='llava:13b'
    )
    parser.add_argument(
        '--vision-context',
        choices=['meeting', 'dashboard', 'code', 'console'],
@@ -200,24 +125,22 @@ Examples:
        default='meeting'
    )
    # Processing options
    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable caching - reprocess everything even if outputs exist'
    )
    parser.add_argument(
        '--no-deduplicate',
        action='store_true',
        help='Disable text deduplication'
    )
    parser.add_argument(
        '--extract-only',
        action='store_true',
-        help='Only extract frames and OCR, skip transcript merging'
+        help='Only extract frames and analyze, skip transcript merging'
    )
    parser.add_argument(
        '--format',
        choices=['detailed', 'compact'],
@@ -225,6 +148,7 @@ Examples:
        default='detailed'
    )
    # Logging
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
@@ -236,166 +160,38 @@ Examples:
    # Setup logging
    setup_logging(args.verbose)
-    # Validate video path
+    try:
-    video_path = Path(args.video)
+        # Create workflow configuration
-    if not video_path.exists():
+        config = WorkflowConfig(**vars(args))
        logger.error(f"Video file not found: {args.video}")
        sys.exit(1)
-    # Create output directory
+        # Run processing workflow
-    output_dir = Path(args.output_dir)
+        workflow = ProcessingWorkflow(config)
-    output_dir.mkdir(parents=True, exist_ok=True)
+        result = workflow.run()
-    # Set default output path
+        # Print final summary
-    if args.output is None:
+        print("\n" + "=" * 80)
-        args.output = str(output_dir / f"{video_path.stem}_enhanced.txt")
+        print("✓ SUCCESS!")
        print("=" * 80)
        print(f"Output directory: {result['output_dir']}")
        if result.get('enhanced_transcript'):
            print(f"Enhanced transcript ready for AI summarization!")
        print("=" * 80)
-    # Define cache paths
+        return 0
    whisper_cache = output_dir / f"{video_path.stem}.json"
    analysis_cache = output_dir / f"{video_path.stem}_{'vision' if args.use_vision else 'ocr'}.json"
    frames_cache_dir = Path(args.frames_dir)
-    # Check for cached Whisper transcript
+    except FileNotFoundError as e:
-    if args.run_whisper:
+        logging.error(f"File not found: {e}")
-        if not args.no_cache and whisper_cache.exists():
+        return 1
-            logger.info(f"✓ Found cached Whisper transcript: {whisper_cache}")
+    except RuntimeError as e:
-            args.transcript = str(whisper_cache)
+        logging.error(f"Processing failed: {e}")
-        else:
+        return 1
-            logger.info("=" * 80)
+    except KeyboardInterrupt:
-            logger.info("STEP 0: Running Whisper Transcription")
+        logging.warning("\nProcessing interrupted by user")
-            logger.info("=" * 80)
+        return 130
-            transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
+    except Exception as e:
-            args.transcript = str(transcript_path)
+        logging.exception(f"Unexpected error: {e}")
-            logger.info("")
+        return 1
    logger.info("=" * 80)
    logger.info("MEETING PROCESSOR")
    logger.info("=" * 80)
    logger.info(f"Video: {video_path.name}")
    logger.info(f"Analysis: {'Vision Model' if args.use_vision else f'OCR ({args.ocr_engine})'}")
    if args.use_vision:
        logger.info(f"Vision Model: {args.vision_model}")
        logger.info(f"Context: {args.vision_context}")
    logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
    if args.transcript:
        logger.info(f"Transcript: {args.transcript}")
    logger.info(f"Caching: {'Disabled' if args.no_cache else 'Enabled'}")
    logger.info("=" * 80)
    # Step 1: Extract frames (with caching)
    logger.info("Step 1: Extracting frames from video...")
    # Check if frames already exist
    existing_frames = list(frames_cache_dir.glob(f"{video_path.stem}_*.jpg")) if frames_cache_dir.exists() else []
    if not args.no_cache and existing_frames and len(existing_frames) > 0:
        logger.info(f"✓ Found {len(existing_frames)} cached frames in {args.frames_dir}/")
        # Build frames_info from existing files
        frames_info = []
        for frame_path in sorted(existing_frames):
            # Try to extract timestamp from filename (e.g., video_00001_12.34s.jpg)
            try:
                timestamp_str = frame_path.stem.split('_')[-1].rstrip('s')
                timestamp = float(timestamp_str)
            except:
                timestamp = 0.0
            frames_info.append((str(frame_path), timestamp))
    else:
        extractor = FrameExtractor(str(video_path), args.frames_dir)
        if args.scene_detection:
            frames_info = extractor.extract_scene_changes()
        else:
            frames_info = extractor.extract_by_interval(args.interval)
        if not frames_info:
            logger.error("No frames extracted")
            sys.exit(1)
        logger.info(f"✓ Extracted {len(frames_info)} frames")
    # Step 2: Run analysis on frames (with caching)
    if not args.no_cache and analysis_cache.exists():
        logger.info(f"✓ Found cached analysis results: {analysis_cache}")
        with open(analysis_cache, 'r', encoding='utf-8') as f:
            screen_segments = json.load(f)
        logger.info(f"✓ Loaded {len(screen_segments)} analyzed frames from cache")
    else:
        if args.use_vision:
            # Use vision model
            logger.info("Step 2: Running vision analysis on extracted frames...")
            try:
                vision = VisionProcessor(model=args.vision_model)
                screen_segments = vision.process_frames(
                    frames_info,
                    context=args.vision_context,
                    deduplicate=not args.no_deduplicate
                )
                logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
            except ImportError as e:
                logger.error(f"{e}")
                sys.exit(1)
        else:
            # Use OCR
            logger.info("Step 2: Running OCR on extracted frames...")
            try:
                ocr = OCRProcessor(engine=args.ocr_engine)
                screen_segments = ocr.process_frames(
                    frames_info,
                    deduplicate=not args.no_deduplicate
                )
                logger.info(f"✓ Processed {len(screen_segments)} frames with OCR")
            except ImportError as e:
                logger.error(f"{e}")
                logger.error(f"To install {args.ocr_engine}:")
                logger.error(f"  pip install {args.ocr_engine}")
                sys.exit(1)
        # Save analysis results as JSON
        with open(analysis_cache, 'w', encoding='utf-8') as f:
            json.dump(screen_segments, f, indent=2, ensure_ascii=False)
        logger.info(f"✓ Saved analysis results to: {analysis_cache}")
    if args.extract_only:
        logger.info("Done! (extract-only mode)")
        return
    # Step 3: Merge with transcript (if provided)
    merger = TranscriptMerger()
    if args.transcript:
        logger.info("Step 3: Merging with Whisper transcript...")
        transcript_path = Path(args.transcript)
        if not transcript_path.exists():
            logger.warning(f"Transcript not found: {args.transcript}")
            logger.info("Proceeding with screen content only...")
            audio_segments = []
        else:
            audio_segments = merger.load_whisper_transcript(str(transcript_path))
            logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
    else:
        logger.info("No transcript provided, using screen content only...")
        audio_segments = []
    # Merge and format
    merged = merger.merge_transcripts(audio_segments, screen_segments)
    formatted = merger.format_for_claude(merged, format_style=args.format)
    # Save output
    merger.save_transcript(formatted, args.output)
    logger.info("=" * 80)
    logger.info("✓ PROCESSING COMPLETE!")
    logger.info("=" * 80)
    logger.info(f"Enhanced transcript: {args.output}")
    logger.info(f"OCR data: {ocr_output}")
    logger.info(f"Frames: {args.frames_dir}/")
    logger.info("")
    logger.info("You can now use the enhanced transcript with Claude for summarization!")
 if __name__ == '__main__':
-    main()
+    sys.exit(main())