refactor

2025-10-20 00:03:41 -03:00
parent a999bc9093
commit cd7b0aed07
11 changed files with 776 additions and 312 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -2,10 +2,11 @@
 samples/*
 !samples/.gitkeep

-# Output files
+# Output directories (timestamped folders for each video)
 output/*
 !output/.gitkeep

-# Extracted frames
-frames/
+# Python cache
 __pycache__
+*.pyc
+.pytest_cache/
--- a/README.md
+++ b/README.md
@@ -184,22 +184,53 @@ python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --verbo

 ## Output Files

-All output files are saved to the `output/` directory by default:
+Each video gets its own timestamped output directory:

- **`output/<video>_enhanced.txt`** - Enhanced transcript ready for AI summarization
- **`output/<video>.json`** - Whisper transcript (if `--run-whisper` was used)
- **`output/<video>_vision.json`** - Vision analysis results with timestamps (if `--use-vision`)
- **`output/<video>_ocr.json`** - OCR results with timestamps (if using OCR)
- **`frames/`** - Extracted video frames (JPG files)
+```
+output/
+└── 20241019_143022-meeting/
+    ├── manifest.json                    # Processing configuration
+    ├── meeting_enhanced.txt             # Enhanced transcript for AI
+    ├── meeting.json                     # Whisper transcript
+    ├── meeting_vision.json              # Vision analysis results
+    └── frames/                          # Extracted video frames
+        ├── frame_00001_5.00s.jpg
+        ├── frame_00002_10.00s.jpg
+        └── ...
+```
+
+### Manifest File
+
+Each processing run creates a `manifest.json` that tracks:
+- Video information (name, path)
+- Processing timestamp
+- Configuration used (Whisper model, vision settings, etc.)
+- Output file locations
+
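+Example manifest (abridged; the real file also records frame-extraction settings, the output format, and an `outputs` section):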
+```json
+{
+  "video": {
+    "name": "meeting.mkv",
+    "path": "/full/path/to/meeting.mkv"
+  },
+  "processed_at": "2024-10-19T14:30:22",
+  "configuration": {
+    "whisper": {"enabled": true, "model": "base"},
+    "analysis": {"method": "vision", "vision_model": "llava:13b", "vision_context": "code"}
+  }
+}
+```

 ### Caching Behavior

-The tool automatically caches intermediate results to speed up re-runs:
- **Whisper transcript**: Cached as `output/<video>.json`
- **Extracted frames**: Cached in `frames/<video>_*.jpg`
- **Analysis results**: Cached as `output/<video>_vision.json` or `output/<video>_ocr.json`
+The tool automatically reuses the most recent output directory for the same video:
+- **First run**: Creates new timestamped directory (e.g., `20241019_143022-meeting/`)
+- **Subsequent runs**: Reuses the same directory and cached results
+- **Cached items**: Whisper transcript, extracted frames, analysis results
+- **Force new run**: Use `--no-cache` to create a fresh directory

-Re-running with the same video will use cached results unless `--no-cache` is specified.
+This means you can switch between OCR and vision analysis without re-extracting frames; only the analysis step runs again.

 ## Workflow for Meeting Analysis

@@ -310,6 +341,15 @@ Options:
 - **`--vision-context dashboard`**: Extracts metrics, trends, panel names
 - **`--vision-context console`**: Captures commands, output, error messages

+**Customizing Prompts:**
+Prompts are stored as editable text files in `meetus/prompts/`:
+- `meeting.txt` - General meeting analysis
+- `code.txt` - Code screenshot analysis
+- `dashboard.txt` - Dashboard/monitoring analysis
+- `console.txt` - Terminal/console analysis
+
+Just edit these files to customize how the vision model analyzes your frames!
+
 ### Scene Detection vs Interval
 - **Scene detection**: Better for presentations with distinct slides. More efficient.
 - **Interval extraction**: Better for continuous screen sharing (coding, browsing). More thorough.
@@ -386,14 +426,29 @@ sudo apt-get install tesseract-ocr  # Don't forget system package!
 meetus/
 ├── meetus/                     # Main package
 │   ├── __init__.py
+│   ├── workflow.py             # Processing orchestrator
+│   ├── output_manager.py       # Output directory & manifest management
+│   ├── cache_manager.py        # Caching logic
 │   ├── frame_extractor.py      # Video frame extraction
+│   ├── vision_processor.py     # Vision model analysis (Ollama/LLaVA)
 │   ├── ocr_processor.py        # OCR processing
-│   └── transcript_merger.py # Transcript merging
-├── process_meeting.py       # Main CLI script
+│   ├── transcript_merger.py    # Transcript merging
+│   └── prompts/                # Vision analysis prompts (editable!)
+│       ├── meeting.txt         # General meeting analysis
+│       ├── code.txt            # Code screenshot analysis
+│       ├── dashboard.txt       # Dashboard/monitoring analysis
+│       └── console.txt         # Terminal/console analysis
+├── process_meeting.py          # Main CLI script (thin wrapper)
 ├── requirements.txt            # Python dependencies
+├── output/                     # Timestamped output directories
+│   ├── .gitkeep
+│   └── YYYYMMDD_HHMMSS-video/  # Auto-generated per video
+├── samples/                    # Sample videos (gitignored)
 └── README.md                   # This file
 ```

+The code is modular and easy to extend - each module has a single responsibility.
+
 ## License

 For personal use.
--- a/meetus/cache_manager.py
+++ b/meetus/cache_manager.py
@@ -0,0 +1,137 @@
+"""
+Manage caching for frames, transcripts, and analysis results.
+"""
+from pathlib import Path
+import json
+import logging
+from typing import List, Tuple, Dict, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class CacheManager:
+    """Manage caching of intermediate processing results."""
+
+    def __init__(self, output_dir: Path, frames_dir: Path, video_name: str, use_cache: bool = True):
+        """
+        Set up a cache manager for one video's output directory.
+
+        Args:
+            output_dir: Directory holding the cached transcript/analysis JSON files
+            frames_dir: Directory holding the cached frame images
+            video_name: Video file stem, used as the prefix of cache file names
+            use_cache: When False, the get_* lookups always report a miss
+        """
+        self.use_cache = use_cache
+        self.video_name = video_name
+        self.output_dir, self.frames_dir = output_dir, frames_dir
+
+    def get_whisper_cache(self) -> Optional[Path]:
+        """
+        Look up a previously generated Whisper transcript.
+
+        Returns:
+            Path to `<video_name>.json` in the output directory, or None when
+            caching is disabled or the file does not exist
+        """
+        if self.use_cache:
+            candidate = self.output_dir / f"{self.video_name}.json"
+            if candidate.exists():
+                logger.info(f"✓ Found cached Whisper transcript: {candidate.name}")
+                return candidate
+
+        return None
+
+    def get_frames_cache(self) -> Optional[List[Tuple[str, float]]]:
+        """
+        Check for cached frames.
+
+        Frames are expected to be named like `frame_00001_12.34s.jpg`; the
+        timestamp is recovered from the last underscore-separated field of
+        the file stem.
+
+        Returns:
+            List of (frame_path, timestamp) tuples sorted by path, or None when
+            caching is disabled, the frames directory is missing, or it holds
+            no `frame_*.jpg` files
+        """
+        if not self.use_cache or not self.frames_dir.exists():
+            return None
+
+        existing_frames = sorted(self.frames_dir.glob("frame_*.jpg"))
+        if not existing_frames:
+            return None
+
+        logger.info(f"✓ Found {len(existing_frames)} cached frames in {self.frames_dir.name}/")
+
+        # Build frames_info from existing files
+        frames_info = []
+        for frame_path in existing_frames:
+            try:
+                timestamp = float(frame_path.stem.split('_')[-1].rstrip('s'))
+            except ValueError:
+                # Unparseable name: keep the frame but fall back to 0.0.
+                # Only ValueError is caught so Ctrl-C and genuine bugs still surface.
+                timestamp = 0.0
+            frames_info.append((str(frame_path), timestamp))
+
+        return frames_info
+
+    def get_analysis_cache(self, analysis_type: str) -> Optional[List[Dict]]:
+        """
+        Load previously saved analysis results, if any.
+
+        Args:
+            analysis_type: 'vision' or 'ocr'
+
+        Returns:
+            Parsed contents of `<video_name>_<analysis_type>.json`, or None when
+            caching is disabled or the file does not exist
+        """
+        if not self.use_cache:
+            return None
+
+        cached_file = self.output_dir / f"{self.video_name}_{analysis_type}.json"
+        if not cached_file.exists():
+            return None
+
+        logger.info(f"✓ Found cached {analysis_type} analysis: {cached_file.name}")
+        with cached_file.open('r', encoding='utf-8') as handle:
+            segments = json.load(handle)
+        logger.info(f"✓ Loaded {len(segments)} analyzed frames from cache")
+        return segments
+
+    def save_analysis(self, analysis_type: str, results: List[Dict]):
+        """
+        Write analysis results to `<video_name>_<analysis_type>.json`.
+
+        The file is written regardless of `use_cache`.
+
+        Args:
+            analysis_type: 'vision' or 'ocr'
+            results: Analysis results to serialize as indented UTF-8 JSON
+        """
+        target = self.output_dir / f"{self.video_name}_{analysis_type}.json"
+
+        with target.open('w', encoding='utf-8') as handle:
+            json.dump(results, handle, indent=2, ensure_ascii=False)
+
+        logger.info(f"✓ Saved {analysis_type} analysis to: {target.name}")
+
+    def cache_exists(self, analysis_type: Optional[str] = None) -> Dict[str, bool]:
+        """
+        Report which cache artifacts are present on disk.
+
+        This inspects the filesystem only; it does not consult `use_cache`.
+
+        Args:
+            analysis_type: When given, report only this analysis type instead of
+                both 'vision' and 'ocr'
+
+        Returns:
+            Mapping with 'whisper' and 'frames' keys plus one key per analysis type
+        """
+        stem = self.video_name
+        has_frames = self.frames_dir.exists() and any(self.frames_dir.glob("frame_*.jpg"))
+
+        status = {
+            "whisper": (self.output_dir / f"{stem}.json").exists(),
+            "frames": has_frames,
+        }
+
+        kinds = [analysis_type] if analysis_type else ["vision", "ocr"]
+        for kind in kinds:
+            status[kind] = (self.output_dir / f"{stem}_{kind}.json").exists()
+
+        return status
--- a/meetus/output_manager.py
+++ b/meetus/output_manager.py
@@ -0,0 +1,135 @@
+"""
+Manage output directories and manifest files.
+Creates timestamped folders for each video and tracks processing options.
+"""
+from pathlib import Path
+from datetime import datetime
+import json
+import logging
+from typing import Dict, Any, Optional
+
+logger = logging.getLogger(__name__)
+
+
+class OutputManager:
+    """Manage output directories and manifest files for video processing."""
+
+    def __init__(self, video_path: Path, base_output_dir: str = "output", use_cache: bool = True):
+        """
+        Resolve the output directory for a video and ensure `frames/` exists.
+
+        Args:
+            video_path: Path to the video file being processed
+            base_output_dir: Parent directory that holds every per-video folder
+            use_cache: When True, an existing folder for this video may be reused
+        """
+        self.use_cache = use_cache
+        self.base_output_dir = Path(base_output_dir)
+        self.video_path = video_path
+
+        # Reuse a previous run's directory when allowed, otherwise make a new one
+        self.output_dir = self._get_or_create_output_dir()
+
+        frames_subdir = self.output_dir / "frames"
+        frames_subdir.mkdir(exist_ok=True)
+        self.frames_dir = frames_subdir
+
+        logger.info(f"Output directory: {self.output_dir}")
+
+    def _get_or_create_output_dir(self) -> Path:
+        """
+        Get existing output directory or create a new timestamped one.
+
+        A directory is reused only when its name is exactly
+        `<YYYYMMDD_HHMMSS>-<video stem>`. Matching on the `-<stem>` suffix alone
+        would let `meeting.mkv` pick up the outputs of `team-meeting.mkv`.
+
+        Returns:
+            Path to output directory
+        """
+        video_name = self.video_path.stem
+        stamp_format = "%Y%m%d_%H%M%S"
+        stamp_length = len("YYYYMMDD_HHMMSS")
+
+        def is_run_dir_for_video(candidate: Path) -> bool:
+            # The timestamp never contains '-', so the first '-' separates it
+            # from the video stem (which may itself contain hyphens).
+            stamp, sep, stem = candidate.name.partition("-")
+            if not sep or stem != video_name or len(stamp) != stamp_length:
+                return False
+            try:
+                datetime.strptime(stamp, stamp_format)
+            except ValueError:
+                return False
+            return candidate.is_dir()
+
+        # Look for existing directories if caching is enabled
+        if self.use_cache and self.base_output_dir.exists():
+            # Fixed-width timestamps sort chronologically as plain strings,
+            # so a reverse name sort puts the most recent run first.
+            existing_dirs = sorted(
+                (d for d in self.base_output_dir.iterdir() if is_run_dir_for_video(d)),
+                reverse=True,
+            )
+
+            if existing_dirs:
+                logger.info(f"Found existing output: {existing_dirs[0].name}")
+                return existing_dirs[0]
+
+        # Create new timestamped directory
+        timestamp = datetime.now().strftime(stamp_format)
+        dir_name = f"{timestamp}-{video_name}"
+        output_dir = self.base_output_dir / dir_name
+        output_dir.mkdir(parents=True, exist_ok=True)
+        logger.info(f"Created new output directory: {dir_name}")
+
+        return output_dir
+
+    def get_path(self, filename: str) -> Path:
+        """Return `filename` resolved inside this video's output directory."""
+        return self.output_dir.joinpath(filename)
+
+    def get_frames_path(self, filename: str) -> Path:
+        """Return `filename` resolved inside this video's frames directory."""
+        return self.frames_dir.joinpath(filename)
+
+    def save_manifest(self, config: Dict[str, Any]):
+        """
+        Save processing configuration to manifest.json.
+
+        The `outputs` section is derived from the configuration. Both config
+        shapes are understood: flat keys (`run_whisper`, `use_vision`) and the
+        nested layout with `whisper.enabled` and `analysis.method`. Reading only
+        the flat keys would report no Whisper transcript and an OCR analysis
+        file for every nested config.
+
+        Args:
+            config: Dictionary of processing options (stored verbatim under
+                `configuration`)
+        """
+        manifest_path = self.output_dir / "manifest.json"
+        stem = self.video_path.stem
+
+        def section(name: str) -> Dict[str, Any]:
+            # Tolerate a missing or non-dict section instead of raising
+            value = config.get(name)
+            return value if isinstance(value, dict) else {}
+
+        whisper_enabled = bool(config.get("run_whisper") or section("whisper").get("enabled"))
+        use_vision = bool(config.get("use_vision") or section("analysis").get("method") == "vision")
+
+        manifest = {
+            "video": {
+                "name": self.video_path.name,
+                "path": str(self.video_path.absolute()),
+            },
+            "processed_at": datetime.now().isoformat(),
+            "configuration": config,
+            "outputs": {
+                "frames": str(self.frames_dir.relative_to(self.output_dir)),
+                "enhanced_transcript": f"{stem}_enhanced.txt",
+                "whisper_transcript": f"{stem}.json" if whisper_enabled else None,
+                "analysis": f"{stem}_{'vision' if use_vision else 'ocr'}.json"
+            }
+        }
+
+        with open(manifest_path, 'w', encoding='utf-8') as f:
+            json.dump(manifest, f, indent=2, ensure_ascii=False)
+
+        logger.info(f"Saved manifest: {manifest_path}")
+
+    def load_manifest(self) -> Optional[Dict[str, Any]]:
+        """
+        Read `manifest.json` from the output directory.
+
+        Returns:
+            Parsed manifest, or None when no manifest file exists
+        """
+        manifest_file = self.output_dir / "manifest.json"
+        if not manifest_file.exists():
+            return None
+
+        with manifest_file.open('r', encoding='utf-8') as handle:
+            return json.load(handle)
+
+    def list_outputs(self) -> Dict[str, Any]:
+        """
+        Summarize which outputs exist for this video.
+
+        Returns:
+            Dictionary with the output directory path, an existence flag for
+            each known output file, and the number of `*.jpg` files in the
+            frames directory (0 when that directory is missing)
+        """
+        root = self.output_dir
+        stem = self.video_path.stem
+
+        def present(filename: str) -> bool:
+            return (root / filename).exists()
+
+        if self.frames_dir.exists():
+            frame_count = sum(1 for _ in self.frames_dir.glob("*.jpg"))
+        else:
+            frame_count = 0
+
+        return {
+            "output_dir": str(root),
+            "manifest": present("manifest.json"),
+            "enhanced_transcript": present(f"{stem}_enhanced.txt"),
+            "whisper_transcript": present(f"{stem}.json"),
+            "vision_analysis": present(f"{stem}_vision.json"),
+            "ocr_analysis": present(f"{stem}_ocr.json"),
+            "frames": frame_count
+        }
--- a/meetus/prompts/code.txt
+++ b/meetus/prompts/code.txt
@@ -0,0 +1,9 @@
+Analyze this code screenshot. Extract:
+1. Programming language
+2. File name or path (if visible)
+3. Code content (preserve exact formatting)
+4. Comments
+5. Function/class names
+6. Any error messages or warnings
+
+Preserve code exactly as shown.
--- a/meetus/prompts/console.txt
+++ b/meetus/prompts/console.txt
@@ -0,0 +1,8 @@
+Analyze this console/terminal output. Extract:
+1. Commands executed
+2. Output/results
+3. Error messages
+4. Warnings or status messages
+5. File paths or URLs
+
+Preserve formatting and structure.
--- a/meetus/prompts/dashboard.txt
+++ b/meetus/prompts/dashboard.txt
@@ -0,0 +1,9 @@
+Analyze this dashboard/monitoring panel. Extract:
+1. Panel titles and metrics names
+2. Current values and units
+3. Trends (up/down/stable)
+4. Alerts or warnings
+5. Time ranges shown
+6. Any anomalies or notable patterns
+
+Format as structured data.
--- a/meetus/prompts/meeting.txt
+++ b/meetus/prompts/meeting.txt
@@ -0,0 +1,10 @@
+Analyze this screen capture from a meeting recording. Extract:
+1. Any visible text (titles, labels, headings)
+2. Key metrics, numbers, or data points shown
+3. Dashboard panels or visualizations (describe what they show)
+4. Code snippets (preserve formatting and context)
+5. Console/terminal output (commands and results)
+6. Application names or UI elements
+
+Focus on information that would help someone understand what was being discussed.
+Be concise but include all important details. If there's code, preserve it exactly.
--- a/meetus/vision_processor.py
+++ b/meetus/vision_processor.py
@@ -6,6 +6,7 @@ from typing import List, Tuple, Dict, Optional
 from pathlib import Path
 import logging
 from difflib import SequenceMatcher
+import os

 logger = logging.getLogger(__name__)

@@ -13,15 +14,24 @@ logger = logging.getLogger(__name__)
 class VisionProcessor:
    """Process frames using local vision models via Ollama."""

-    def __init__(self, model: str = "llava:13b"):
+    def __init__(self, model: str = "llava:13b", prompts_dir: Optional[str] = None):
        """
        Initialize vision processor.

        Args:
            model: Ollama vision model to use (llava:13b, llava:7b, llava-llama3, bakllava)
+            prompts_dir: Directory containing prompt files (default: meetus/prompts/)
        """
        self.model = model
        self._client = None
+
+        # Set prompts directory
+        if prompts_dir:
+            self.prompts_dir = Path(prompts_dir)
+        else:
+            # Default to meetus/prompts/ relative to this file
+            self.prompts_dir = Path(__file__).parent / "prompts"
+
        self._init_client()

    def _init_client(self):
@@ -53,6 +63,26 @@ class VisionProcessor:
                "Also install Ollama: https://ollama.ai/download"
            )

+    def _load_prompt(self, context: str) -> str:
+        """
+        Load prompt from file.
+
+        Lookup order: `<context>.txt`, then `meeting.txt`, then a generic
+        built-in prompt. Falling back to the meeting prompt keeps unknown
+        contexts on a meeting-oriented prompt rather than a one-line generic one.
+        A file that is missing or contains only whitespace is skipped.
+
+        Args:
+            context: Context name (meeting, dashboard, code, console)
+
+        Returns:
+            Prompt text
+        """
+        candidates = [context] if context == "meeting" else [context, "meeting"]
+
+        for name in candidates:
+            prompt_file = self.prompts_dir / f"{name}.txt"
+            try:
+                # Read directly instead of exists()+open(), so a file removed
+                # between the two calls cannot raise.
+                prompt = prompt_file.read_text(encoding='utf-8').strip()
+            except FileNotFoundError:
+                logger.warning(f"Prompt file not found: {prompt_file}")
+                continue
+            if prompt:
+                return prompt
+            # An empty prompt would send the model an image with no instructions
+            logger.warning(f"Prompt file is empty: {prompt_file}")
+
+        # Fallback to default prompt
+        logger.warning("No usable prompt file found, using default")
+        return "Analyze this image and describe what you see in detail."
+
    def analyze_frame(self, image_path: str, context: str = "meeting") -> str:
        """
        Analyze a single frame using local vision model.
@@ -64,50 +94,8 @@ class VisionProcessor:
        Returns:
            Analyzed content description
        """
-        # Context-specific prompts
-        prompts = {
-            "meeting": """Analyze this screen capture from a meeting recording. Extract:
-1. Any visible text (titles, labels, headings)
-2. Key metrics, numbers, or data points shown
-3. Dashboard panels or visualizations (describe what they show)
-4. Code snippets (preserve formatting and context)
-5. Console/terminal output (commands and results)
-6. Application names or UI elements
-
-Focus on information that would help someone understand what was being discussed.
-Be concise but include all important details. If there's code, preserve it exactly.""",
-
-            "dashboard": """Analyze this dashboard/monitoring panel. Extract:
-1. Panel titles and metrics names
-2. Current values and units
-3. Trends (up/down/stable)
-4. Alerts or warnings
-5. Time ranges shown
-6. Any anomalies or notable patterns
-
-Format as structured data.""",
-
-            "code": """Analyze this code screenshot. Extract:
-1. Programming language
-2. File name or path (if visible)
-3. Code content (preserve exact formatting)
-4. Comments
-5. Function/class names
-6. Any error messages or warnings
-
-Preserve code exactly as shown.""",
-
-            "console": """Analyze this console/terminal output. Extract:
-1. Commands executed
-2. Output/results
-3. Error messages
-4. Warnings or status messages
-5. File paths or URLs
-
-Preserve formatting and structure."""
-        }
-
-        prompt = prompts.get(context, prompts["meeting"])
+        # Load prompt from file
+        prompt = self._load_prompt(context)

        try:
            # Use Ollama's chat API with vision
--- a/meetus/workflow.py
+++ b/meetus/workflow.py
@@ -0,0 +1,316 @@
+"""
+Orchestrate the video processing workflow.
+Coordinates frame extraction, analysis, and transcript merging.
+"""
+from pathlib import Path
+import logging
+import subprocess
+import shutil
+from typing import Dict, Any, Optional
+
+from .output_manager import OutputManager
+from .cache_manager import CacheManager
+from .frame_extractor import FrameExtractor
+from .ocr_processor import OCRProcessor
+from .vision_processor import VisionProcessor
+from .transcript_merger import TranscriptMerger
+
+logger = logging.getLogger(__name__)
+
+
+class WorkflowConfig:
+    """Plain container for every option that drives a processing run."""
+
+    def __init__(self, **kwargs):
+        """
+        Populate options from keyword arguments.
+
+        `video` is required (KeyError if absent); every other option falls
+        back to the default given below when its key is missing.
+        """
+        opt = kwargs.get
+
+        # Input and output locations
+        self.video_path = Path(kwargs['video'])
+        self.transcript_path = opt('transcript')
+        self.output_dir = opt('output_dir', 'output')
+        self.custom_output = opt('output')
+
+        # Whisper transcription
+        self.run_whisper = opt('run_whisper', False)
+        self.whisper_model = opt('whisper_model', 'base')
+
+        # How frames are sampled from the video
+        self.scene_detection = opt('scene_detection', False)
+        self.interval = opt('interval', 5)
+
+        # Frame analysis backend
+        self.use_vision = opt('use_vision', False)
+        self.vision_model = opt('vision_model', 'llava:13b')
+        self.vision_context = opt('vision_context', 'meeting')
+        self.ocr_engine = opt('ocr_engine', 'tesseract')
+
+        # Run behaviour and output formatting
+        self.no_deduplicate = opt('no_deduplicate', False)
+        self.no_cache = opt('no_cache', False)
+        self.extract_only = opt('extract_only', False)
+        self.format = opt('format', 'detailed')
+
+    def to_dict(self) -> Dict[str, Any]:
+        """
+        Build the nested dictionary stored in the manifest.
+
+        Settings that do not apply to the selected mode are reported as None
+        (e.g. `ocr_engine` when vision analysis is used).
+        """
+        vision = self.use_vision
+        by_scene = self.scene_detection
+
+        whisper_section = {
+            "enabled": self.run_whisper,
+            "model": self.whisper_model
+        }
+        extraction_section = {
+            "method": "scene_detection" if by_scene else "interval",
+            "interval_seconds": None if by_scene else self.interval
+        }
+        analysis_section = {
+            "method": "vision" if vision else "ocr",
+            "vision_model": self.vision_model if vision else None,
+            "vision_context": self.vision_context if vision else None,
+            "ocr_engine": None if vision else self.ocr_engine,
+            "deduplication": not self.no_deduplicate
+        }
+
+        return {
+            "whisper": whisper_section,
+            "frame_extraction": extraction_section,
+            "analysis": analysis_section,
+            "output_format": self.format
+        }
+
+
+class ProcessingWorkflow:
+    """Orchestrate the complete video processing workflow."""
+
+    def __init__(self, config: WorkflowConfig):
+        """
+        Wire up the output and cache managers for one run.
+
+        Args:
+            config: Workflow configuration; `no_cache` disables cache reuse in
+                both managers
+        """
+        caching_enabled = not config.no_cache
+        self.config = config
+
+        # The output manager decides which directory this run writes into
+        self.output_mgr = OutputManager(config.video_path, config.output_dir, use_cache=caching_enabled)
+
+        # The cache manager looks for earlier results inside that same directory
+        self.cache_mgr = CacheManager(
+            self.output_mgr.output_dir,
+            self.output_mgr.frames_dir,
+            config.video_path.stem,
+            use_cache=caching_enabled
+        )
+
+    def run(self) -> Dict[str, Any]:
+        """
+        Execute every stage: transcription, frame extraction, frame analysis,
+        transcript merging, and manifest writing.
+
+        In extract-only mode the method returns right after frame analysis,
+        before merging and before the manifest is saved.
+
+        Returns:
+            Dictionary with output paths and status
+
+        Raises:
+            RuntimeError: If no frames are available after extraction
+        """
+        cfg = self.config
+        rule = "=" * 80
+
+        analysis_label = 'Vision Model' if cfg.use_vision else f'OCR ({cfg.ocr_engine})'
+        extraction_label = 'Scene detection' if cfg.scene_detection else f'Every {cfg.interval}s'
+        caching_label = 'Disabled' if cfg.no_cache else 'Enabled'
+
+        logger.info(rule)
+        logger.info("MEETING PROCESSOR")
+        logger.info(rule)
+        logger.info(f"Video: {cfg.video_path.name}")
+        logger.info(f"Analysis: {analysis_label}")
+        if cfg.use_vision:
+            logger.info(f"Vision Model: {cfg.vision_model}")
+            logger.info(f"Context: {cfg.vision_context}")
+        logger.info(f"Frame extraction: {extraction_label}")
+        logger.info(f"Caching: {caching_label}")
+        logger.info(rule)
+
+        # Step 0: audio transcript (may be a user-supplied path or None)
+        transcript_path = self._run_whisper()
+
+        # Step 1: frames, either cached or freshly extracted
+        frames_info = self._extract_frames()
+        if not frames_info:
+            logger.error("No frames extracted")
+            raise RuntimeError("Frame extraction failed")
+
+        # Step 2: screen content from the frames
+        screen_segments = self._analyze_frames(frames_info)
+
+        if self.config.extract_only:
+            logger.info("Done! (extract-only mode)")
+            return self._build_result(transcript_path, screen_segments)
+
+        # Step 3: combine audio and screen content, then record the run
+        enhanced_transcript = self._merge_transcripts(transcript_path, screen_segments)
+        self.output_mgr.save_manifest(self.config.to_dict())
+
+        return self._build_result(transcript_path, screen_segments, enhanced_transcript)
+
+    def _run_whisper(self) -> Optional[str]:
+        """
+        Produce the path of the audio transcript to merge.
+
+        When Whisper was not requested, the configured transcript path (possibly
+        None) is returned as-is. Otherwise a cached transcript is preferred, and
+        only if none exists is the `whisper` command-line tool invoked.
+
+        Raises:
+            RuntimeError: If `whisper` is not on PATH, or it finished without
+                producing the expected JSON file
+            subprocess.CalledProcessError: If the `whisper` command fails
+        """
+        cfg = self.config
+        if not cfg.run_whisper:
+            return cfg.transcript_path
+
+        cached = self.cache_mgr.get_whisper_cache()
+        if cached:
+            return str(cached)
+
+        rule = "=" * 80
+        logger.info(rule)
+        logger.info("STEP 0: Running Whisper Transcription")
+        logger.info(rule)
+
+        if not shutil.which("whisper"):
+            logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
+            raise RuntimeError("Whisper not installed")
+
+        logger.info(f"Running Whisper transcription (model: {cfg.whisper_model})...")
+        logger.info("This may take a few minutes depending on video length...")
+
+        # Argument list (no shell), writing the JSON into this run's output directory
+        whisper_cmd = ["whisper", str(cfg.video_path)]
+        whisper_cmd += ["--model", cfg.whisper_model]
+        whisper_cmd += ["--output_format", "json"]
+        whisper_cmd += ["--output_dir", str(self.output_mgr.output_dir)]
+
+        try:
+            subprocess.run(whisper_cmd, check=True, capture_output=True, text=True)
+        except subprocess.CalledProcessError as e:
+            logger.error(f"Whisper failed: {e.stderr}")
+            raise
+
+        produced = self.output_mgr.get_path(f"{cfg.video_path.stem}.json")
+        if not produced.exists():
+            logger.error("Whisper completed but transcript file not found")
+            raise RuntimeError("Whisper output missing")
+
+        logger.info(f"✓ Whisper transcription completed: {produced.name}")
+        logger.info("")
+        return str(produced)
+
+    def _extract_frames(self):
+        """
+        Return frame info for the video, reusing cached frames when present.
+
+        Without a cache hit, frames are extracted into the run's frames
+        directory, by scene change when `scene_detection` is set and at a fixed
+        interval otherwise.
+        """
+        logger.info("Step 1: Extracting frames from video...")
+
+        from_cache = self.cache_mgr.get_frames_cache()
+        if from_cache:
+            return from_cache
+
+        cfg = self.config
+        extractor = FrameExtractor(str(cfg.video_path), str(self.output_mgr.frames_dir))
+        frames_info = (
+            extractor.extract_scene_changes()
+            if cfg.scene_detection
+            else extractor.extract_by_interval(cfg.interval)
+        )
+
+        logger.info(f"✓ Extracted {len(frames_info)} frames")
+        return frames_info
+
+    def _analyze_frames(self, frames_info):
+        """
+        Return screen segments for the frames, preferring cached results.
+
+        The cache is looked up under 'vision' or 'ocr' according to
+        `use_vision`; a missing or empty cached result triggers a fresh
+        analysis with the matching backend.
+        """
+        wants_vision = self.config.use_vision
+        cache_key = 'vision' if wants_vision else 'ocr'
+
+        previous = self.cache_mgr.get_analysis_cache(cache_key)
+        if previous:
+            return previous
+
+        analyze = self._run_vision_analysis if wants_vision else self._run_ocr_analysis
+        return analyze(frames_info)
+
+    def _run_vision_analysis(self, frames_info):
+        """
+        Analyze frames with the configured vision model and cache the result.
+
+        An ImportError raised along the way is logged and re-raised.
+        """
+        logger.info("Step 2: Running vision analysis on extracted frames...")
+
+        cfg = self.config
+        try:
+            processor = VisionProcessor(model=cfg.vision_model)
+            segments = processor.process_frames(
+                frames_info,
+                context=cfg.vision_context,
+                deduplicate=not cfg.no_deduplicate
+            )
+            logger.info(f"✓ Analyzed {len(segments)} frames with vision model")
+
+            # Persist so a later run can skip the analysis
+            self.cache_mgr.save_analysis('vision', segments)
+        except ImportError as e:
+            logger.error(f"{e}")
+            raise
+
+        return segments
+
+    def _run_ocr_analysis(self, frames_info):
+        """
+        Run the configured OCR engine over the frames and cache the result.
+
+        An ImportError raised along the way is logged together with an install
+        hint, then re-raised.
+        """
+        logger.info("Step 2: Running OCR on extracted frames...")
+
+        engine = self.config.ocr_engine
+        try:
+            processor = OCRProcessor(engine=engine)
+            segments = processor.process_frames(
+                frames_info,
+                deduplicate=not self.config.no_deduplicate
+            )
+            logger.info(f"✓ Processed {len(segments)} frames with OCR")
+
+            # Persist so a later run can skip the OCR pass
+            self.cache_mgr.save_analysis('ocr', segments)
+        except ImportError as e:
+            logger.error(f"{e}")
+            logger.error(f"To install {engine}:")
+            logger.error(f"  pip install {engine}")
+            raise
+
+        return segments
+
+    def _merge_transcripts(self, transcript_path, screen_segments):
+        """
+        Merge audio and screen segments and write the enhanced transcript.
+
+        A missing or unset transcript is not an error: the output is then built
+        from screen content alone.
+
+        Returns:
+            The path written to: `custom_output` exactly as configured when it
+            is set, otherwise `<video stem>_enhanced.txt` in the output directory
+        """
+        merger = TranscriptMerger()
+        audio_segments = []
+
+        if not transcript_path:
+            logger.info("No transcript provided, using screen content only...")
+        else:
+            logger.info("Step 3: Merging with Whisper transcript...")
+            transcript_file = Path(transcript_path)
+
+            if transcript_file.exists():
+                audio_segments = merger.load_whisper_transcript(str(transcript_file))
+                logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
+            else:
+                logger.warning(f"Transcript not found: {transcript_path}")
+                logger.info("Proceeding with screen content only...")
+
+        # Interleave both sources, then render in the requested style
+        merged = merger.merge_transcripts(audio_segments, screen_segments)
+        formatted = merger.format_for_claude(merged, format_style=self.config.format)
+
+        # An explicit output path wins over the default location
+        default_name = f"{self.config.video_path.stem}_enhanced.txt"
+        output_path = self.config.custom_output or self.output_mgr.get_path(default_name)
+        merger.save_transcript(formatted, str(output_path))
+
+        rule = "=" * 80
+        logger.info(rule)
+        logger.info("✓ PROCESSING COMPLETE!")
+        logger.info(rule)
+        logger.info(f"Output directory: {self.output_mgr.output_dir}")
+        logger.info(f"Enhanced transcript: {Path(output_path).name}")
+        logger.info("")
+
+        return output_path
+
+    def _build_result(self, transcript_path=None, screen_segments=None, enhanced_transcript=None):
+        """
+        Assemble the summary dictionary returned by `run`.
+
+        `analysis` is a file name derived from the configuration and `manifest`
+        is the expected manifest path; neither is checked for existence here.
+        """
+        analysis_kind = 'vision' if self.config.use_vision else 'ocr'
+        segment_count = len(screen_segments) if screen_segments else 0
+
+        result = {}
+        result["output_dir"] = str(self.output_mgr.output_dir)
+        result["transcript"] = transcript_path
+        result["analysis"] = f"{self.config.video_path.stem}_{analysis_kind}.json"
+        result["frames_count"] = segment_count
+        result["enhanced_transcript"] = enhanced_transcript
+        result["manifest"] = str(self.output_mgr.get_path("manifest.json"))
+        return result
--- a/process_meeting.py
+++ b/process_meeting.py
@@ -1,34 +1,19 @@
 #!/usr/bin/env python3
 """
 Process meeting recordings to extract audio + screen content.
-Combines Whisper transcripts with OCR from screen shares.
+Combines Whisper transcripts with vision analysis or OCR from screen shares.
 """
 import argparse
-from pathlib import Path
 import sys
-import json
 import logging
-import subprocess
-import shutil

-from meetus.frame_extractor import FrameExtractor
-from meetus.ocr_processor import OCRProcessor
-from meetus.vision_processor import VisionProcessor
-from meetus.transcript_merger import TranscriptMerger
-
-logger = logging.getLogger(__name__)
+from meetus.workflow import WorkflowConfig, ProcessingWorkflow


 def setup_logging(verbose: bool = False):
-    """
-    Configure logging for the application.
-
-    Args:
-        verbose: If True, set DEBUG level, otherwise INFO
-    """
+    """Configure logging for the application."""
    level = logging.DEBUG if verbose else logging.INFO

-    # Configure root logger
    logging.basicConfig(
        level=level,
        format='%(asctime)s - %(levelname)s - %(message)s',
@@ -41,58 +26,6 @@ def setup_logging(verbose: bool = False):
    logging.getLogger('paddleocr').setLevel(logging.WARNING)


-def run_whisper(video_path: Path, model: str = "base", output_dir: str = "output") -> Path:
-    """
-    Run Whisper transcription on video file.
-
-    Args:
-        video_path: Path to video file
-        model: Whisper model to use (tiny, base, small, medium, large)
-        output_dir: Directory to save output
-
-    Returns:
-        Path to generated JSON transcript
-    """
-    # Check if whisper is installed
-    if not shutil.which("whisper"):
-        logger.error("Whisper is not installed. Install it with: pip install openai-whisper")
-        sys.exit(1)
-
-    logger.info(f"Running Whisper transcription (model: {model})...")
-    logger.info("This may take a few minutes depending on video length...")
-
-    # Run whisper command
-    cmd = [
-        "whisper",
-        str(video_path),
-        "--model", model,
-        "--output_format", "json",
-        "--output_dir", output_dir
-    ]
-
-    try:
-        result = subprocess.run(
-            cmd,
-            check=True,
-            capture_output=True,
-            text=True
-        )
-
-        # Whisper outputs to <output_dir>/<video_stem>.json
-        transcript_path = Path(output_dir) / f"{video_path.stem}.json"
-
-        if transcript_path.exists():
-            logger.info(f"✓ Whisper transcription completed: {transcript_path}")
-            return transcript_path
-        else:
-            logger.error("Whisper completed but transcript file not found")
-            sys.exit(1)
-
-    except subprocess.CalledProcessError as e:
-        logger.error(f"Whisper failed: {e.stderr}")
-        sys.exit(1)
-
-
 def main():
    parser = argparse.ArgumentParser(
        description="Extract screen content from meeting recordings and merge with transcripts",
@@ -119,23 +52,23 @@ Examples:
        """
    )

+    # Required arguments
    parser.add_argument(
        'video',
        help='Path to video file'
    )

+    # Whisper options
    parser.add_argument(
        '--transcript', '-t',
        help='Path to Whisper transcript (JSON or TXT)',
        default=None
    )
-
    parser.add_argument(
        '--run-whisper',
        action='store_true',
        help='Run Whisper transcription before processing'
    )
-
    parser.add_argument(
        '--whisper-model',
        choices=['tiny', 'base', 'small', 'medium', 'large'],
@@ -143,56 +76,48 @@ Examples:
        default='base'
    )

+    # Output options
    parser.add_argument(
        '--output', '-o',
-        help='Output file for enhanced transcript (default: output/<video>_enhanced.txt)',
+        help='Output file for enhanced transcript (default: auto-generated in output directory)',
        default=None
    )
-
    parser.add_argument(
        '--output-dir',
-        help='Directory for output files (default: output/)',
+        help='Base directory for outputs (default: output/)',
        default='output'
    )

-    parser.add_argument(
-        '--frames-dir',
-        help='Directory to save extracted frames (default: frames/)',
-        default='frames'
-    )
-
+    # Frame extraction options
    parser.add_argument(
        '--interval',
        type=int,
        help='Extract frame every N seconds (default: 5)',
        default=5
    )
-
    parser.add_argument(
        '--scene-detection',
        action='store_true',
        help='Use scene detection instead of interval extraction'
    )

+    # Analysis options
    parser.add_argument(
        '--ocr-engine',
        choices=['tesseract', 'easyocr', 'paddleocr'],
        help='OCR engine to use (default: tesseract)',
        default='tesseract'
    )
-
    parser.add_argument(
        '--use-vision',
        action='store_true',
        help='Use local vision model (Ollama) instead of OCR for better context understanding'
    )
-
    parser.add_argument(
        '--vision-model',
        help='Vision model to use with Ollama (default: llava:13b)',
        default='llava:13b'
    )
-
    parser.add_argument(
        '--vision-context',
        choices=['meeting', 'dashboard', 'code', 'console'],
@@ -200,24 +125,22 @@ Examples:
        default='meeting'
    )

+    # Processing options
    parser.add_argument(
        '--no-cache',
        action='store_true',
        help='Disable caching - reprocess everything even if outputs exist'
    )
-
    parser.add_argument(
        '--no-deduplicate',
        action='store_true',
        help='Disable text deduplication'
    )
-
    parser.add_argument(
        '--extract-only',
        action='store_true',
-        help='Only extract frames and OCR, skip transcript merging'
+        help='Only extract frames and analyze, skip transcript merging'
    )
-
    parser.add_argument(
        '--format',
        choices=['detailed', 'compact'],
@@ -225,6 +148,7 @@ Examples:
        default='detailed'
    )

+    # Logging
    parser.add_argument(
        '--verbose', '-v',
        action='store_true',
@@ -236,166 +160,38 @@ Examples:
    # Setup logging
    setup_logging(args.verbose)

-    # Validate video path
-    video_path = Path(args.video)
-    if not video_path.exists():
-        logger.error(f"Video file not found: {args.video}")
-        sys.exit(1)
-
-    # Create output directory
-    output_dir = Path(args.output_dir)
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    # Set default output path
-    if args.output is None:
-        args.output = str(output_dir / f"{video_path.stem}_enhanced.txt")
-
-    # Define cache paths
-    whisper_cache = output_dir / f"{video_path.stem}.json"
-    analysis_cache = output_dir / f"{video_path.stem}_{'vision' if args.use_vision else 'ocr'}.json"
-    frames_cache_dir = Path(args.frames_dir)
-
-    # Check for cached Whisper transcript
-    if args.run_whisper:
-        if not args.no_cache and whisper_cache.exists():
-            logger.info(f"✓ Found cached Whisper transcript: {whisper_cache}")
-            args.transcript = str(whisper_cache)
-        else:
-            logger.info("=" * 80)
-            logger.info("STEP 0: Running Whisper Transcription")
-            logger.info("=" * 80)
-            transcript_path = run_whisper(video_path, args.whisper_model, str(output_dir))
-            args.transcript = str(transcript_path)
-            logger.info("")
-
-    logger.info("=" * 80)
-    logger.info("MEETING PROCESSOR")
-    logger.info("=" * 80)
-    logger.info(f"Video: {video_path.name}")
-    logger.info(f"Analysis: {'Vision Model' if args.use_vision else f'OCR ({args.ocr_engine})'}")
-    if args.use_vision:
-        logger.info(f"Vision Model: {args.vision_model}")
-        logger.info(f"Context: {args.vision_context}")
-    logger.info(f"Frame extraction: {'Scene detection' if args.scene_detection else f'Every {args.interval}s'}")
-    if args.transcript:
-        logger.info(f"Transcript: {args.transcript}")
-    logger.info(f"Caching: {'Disabled' if args.no_cache else 'Enabled'}")
-    logger.info("=" * 80)
-
-    # Step 1: Extract frames (with caching)
-    logger.info("Step 1: Extracting frames from video...")
-
-    # Check if frames already exist
-    existing_frames = list(frames_cache_dir.glob(f"{video_path.stem}_*.jpg")) if frames_cache_dir.exists() else []
-
-    if not args.no_cache and existing_frames and len(existing_frames) > 0:
-        logger.info(f"✓ Found {len(existing_frames)} cached frames in {args.frames_dir}/")
-        # Build frames_info from existing files
-        frames_info = []
-        for frame_path in sorted(existing_frames):
-            # Try to extract timestamp from filename (e.g., video_00001_12.34s.jpg)
    try:
-                timestamp_str = frame_path.stem.split('_')[-1].rstrip('s')
-                timestamp = float(timestamp_str)
-            except:
-                timestamp = 0.0
-            frames_info.append((str(frame_path), timestamp))
-    else:
-        extractor = FrameExtractor(str(video_path), args.frames_dir)
+        # Create workflow configuration
+        config = WorkflowConfig(**vars(args))

-        if args.scene_detection:
-            frames_info = extractor.extract_scene_changes()
-        else:
-            frames_info = extractor.extract_by_interval(args.interval)
+        # Run processing workflow
+        workflow = ProcessingWorkflow(config)
+        result = workflow.run()

-        if not frames_info:
-            logger.error("No frames extracted")
-            sys.exit(1)
+        # Print the final summary to stdout (plain print, not the logger)
+        print("\n" + "=" * 80)
+        print("✓ SUCCESS!")
+        print("=" * 80)
+        print(f"Output directory: {result['output_dir']}")
+        if result.get('enhanced_transcript'):  # only when the result reports an enhanced transcript
+            print("Enhanced transcript ready for AI summarization!")
+        print("=" * 80)

-        logger.info(f"✓ Extracted {len(frames_info)} frames")
+        return 0

-    # Step 2: Run analysis on frames (with caching)
-    if not args.no_cache and analysis_cache.exists():
-        logger.info(f"✓ Found cached analysis results: {analysis_cache}")
-        with open(analysis_cache, 'r', encoding='utf-8') as f:
-            screen_segments = json.load(f)
-        logger.info(f"✓ Loaded {len(screen_segments)} analyzed frames from cache")
-    else:
-        if args.use_vision:
-            # Use vision model
-            logger.info("Step 2: Running vision analysis on extracted frames...")
-            try:
-                vision = VisionProcessor(model=args.vision_model)
-                screen_segments = vision.process_frames(
-                    frames_info,
-                    context=args.vision_context,
-                    deduplicate=not args.no_deduplicate
-                )
-                logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
-
-            except ImportError as e:
-                logger.error(f"{e}")
-                sys.exit(1)
-        else:
-            # Use OCR
-            logger.info("Step 2: Running OCR on extracted frames...")
-            try:
-                ocr = OCRProcessor(engine=args.ocr_engine)
-                screen_segments = ocr.process_frames(
-                    frames_info,
-                    deduplicate=not args.no_deduplicate
-                )
-                logger.info(f"✓ Processed {len(screen_segments)} frames with OCR")
-
-            except ImportError as e:
-                logger.error(f"{e}")
-                logger.error(f"To install {args.ocr_engine}:")
-                logger.error(f"  pip install {args.ocr_engine}")
-                sys.exit(1)
-
-        # Save analysis results as JSON
-        with open(analysis_cache, 'w', encoding='utf-8') as f:
-            json.dump(screen_segments, f, indent=2, ensure_ascii=False)
-        logger.info(f"✓ Saved analysis results to: {analysis_cache}")
-
-    if args.extract_only:
-        logger.info("Done! (extract-only mode)")
-        return
-
-    # Step 3: Merge with transcript (if provided)
-    merger = TranscriptMerger()
-
-    if args.transcript:
-        logger.info("Step 3: Merging with Whisper transcript...")
-        transcript_path = Path(args.transcript)
-
-        if not transcript_path.exists():
-            logger.warning(f"Transcript not found: {args.transcript}")
-            logger.info("Proceeding with screen content only...")
-            audio_segments = []
-        else:
-            audio_segments = merger.load_whisper_transcript(str(transcript_path))
-            logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
-    else:
-        logger.info("No transcript provided, using screen content only...")
-        audio_segments = []
-
-    # Merge and format
-    merged = merger.merge_transcripts(audio_segments, screen_segments)
-    formatted = merger.format_for_claude(merged, format_style=args.format)
-
-    # Save output
-    merger.save_transcript(formatted, args.output)
-
-    logger.info("=" * 80)
-    logger.info("✓ PROCESSING COMPLETE!")
-    logger.info("=" * 80)
-    logger.info(f"Enhanced transcript: {args.output}")
-    logger.info(f"OCR data: {ocr_output}")
-    logger.info(f"Frames: {args.frames_dir}/")
-    logger.info("")
-    logger.info("You can now use the enhanced transcript with Claude for summarization!")
+    except FileNotFoundError as e:
+        logging.error(f"File not found: {e}")
+        return 1
+    except RuntimeError as e:
+        logging.error(f"Processing failed: {e}")
+        return 1
+    except KeyboardInterrupt:
+        logging.warning("\nProcessing interrupted by user")
+        return 130  # 128 + SIGINT (2): conventional shell exit status for Ctrl-C
+    except Exception as e:
+        logging.exception(f"Unexpected error: {e}")  # logging.exception also records the traceback
+        return 1


 if __name__ == '__main__':
-    main()
+    sys.exit(main())