From b1e1daf27825972834e3cac6d38fbdfb29647044 Mon Sep 17 00:00:00 2001
From: Mariano Gabriel <pensalo@gmail.com>
Date: Tue, 28 Oct 2025 05:52:31 -0300
Subject: [PATCH] scene detection quality and caching

---
 def/01-scene-detection-quality-caching.md | 80 +++++++++++++++++++++++
 meetus/cache_manager.py                   | 18 +++--
 meetus/frame_extractor.py                 | 53 +++++++++------
 meetus/workflow.py                        | 22 +++++--
 process_meeting.py                        | 25 ++++++-
 requirements.txt                          |  1 +
 6 files changed, 169 insertions(+), 30 deletions(-)
 create mode 100644 def/01-scene-detection-quality-caching.md

diff --git a/def/01-scene-detection-quality-caching.md b/def/01-scene-detection-quality-caching.md
new file mode 100644
index 0000000..3d406f6
--- /dev/null
+++ b/def/01-scene-detection-quality-caching.md
@@ -0,0 +1,80 @@
+# 01 - Scene Detection Sensitivity, Image Quality, and Granular Caching
+
+## Date
+2025-10-28
+
+## Context
+The last run on the zaca-run-scrapers sample (a Zed editor walkthrough) detected only 19 frames, leaving gaps of 7+ minutes between them. Whisper did not run because the `--run-whisper` flag was not passed. JPEG compression quality was too low for code/text to be readable.
+
+## Problems Identified
+1. **Scene detection too conservative** - Default threshold of 30.0 missed file switches and scrolling in a clean UI such as Zed (a busier UI such as VS Code produces larger frame-to-frame changes)
+2. **No whisper transcription** - User expected it to run but `--run-whisper` is opt-in
+3. **Poor JPEG quality** - Default compression made code/text hard to read for OCR/vision
+4. **Subprocess-based FFmpeg** - Invoking the `ffmpeg` CLI through `subprocess.run()` instead of using a Python library
+5. **All-or-nothing caching** - `--no-cache` regenerates everything including slow whisper transcription
+
+## Changes Made
+
+### 1. Scene Detection Sensitivity
+**Files:** `meetus/frame_extractor.py`, `process_meeting.py`, `meetus/workflow.py`
+
+- Lowered default threshold: `30.0` → `15.0` (more sensitive for clean UIs)
+- Added `--scene-threshold` CLI argument (0-100, lower = more sensitive)
+- Added threshold to manifest for tracking
+- Updated docstring with usage guidelines:
+  - 15.0: Good for clean UIs like Zed
+  - 20-30: Busy UIs like VS Code
+  - 5-10: Very subtle changes
+
+### 2. JPEG Quality Improvements
+**Files:** `meetus/frame_extractor.py`
+
+- **Interval extraction**: Added `cv2.IMWRITE_JPEG_QUALITY, 95` (line 60)
+- **Scene detection**: Added `-q:v 2` to FFmpeg (best quality, line 94)
+
+### 3. Migration to ffmpeg-python
+**Files:** `meetus/frame_extractor.py`, `requirements.txt`
+
+- Replaced `subprocess.run()` with `ffmpeg-python` library
+- Cleaner, more Pythonic API
+- Better error handling with `ffmpeg.Error`
+- Added to requirements.txt
+
+### 4. Granular Cache Control
+**Files:** `process_meeting.py`, `meetus/workflow.py`, `meetus/cache_manager.py`
+
+Added three new flags for selective cache invalidation:
+- `--skip-cache-frames`: Regenerate frames (useful when tuning scene threshold)
+- `--skip-cache-whisper`: Rerun whisper transcription
+- `--skip-cache-analysis`: Rerun OCR/vision analysis
+
+**Key design:**
+- `--no-cache`: Still works as before (new directory + regenerate everything)
+- New flags: Reuse existing output directory but selectively invalidate caches
+- Frames are cleaned up when regenerating to avoid stale data
+
+## Typical Workflow
+
+```bash
+# First run - generate everything including whisper (expensive, once)
+python process_meeting.py samples/video.mkv --run-whisper --scene-detection --use-vision
+
+# Iterate on scene threshold without re-running whisper
+python process_meeting.py samples/video.mkv --scene-detection --scene-threshold 10 --use-vision --skip-cache-frames --skip-cache-analysis
+
+# Try even more sensitive
+python process_meeting.py samples/video.mkv --scene-detection --scene-threshold 5 --use-vision --skip-cache-frames --skip-cache-analysis
+```
+
+## Notes
+- Whisper is the most expensive step and its output is already reliable → keep it cached while iterating on frames and analysis
+- Scene detection needs tuning per UI style (Zed vs VS Code)
+- Vision analysis should regenerate when frames change
+- Walking through code (file switches, scrolling) should trigger scene changes
+
+## Files Modified
+- `meetus/frame_extractor.py` - Scene threshold, quality, ffmpeg-python
+- `meetus/workflow.py` - Cache flags, frame cleanup
+- `meetus/cache_manager.py` - Granular cache checks
+- `process_meeting.py` - CLI arguments
+- `requirements.txt` - Added ffmpeg-python
diff --git a/meetus/cache_manager.py b/meetus/cache_manager.py
index 44df8f4..85c2c99 100644
--- a/meetus/cache_manager.py
+++ b/meetus/cache_manager.py
@@ -12,7 +12,9 @@ logger = logging.getLogger(__name__)
 class CacheManager:
     """Manage caching of intermediate processing results."""
 
-    def __init__(self, output_dir: Path, frames_dir: Path, video_name: str, use_cache: bool = True):
+    def __init__(self, output_dir: Path, frames_dir: Path, video_name: str, use_cache: bool = True,
+                 skip_cache_frames: bool = False, skip_cache_whisper: bool = False,
+                 skip_cache_analysis: bool = False):
         """
         Initialize cache manager.
 
@@ -20,12 +22,18 @@ class CacheManager:
             output_dir: Output directory for cached files
             frames_dir: Directory for cached frames
             video_name: Name of the video (stem)
-            use_cache: Whether to use caching
+            use_cache: Whether to use caching globally
+            skip_cache_frames: Skip cached frames specifically
+            skip_cache_whisper: Skip cached whisper specifically
+            skip_cache_analysis: Skip cached analysis specifically
         """
         self.output_dir = output_dir
         self.frames_dir = frames_dir
         self.video_name = video_name
         self.use_cache = use_cache
+        self.skip_cache_frames = skip_cache_frames
+        self.skip_cache_whisper = skip_cache_whisper
+        self.skip_cache_analysis = skip_cache_analysis
 
     def get_whisper_cache(self) -> Optional[Path]:
         """
@@ -34,7 +42,7 @@ class CacheManager:
         Returns:
             Path to cached transcript or None
         """
-        if not self.use_cache:
+        if not self.use_cache or self.skip_cache_whisper:
             return None
 
         cache_path = self.output_dir / f"{self.video_name}.json"
@@ -51,7 +59,7 @@ class CacheManager:
         Returns:
             List of (frame_path, timestamp) tuples or None
         """
-        if not self.use_cache or not self.frames_dir.exists():
+        if not self.use_cache or self.skip_cache_frames or not self.frames_dir.exists():
             return None
 
         existing_frames = list(self.frames_dir.glob("frame_*.jpg"))
@@ -84,7 +92,7 @@ class CacheManager:
         Returns:
             List of analysis results or None
         """
-        if not self.use_cache:
+        if not self.use_cache or self.skip_cache_analysis:
             return None
 
         cache_path = self.output_dir / f"{self.video_name}_{analysis_type}.json"
diff --git a/meetus/frame_extractor.py b/meetus/frame_extractor.py
index 6cf447e..6b71676 100644
--- a/meetus/frame_extractor.py
+++ b/meetus/frame_extractor.py
@@ -6,9 +6,9 @@ import cv2
 import os
 from pathlib import Path
 from typing import List, Tuple, Optional
-import subprocess
 import json
 import logging
+import re
 
 logger = logging.getLogger(__name__)
 
@@ -56,7 +56,8 @@ class FrameExtractor:
                 frame_filename = f"frame_{saved_count:05d}_{timestamp:.2f}s.jpg"
                 frame_path = self.output_dir / frame_filename
 
-                cv2.imwrite(str(frame_path), frame)
+                # Use high quality for text readability (95 = high quality JPEG)
+                cv2.imwrite(str(frame_path), frame, [cv2.IMWRITE_JPEG_QUALITY, 95])
                 frames_info.append((str(frame_path), timestamp))
                 saved_count += 1
 
@@ -66,41 +67,51 @@ class FrameExtractor:
         logger.info(f"Extracted {saved_count} frames at {interval_seconds}s intervals")
         return frames_info
 
-    def extract_scene_changes(self, threshold: float = 30.0) -> List[Tuple[str, float]]:
+    def extract_scene_changes(self, threshold: float = 15.0) -> List[Tuple[str, float]]:
         """
         Extract frames only on scene changes using FFmpeg.
         More efficient than interval-based extraction.
 
         Args:
             threshold: Scene change detection threshold (0-100, lower = more sensitive)
+                      Default: 15.0 (good for clean UIs like Zed)
+                      Higher values (20-30) for busy UIs like VS Code
+                      Lower values (5-10) for very subtle changes
 
         Returns:
             List of (frame_path, timestamp) tuples
         """
+        try:
+            import ffmpeg
+        except ImportError:
+            raise ImportError("ffmpeg-python not installed. Run: pip install ffmpeg-python")
+
         video_name = Path(self.video_path).stem
         output_pattern = self.output_dir / f"{video_name}_%05d.jpg"
 
-        # Use FFmpeg's scene detection filter
-        cmd = [
-            'ffmpeg',
-            '-i', self.video_path,
-            '-vf', f'select=gt(scene\\,{threshold/100}),showinfo',
-            '-vsync', 'vfr',
-            '-frame_pts', '1',
-            str(output_pattern),
-            '-loglevel', 'info'
-        ]
-
         try:
-            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
+            # Use FFmpeg's scene detection filter with high quality output
+            stream = ffmpeg.input(self.video_path)
+            stream = ffmpeg.filter(stream, 'select', f'gt(scene,{threshold/100})')
+            stream = ffmpeg.filter(stream, 'showinfo')
+            stream = ffmpeg.output(
+                stream,
+                str(output_pattern),
+                vsync='vfr',
+                frame_pts=1,
+                **{'q:v': '2'}  # High quality JPEG
+            )
+
+            # Run with stderr capture to get showinfo output
+            _, stderr = ffmpeg.run(stream, capture_stderr=True, overwrite_output=True)
+            stderr = stderr.decode('utf-8')
 
             # Parse FFmpeg output to get frame timestamps from showinfo filter
-            import re
             frames_info = []
 
             # Extract timestamps from stderr (showinfo outputs there)
             timestamp_pattern = r'pts_time:([\d.]+)'
-            timestamps = re.findall(timestamp_pattern, result.stderr)
+            timestamps = re.findall(timestamp_pattern, stderr)
 
             # Match frames to timestamps
             frame_files = sorted(self.output_dir.glob(f"{video_name}_*.jpg"))
@@ -113,11 +124,15 @@ class FrameExtractor:
             logger.info(f"Extracted {len(frames_info)} frames at scene changes")
             return frames_info
 
-        except subprocess.CalledProcessError as e:
-            logger.error(f"FFmpeg error: {e.stderr}")
+        except ffmpeg.Error as e:
+            logger.error(f"FFmpeg error: {e.stderr.decode() if e.stderr else str(e)}")
             # Fallback to interval extraction
             logger.warning("Falling back to interval extraction...")
             return self.extract_by_interval()
+        except Exception as e:
+            logger.error(f"Unexpected error during scene extraction: {e}")
+            logger.warning("Falling back to interval extraction...")
+            return self.extract_by_interval()
 
     def get_video_duration(self) -> float:
         """Get video duration in seconds."""
diff --git a/meetus/workflow.py b/meetus/workflow.py
index b7bb854..b60e3bd 100644
--- a/meetus/workflow.py
+++ b/meetus/workflow.py
@@ -31,10 +31,11 @@ class WorkflowConfig:
 
         # Whisper options
         self.run_whisper = kwargs.get('run_whisper', False)
-        self.whisper_model = kwargs.get('whisper_model', 'base')
+        self.whisper_model = kwargs.get('whisper_model', 'medium')
 
         # Frame extraction
         self.scene_detection = kwargs.get('scene_detection', False)
+        self.scene_threshold = kwargs.get('scene_threshold', 15.0)
         self.interval = kwargs.get('interval', 5)
 
         # Analysis options
@@ -46,6 +47,9 @@ class WorkflowConfig:
         # Processing options
         self.no_deduplicate = kwargs.get('no_deduplicate', False)
         self.no_cache = kwargs.get('no_cache', False)
+        self.skip_cache_frames = kwargs.get('skip_cache_frames', False)
+        self.skip_cache_whisper = kwargs.get('skip_cache_whisper', False)
+        self.skip_cache_analysis = kwargs.get('skip_cache_analysis', False)
         self.extract_only = kwargs.get('extract_only', False)
         self.format = kwargs.get('format', 'detailed')
 
@@ -58,7 +62,8 @@ class WorkflowConfig:
             },
             "frame_extraction": {
                 "method": "scene_detection" if self.scene_detection else "interval",
-                "interval_seconds": self.interval if not self.scene_detection else None
+                "interval_seconds": self.interval if not self.scene_detection else None,
+                "scene_threshold": self.scene_threshold if self.scene_detection else None
             },
             "analysis": {
                 "method": "vision" if self.use_vision else "ocr",
@@ -91,7 +96,10 @@ class ProcessingWorkflow:
             self.output_mgr.output_dir,
             self.output_mgr.frames_dir,
             config.video_path.stem,
-            use_cache=not config.no_cache
+            use_cache=not config.no_cache,
+            skip_cache_frames=config.skip_cache_frames,
+            skip_cache_whisper=config.skip_cache_whisper,
+            skip_cache_analysis=config.skip_cache_analysis
         )
 
     def run(self) -> Dict[str, Any]:
@@ -206,11 +214,17 @@ class ProcessingWorkflow:
         if cached_frames:
             return cached_frames
 
+        # Clean up old frames if regenerating
+        if self.config.skip_cache_frames and self.output_mgr.frames_dir.exists():
+            logger.info("Cleaning up old frames...")
+            for old_frame in self.output_mgr.frames_dir.glob("*.jpg"):
+                old_frame.unlink()
+
         # Extract frames
         extractor = FrameExtractor(str(self.config.video_path), str(self.output_mgr.frames_dir))
 
         if self.config.scene_detection:
-            frames_info = extractor.extract_scene_changes()
+            frames_info = extractor.extract_scene_changes(threshold=self.config.scene_threshold)
         else:
             frames_info = extractor.extract_by_interval(self.config.interval)
 
diff --git a/process_meeting.py b/process_meeting.py
index af01d0c..78dd5ca 100644
--- a/process_meeting.py
+++ b/process_meeting.py
@@ -72,8 +72,8 @@ Examples:
     parser.add_argument(
         '--whisper-model',
         choices=['tiny', 'base', 'small', 'medium', 'large'],
-        help='Whisper model to use (default: base)',
-        default='base'
+        help='Whisper model to use (default: medium)',
+        default='medium'
     )
 
     # Output options
@@ -100,6 +100,12 @@ Examples:
         action='store_true',
         help='Use scene detection instead of interval extraction'
     )
+    parser.add_argument(
+        '--scene-threshold',
+        type=float,
+        help='Scene detection threshold (0-100, lower=more sensitive, default: 15)',
+        default=15.0
+    )
 
     # Analysis options
     parser.add_argument(
@@ -131,6 +137,21 @@ Examples:
         action='store_true',
         help='Disable caching - reprocess everything even if outputs exist'
     )
+    parser.add_argument(
+        '--skip-cache-frames',
+        action='store_true',
+        help='Skip cached frames, re-extract from video (but keep whisper/analysis cache)'
+    )
+    parser.add_argument(
+        '--skip-cache-whisper',
+        action='store_true',
+        help='Skip cached whisper transcript, re-run transcription (but keep frames/analysis cache)'
+    )
+    parser.add_argument(
+        '--skip-cache-analysis',
+        action='store_true',
+        help='Skip cached analysis, re-run OCR/vision (but keep frames/whisper cache)'
+    )
     parser.add_argument(
         '--no-deduplicate',
         action='store_true',
diff --git a/requirements.txt b/requirements.txt
index 497ecc0..89943a5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,7 @@
 # Core dependencies
 opencv-python>=4.8.0
 Pillow>=10.0.0
+ffmpeg-python>=0.2.0
 
 # Vision analysis (recommended for better results)
 # Requires Ollama to be installed: https://ollama.ai/download