From b1e1daf27825972834e3cac6d38fbdfb29647044 Mon Sep 17 00:00:00 2001 From: Mariano Gabriel Date: Tue, 28 Oct 2025 05:52:31 -0300 Subject: [PATCH] scene detection quality and caching --- def/01-scene-detection-quality-caching.md | 80 +++++++++++++++++++++++ meetus/cache_manager.py | 18 +++-- meetus/frame_extractor.py | 53 +++++++++------ meetus/workflow.py | 22 +++++-- process_meeting.py | 25 ++++++- requirements.txt | 1 + 6 files changed, 169 insertions(+), 30 deletions(-) create mode 100644 def/01-scene-detection-quality-caching.md diff --git a/def/01-scene-detection-quality-caching.md b/def/01-scene-detection-quality-caching.md new file mode 100644 index 0000000..3d406f6 --- /dev/null +++ b/def/01-scene-detection-quality-caching.md @@ -0,0 +1,80 @@ +# 01 - Scene Detection Sensitivity, Image Quality, and Granular Caching + +## Date +2025-10-28 + +## Context +Last run on zaca-run-scrapers sample (Zed editor walkthrough) only detected 19 frames with 7+ minute gaps. Whisper wasn't running (flag not passed). JPEG compression quality was poor for code/text readability. + +## Problems Identified +1. **Scene detection too conservative** - Default threshold of 30.0 missed file switches and scrolling in clean UI (Zed vs VS Code) +2. **No whisper transcription** - User expected it to run but `--run-whisper` is opt-in +3. **Poor JPEG quality** - Default compression made code/text hard to read for OCR/vision +4. **Subprocess-based FFmpeg** - Using shell commands instead of Python library +5. **All-or-nothing caching** - `--no-cache` regenerates everything including slow whisper transcription + +## Changes Made + +### 1. 
Scene Detection Sensitivity +**Files:** `meetus/frame_extractor.py`, `process_meeting.py`, `meetus/workflow.py` + +- Lowered default threshold: `30.0` → `15.0` (more sensitive for clean UIs) +- Added `--scene-threshold` CLI argument (0-100, lower = more sensitive) +- Added threshold to manifest for tracking +- Updated docstring with usage guidelines: + - 15.0: Good for clean UIs like Zed + - 20-30: Busy UIs like VS Code + - 5-10: Very subtle changes + +### 2. JPEG Quality Improvements +**Files:** `meetus/frame_extractor.py` + +- **Interval extraction**: Added `cv2.IMWRITE_JPEG_QUALITY, 95` (line 60) +- **Scene detection**: Added `-q:v 2` to FFmpeg (best quality, line 94) + +### 3. Migration to ffmpeg-python +**Files:** `meetus/frame_extractor.py`, `requirements.txt` + +- Replaced `subprocess.run()` with `ffmpeg-python` library +- Cleaner, more Pythonic API +- Better error handling with `ffmpeg.Error` +- Added to requirements.txt + +### 4. Granular Cache Control +**Files:** `process_meeting.py`, `meetus/workflow.py`, `meetus/cache_manager.py` + +Added three new flags for selective cache invalidation: +- `--skip-cache-frames`: Regenerate frames (useful when tuning scene threshold) +- `--skip-cache-whisper`: Rerun whisper transcription +- `--skip-cache-analysis`: Rerun OCR/vision analysis + +**Key design:** +- `--no-cache`: Still works as before (new directory + regenerate everything) +- New flags: Reuse existing output directory but selectively invalidate caches +- Frames are cleaned up when regenerating to avoid stale data + +## Typical Workflow + +```bash +# First run - generate everything including whisper (expensive, once) +python process_meeting.py samples/video.mkv --run-whisper --scene-detection --use-vision + +# Iterate on scene threshold without re-running whisper +python process_meeting.py samples/video.mkv --scene-detection --scene-threshold 10 --use-vision --skip-cache-frames --skip-cache-analysis + +# Try even more sensitive +python process_meeting.py 
samples/video.mkv --scene-detection --scene-threshold 5 --use-vision --skip-cache-frames --skip-cache-analysis +``` + +## Notes +- Whisper is the most expensive step and its transcript is unaffected by frame settings → always cache it during iteration +- Scene detection needs tuning per UI style (Zed vs VS Code) +- Vision analysis should regenerate when frames change +- Walking through code (file switches, scrolling) should trigger scene changes + +## Files Modified +- `meetus/frame_extractor.py` - Scene threshold, quality, ffmpeg-python +- `meetus/workflow.py` - Cache flags, frame cleanup, scene threshold config, Whisper default model (`base` → `medium`) +- `meetus/cache_manager.py` - Granular cache checks +- `process_meeting.py` - CLI arguments, Whisper default model (`base` → `medium`) +- `requirements.txt` - Added ffmpeg-python diff --git a/meetus/cache_manager.py b/meetus/cache_manager.py index 44df8f4..85c2c99 100644 --- a/meetus/cache_manager.py +++ b/meetus/cache_manager.py @@ -12,7 +12,9 @@ logger = logging.getLogger(__name__) class CacheManager: """Manage caching of intermediate processing results.""" - def __init__(self, output_dir: Path, frames_dir: Path, video_name: str, use_cache: bool = True): + def __init__(self, output_dir: Path, frames_dir: Path, video_name: str, use_cache: bool = True, + skip_cache_frames: bool = False, skip_cache_whisper: bool = False, + skip_cache_analysis: bool = False): """ Initialize cache manager. 
@@ -20,12 +22,18 @@ class CacheManager: output_dir: Output directory for cached files frames_dir: Directory for cached frames video_name: Name of the video (stem) - use_cache: Whether to use caching + use_cache: Whether to use caching globally + skip_cache_frames: Skip cached frames specifically + skip_cache_whisper: Skip cached whisper specifically + skip_cache_analysis: Skip cached analysis specifically """ self.output_dir = output_dir self.frames_dir = frames_dir self.video_name = video_name self.use_cache = use_cache + self.skip_cache_frames = skip_cache_frames + self.skip_cache_whisper = skip_cache_whisper + self.skip_cache_analysis = skip_cache_analysis def get_whisper_cache(self) -> Optional[Path]: """ @@ -34,7 +42,7 @@ class CacheManager: Returns: Path to cached transcript or None """ - if not self.use_cache: + if not self.use_cache or self.skip_cache_whisper: return None cache_path = self.output_dir / f"{self.video_name}.json" @@ -51,7 +59,7 @@ class CacheManager: Returns: List of (frame_path, timestamp) tuples or None """ - if not self.use_cache or not self.frames_dir.exists(): + if not self.use_cache or self.skip_cache_frames or not self.frames_dir.exists(): return None existing_frames = list(self.frames_dir.glob("frame_*.jpg")) @@ -84,7 +92,7 @@ class CacheManager: Returns: List of analysis results or None """ - if not self.use_cache: + if not self.use_cache or self.skip_cache_analysis: return None cache_path = self.output_dir / f"{self.video_name}_{analysis_type}.json" diff --git a/meetus/frame_extractor.py b/meetus/frame_extractor.py index 6cf447e..6b71676 100644 --- a/meetus/frame_extractor.py +++ b/meetus/frame_extractor.py @@ -6,9 +6,9 @@ import cv2 import os from pathlib import Path from typing import List, Tuple, Optional -import subprocess import json import logging +import re logger = logging.getLogger(__name__) @@ -56,7 +56,8 @@ class FrameExtractor: frame_filename = f"frame_{saved_count:05d}_{timestamp:.2f}s.jpg" frame_path = self.output_dir 
/ frame_filename - cv2.imwrite(str(frame_path), frame) + # Use high quality for text readability (95 = high quality JPEG) + cv2.imwrite(str(frame_path), frame, [cv2.IMWRITE_JPEG_QUALITY, 95]) frames_info.append((str(frame_path), timestamp)) saved_count += 1 @@ -66,41 +67,51 @@ class FrameExtractor: logger.info(f"Extracted {saved_count} frames at {interval_seconds}s intervals") return frames_info - def extract_scene_changes(self, threshold: float = 30.0) -> List[Tuple[str, float]]: + def extract_scene_changes(self, threshold: float = 15.0) -> List[Tuple[str, float]]: """ Extract frames only on scene changes using FFmpeg. More efficient than interval-based extraction. Args: threshold: Scene change detection threshold (0-100, lower = more sensitive) + Default: 15.0 (good for clean UIs like Zed) + Higher values (20-30) for busy UIs like VS Code + Lower values (5-10) for very subtle changes Returns: List of (frame_path, timestamp) tuples """ + try: + import ffmpeg + except ImportError: + raise ImportError("ffmpeg-python not installed. 
Run: pip install ffmpeg-python") + video_name = Path(self.video_path).stem output_pattern = self.output_dir / f"{video_name}_%05d.jpg" - # Use FFmpeg's scene detection filter - cmd = [ - 'ffmpeg', - '-i', self.video_path, - '-vf', f'select=gt(scene\\,{threshold/100}),showinfo', - '-vsync', 'vfr', - '-frame_pts', '1', - str(output_pattern), - '-loglevel', 'info' - ] - try: - result = subprocess.run(cmd, capture_output=True, text=True, check=True) + # Use FFmpeg's scene detection filter with high quality output + stream = ffmpeg.input(self.video_path) + stream = ffmpeg.filter(stream, 'select', f'gt(scene,{threshold/100})') + stream = ffmpeg.filter(stream, 'showinfo') + stream = ffmpeg.output( + stream, + str(output_pattern), + vsync='vfr', + frame_pts=1, + **{'q:v': '2'} # High quality JPEG + ) + + # Run with stderr capture to get showinfo output + _, stderr = ffmpeg.run(stream, capture_stderr=True, overwrite_output=True) + stderr = stderr.decode('utf-8') # Parse FFmpeg output to get frame timestamps from showinfo filter - import re frames_info = [] # Extract timestamps from stderr (showinfo outputs there) timestamp_pattern = r'pts_time:([\d.]+)' - timestamps = re.findall(timestamp_pattern, result.stderr) + timestamps = re.findall(timestamp_pattern, stderr) # Match frames to timestamps frame_files = sorted(self.output_dir.glob(f"{video_name}_*.jpg")) @@ -113,11 +124,15 @@ class FrameExtractor: logger.info(f"Extracted {len(frames_info)} frames at scene changes") return frames_info - except subprocess.CalledProcessError as e: - logger.error(f"FFmpeg error: {e.stderr}") + except ffmpeg.Error as e: + logger.error(f"FFmpeg error: {e.stderr.decode() if e.stderr else str(e)}") # Fallback to interval extraction logger.warning("Falling back to interval extraction...") return self.extract_by_interval() + except Exception as e: + logger.error(f"Unexpected error during scene extraction: {e}") + logger.warning("Falling back to interval extraction...") + return 
self.extract_by_interval() def get_video_duration(self) -> float: """Get video duration in seconds.""" diff --git a/meetus/workflow.py b/meetus/workflow.py index b7bb854..b60e3bd 100644 --- a/meetus/workflow.py +++ b/meetus/workflow.py @@ -31,10 +31,11 @@ class WorkflowConfig: # Whisper options self.run_whisper = kwargs.get('run_whisper', False) - self.whisper_model = kwargs.get('whisper_model', 'base') + self.whisper_model = kwargs.get('whisper_model', 'medium') # Frame extraction self.scene_detection = kwargs.get('scene_detection', False) + self.scene_threshold = kwargs.get('scene_threshold', 15.0) self.interval = kwargs.get('interval', 5) # Analysis options @@ -46,6 +47,9 @@ class WorkflowConfig: # Processing options self.no_deduplicate = kwargs.get('no_deduplicate', False) self.no_cache = kwargs.get('no_cache', False) + self.skip_cache_frames = kwargs.get('skip_cache_frames', False) + self.skip_cache_whisper = kwargs.get('skip_cache_whisper', False) + self.skip_cache_analysis = kwargs.get('skip_cache_analysis', False) self.extract_only = kwargs.get('extract_only', False) self.format = kwargs.get('format', 'detailed') @@ -58,7 +62,8 @@ class WorkflowConfig: }, "frame_extraction": { "method": "scene_detection" if self.scene_detection else "interval", - "interval_seconds": self.interval if not self.scene_detection else None + "interval_seconds": self.interval if not self.scene_detection else None, + "scene_threshold": self.scene_threshold if self.scene_detection else None }, "analysis": { "method": "vision" if self.use_vision else "ocr", @@ -91,7 +96,10 @@ class ProcessingWorkflow: self.output_mgr.output_dir, self.output_mgr.frames_dir, config.video_path.stem, - use_cache=not config.no_cache + use_cache=not config.no_cache, + skip_cache_frames=config.skip_cache_frames, + skip_cache_whisper=config.skip_cache_whisper, + skip_cache_analysis=config.skip_cache_analysis ) def run(self) -> Dict[str, Any]: @@ -206,11 +214,17 @@ class ProcessingWorkflow: if cached_frames: 
return cached_frames + # Clean up old frames if regenerating + if self.config.skip_cache_frames and self.output_mgr.frames_dir.exists(): + logger.info("Cleaning up old frames...") + for old_frame in self.output_mgr.frames_dir.glob("*.jpg"): + old_frame.unlink() + # Extract frames extractor = FrameExtractor(str(self.config.video_path), str(self.output_mgr.frames_dir)) if self.config.scene_detection: - frames_info = extractor.extract_scene_changes() + frames_info = extractor.extract_scene_changes(threshold=self.config.scene_threshold) else: frames_info = extractor.extract_by_interval(self.config.interval) diff --git a/process_meeting.py b/process_meeting.py index af01d0c..78dd5ca 100644 --- a/process_meeting.py +++ b/process_meeting.py @@ -72,8 +72,8 @@ Examples: parser.add_argument( '--whisper-model', choices=['tiny', 'base', 'small', 'medium', 'large'], - help='Whisper model to use (default: base)', - default='base' + help='Whisper model to use (default: medium)', + default='medium' ) # Output options @@ -100,6 +100,12 @@ Examples: action='store_true', help='Use scene detection instead of interval extraction' ) + parser.add_argument( + '--scene-threshold', + type=float, + help='Scene detection threshold (0-100, lower=more sensitive, default: 15)', + default=15.0 + ) # Analysis options parser.add_argument( @@ -131,6 +137,21 @@ Examples: action='store_true', help='Disable caching - reprocess everything even if outputs exist' ) + parser.add_argument( + '--skip-cache-frames', + action='store_true', + help='Skip cached frames, re-extract from video (but keep whisper/analysis cache)' + ) + parser.add_argument( + '--skip-cache-whisper', + action='store_true', + help='Skip cached whisper transcript, re-run transcription (but keep frames/analysis cache)' + ) + parser.add_argument( + '--skip-cache-analysis', + action='store_true', + help='Skip cached analysis, re-run OCR/vision (but keep frames/whisper cache)' + ) parser.add_argument( '--no-deduplicate', action='store_true', 
diff --git a/requirements.txt b/requirements.txt index 497ecc0..89943a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,7 @@ # Core dependencies opencv-python>=4.8.0 Pillow>=10.0.0 +ffmpeg-python>=0.2.0 # Vision analysis (recommended for better results) # Requires Ollama to be installed: https://ollama.ai/download