""" Manage caching for frames, transcripts, and analysis results. """ from pathlib import Path import json import logging from typing import List, Tuple, Dict, Optional logger = logging.getLogger(__name__) class CacheManager: """Manage caching of intermediate processing results.""" def __init__(self, output_dir: Path, frames_dir: Path, video_name: str, use_cache: bool = True, skip_cache_frames: bool = False, skip_cache_whisper: bool = False, skip_cache_analysis: bool = False): """ Initialize cache manager. Args: output_dir: Output directory for cached files frames_dir: Directory for cached frames video_name: Name of the video (stem) use_cache: Whether to use caching globally skip_cache_frames: Skip cached frames specifically skip_cache_whisper: Skip cached whisper specifically skip_cache_analysis: Skip cached analysis specifically """ self.output_dir = output_dir self.frames_dir = frames_dir self.video_name = video_name self.use_cache = use_cache self.skip_cache_frames = skip_cache_frames self.skip_cache_whisper = skip_cache_whisper self.skip_cache_analysis = skip_cache_analysis def get_whisper_cache(self) -> Optional[Path]: """ Check for cached Whisper transcript. Returns: Path to cached transcript or None """ if not self.use_cache or self.skip_cache_whisper: return None cache_path = self.output_dir / f"{self.video_name}.json" if cache_path.exists(): logger.info(f"✓ Found cached Whisper transcript: {cache_path.name}") # Debug: Show cached transcript info try: import json with open(cache_path, 'r', encoding='utf-8') as f: data = json.load(f) if 'segments' in data: logger.debug(f"Cached transcript has {len(data['segments'])} segments") except Exception as e: logger.debug(f"Could not parse cached whisper for debug: {e}") return cache_path return None def get_frames_cache(self) -> Optional[List[Tuple[str, float]]]: """ Check for cached frames. Returns: List of (frame_path, timestamp) tuples or None """ if not self.use_cache or self.skip_cache_frames or not self.frames_dir.exists(): return None existing_frames = list(self.frames_dir.glob("frame_*.jpg")) if not existing_frames: return None logger.info(f"✓ Found {len(existing_frames)} cached frames in {self.frames_dir.name}/") logger.debug(f"Frame filenames: {[f.name for f in sorted(existing_frames)[:3]]}...") # Build frames_info from existing files frames_info = [] for frame_path in sorted(existing_frames): # Try to extract timestamp from filename (e.g., frame_00001_12.34s.jpg) try: timestamp_str = frame_path.stem.split('_')[-1].rstrip('s') timestamp = float(timestamp_str) except: timestamp = 0.0 frames_info.append((str(frame_path), timestamp)) return frames_info def get_analysis_cache(self, analysis_type: str) -> Optional[List[Dict]]: """ Check for cached analysis results. Args: analysis_type: 'vision' or 'ocr' Returns: List of analysis results or None """ if not self.use_cache or self.skip_cache_analysis: return None cache_path = self.output_dir / f"{self.video_name}_{analysis_type}.json" if cache_path.exists(): logger.info(f"✓ Found cached {analysis_type} analysis: {cache_path.name}") with open(cache_path, 'r', encoding='utf-8') as f: results = json.load(f) logger.info(f"✓ Loaded {len(results)} analyzed frames from cache") # Debug: Show first cached result if results: logger.debug(f"First cached result: timestamp={results[0].get('timestamp')}, text_length={len(results[0].get('text', ''))}") return results return None def save_analysis(self, analysis_type: str, results: List[Dict]): """ Save analysis results to cache. Args: analysis_type: 'vision' or 'ocr' results: Analysis results to save """ cache_path = self.output_dir / f"{self.video_name}_{analysis_type}.json" with open(cache_path, 'w', encoding='utf-8') as f: json.dump(results, f, indent=2, ensure_ascii=False) logger.info(f"✓ Saved {analysis_type} analysis to: {cache_path.name}") def cache_exists(self, analysis_type: Optional[str] = None) -> Dict[str, bool]: """ Check what caches exist. Args: analysis_type: Optional specific analysis type to check Returns: Dictionary of cache status """ status = { "whisper": (self.output_dir / f"{self.video_name}.json").exists(), "frames": len(list(self.frames_dir.glob("frame_*.jpg"))) > 0 if self.frames_dir.exists() else False, } if analysis_type: status[analysis_type] = (self.output_dir / f"{self.video_name}_{analysis_type}.json").exists() else: status["vision"] = (self.output_dir / f"{self.video_name}_vision.json").exists() status["ocr"] = (self.output_dir / f"{self.video_name}_ocr.json").exists() return status