"""
Manage caching for frames, transcripts, and analysis results.
"""
|
|
from pathlib import Path
|
|
import json
|
|
import logging
|
|
from typing import List, Tuple, Dict, Optional
|
|
|
|
logger = logging.getLogger(__name__)


class CacheManager:
    """Manage caching of intermediate processing results.

    Three kinds of artefacts are looked up on disk:

    * Whisper transcript: ``<output_dir>/<video_name>.json``
    * Extracted frames: ``.jpg`` files inside ``frames_dir``
    * Analysis results: ``<output_dir>/<video_name>_<analysis_type>.json``

    Every ``get_*`` method returns ``None`` on a cache miss, or when caching
    is disabled globally (``use_cache=False``) or for that specific artefact
    (the matching ``skip_cache_*`` flag).
    """

    def __init__(self, output_dir: Path, frames_dir: Path, video_name: str, use_cache: bool = True,
                 skip_cache_frames: bool = False, skip_cache_whisper: bool = False,
                 skip_cache_analysis: bool = False):
        """
        Initialize cache manager.

        Args:
            output_dir: Output directory for cached files
            frames_dir: Directory for cached frames
            video_name: Name of the video (stem)
            use_cache: Whether to use caching globally
            skip_cache_frames: Skip cached frames specifically
            skip_cache_whisper: Skip cached whisper specifically
            skip_cache_analysis: Skip cached analysis specifically
        """
        # Coerce to Path so that plain string paths also work with the
        # ``/`` operator and the ``exists()``/``glob()`` calls used below.
        self.output_dir = Path(output_dir)
        self.frames_dir = Path(frames_dir)
        self.video_name = video_name
        self.use_cache = use_cache
        self.skip_cache_frames = skip_cache_frames
        self.skip_cache_whisper = skip_cache_whisper
        self.skip_cache_analysis = skip_cache_analysis

    def _whisper_cache_path(self) -> Path:
        """Return the path where the Whisper transcript is cached."""
        return self.output_dir / f"{self.video_name}.json"

    def _analysis_cache_path(self, analysis_type: str) -> Path:
        """Return the path where results for ``analysis_type`` are cached."""
        return self.output_dir / f"{self.video_name}_{analysis_type}.json"

    @staticmethod
    def _parse_frame_timestamp(frame_path: Path) -> float:
        """Extract the timestamp from a name like ``frame_00001_12.34s.jpg``.

        The last ``_``-separated part of the stem, minus trailing ``s``
        characters, is parsed as a float. Names that do not parse yield 0.0.
        """
        try:
            return float(frame_path.stem.split('_')[-1].rstrip('s'))
        except ValueError:
            # float() on a str can only fail with ValueError; anything else
            # (e.g. KeyboardInterrupt) must not be swallowed here.
            return 0.0

    def get_whisper_cache(self) -> Optional[Path]:
        """
        Check for cached Whisper transcript.

        The transcript content is not validated; the path is returned as
        soon as the file exists.

        Returns:
            Path to cached transcript or None
        """
        if not self.use_cache or self.skip_cache_whisper:
            return None

        cache_path = self._whisper_cache_path()
        if not cache_path.exists():
            return None

        logger.info(f"✓ Found cached Whisper transcript: {cache_path.name}")

        # The transcript is parsed only to emit a debug line, so skip the
        # read entirely unless debug logging is actually enabled.
        if logger.isEnabledFor(logging.DEBUG):
            try:
                with open(cache_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                if 'segments' in data:
                    logger.debug(f"Cached transcript has {len(data['segments'])} segments")
            except Exception as e:
                # Best-effort diagnostics: never fail the lookup because of it.
                logger.debug(f"Could not parse cached whisper for debug: {e}")

        return cache_path

    def get_frames_cache(self) -> Optional[List[Tuple[str, float]]]:
        """
        Check for cached frames.

        Every ``*.jpg`` file in ``frames_dir`` is treated as a cached frame.
        Frames are returned sorted by path; a frame whose file name carries
        no parsable timestamp gets the timestamp 0.0.

        Returns:
            List of (frame_path, timestamp) tuples or None
        """
        if not self.use_cache or self.skip_cache_frames or not self.frames_dir.exists():
            return None

        # Sort once and reuse the result for both logging and the return value.
        frame_paths = sorted(self.frames_dir.glob("*.jpg"))
        if not frame_paths:
            return None

        logger.info(f"✓ Found {len(frame_paths)} cached frames in {self.frames_dir.name}/")
        logger.debug(f"Frame filenames: {[f.name for f in frame_paths[:3]]}...")

        return [(str(path), self._parse_frame_timestamp(path)) for path in frame_paths]

    def get_analysis_cache(self, analysis_type: str) -> Optional[List[Dict]]:
        """
        Check for cached analysis results.

        A cache file that cannot be read, is not valid JSON, or does not
        contain a JSON list is reported with a warning and treated as a
        cache miss, so the caller can simply recompute the analysis.

        Args:
            analysis_type: 'vision' or 'ocr'

        Returns:
            List of analysis results or None
        """
        if not self.use_cache or self.skip_cache_analysis:
            return None

        cache_path = self._analysis_cache_path(analysis_type)
        if not cache_path.exists():
            return None

        logger.info(f"✓ Found cached {analysis_type} analysis: {cache_path.name}")
        try:
            with open(cache_path, 'r', encoding='utf-8') as f:
                results = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            logger.warning(f"Ignoring unreadable {analysis_type} cache {cache_path.name}: {e}")
            return None

        if not isinstance(results, list):
            logger.warning(
                f"Ignoring malformed {analysis_type} cache {cache_path.name}: "
                f"expected a list, got {type(results).__name__}"
            )
            return None

        logger.info(f"✓ Loaded {len(results)} analyzed frames from cache")

        # Debug: show the first cached result (only if it has the expected shape).
        if results and isinstance(results[0], dict):
            first = results[0]
            logger.debug(
                f"First cached result: timestamp={first.get('timestamp')}, "
                f"text_length={len(first.get('text') or '')}"
            )

        return results

    def save_analysis(self, analysis_type: str, results: List[Dict]) -> None:
        """
        Save analysis results to cache.

        The output directory is created if it does not exist yet. The results
        are serialised before the file is opened, so a serialisation error
        leaves any previously cached file untouched.

        Args:
            analysis_type: 'vision' or 'ocr'
            results: Analysis results to save

        Raises:
            TypeError: If ``results`` contains values that are not JSON serialisable.
            OSError: If the cache file cannot be written.
        """
        cache_path = self._analysis_cache_path(analysis_type)

        # Serialise first: if this raises, nothing on disk has been modified.
        payload = json.dumps(results, indent=2, ensure_ascii=False)

        cache_path.parent.mkdir(parents=True, exist_ok=True)
        cache_path.write_text(payload, encoding='utf-8')

        logger.info(f"✓ Saved {analysis_type} analysis to: {cache_path.name}")

    def cache_exists(self, analysis_type: Optional[str] = None) -> Dict[str, bool]:
        """
        Check what caches exist.

        This only inspects the file system; the ``use_cache`` and
        ``skip_cache_*`` flags are not taken into account.

        Args:
            analysis_type: Optional specific analysis type to check. When
                omitted, both 'vision' and 'ocr' are reported.

        Returns:
            Dictionary of cache status
        """
        # NOTE(review): this checks for ``frame_*.jpg`` while get_frames_cache()
        # loads every ``*.jpg``; confirm which pattern is the intended one.
        frames_cached = self.frames_dir.exists() and any(self.frames_dir.glob("frame_*.jpg"))

        status = {
            "whisper": self._whisper_cache_path().exists(),
            "frames": frames_cached,
        }

        analysis_types = [analysis_type] if analysis_type else ["vision", "ocr"]
        for name in analysis_types:
            status[name] = self._analysis_cache_path(name).exists()

        return status
|