Files
mitus/meetus/cache_manager.py
Mariano Gabriel 118ef04223 embed images
2025-10-28 08:02:45 -03:00

163 lines
5.6 KiB
Python

"""
Manage caching for frames, transcripts, and analysis results.
"""
from pathlib import Path
import json
import logging
from typing import List, Tuple, Dict, Optional
logger = logging.getLogger(__name__)


class CacheManager:
    """Manage caching of intermediate processing results.

    Three kinds of cached artifacts are looked up on disk:

    * the Whisper transcript, stored as ``<output_dir>/<video_name>.json``;
    * extracted frames, stored as ``<frames_dir>/frame_*.jpg``;
    * per-frame analysis results, stored as
      ``<output_dir>/<video_name>_<analysis_type>.json``.

    Every ``get_*`` method returns ``None`` when caching is disabled (globally
    or for that artifact) or when nothing usable is found.
    """

    def __init__(self, output_dir: Path, frames_dir: Path, video_name: str, use_cache: bool = True,
                 skip_cache_frames: bool = False, skip_cache_whisper: bool = False,
                 skip_cache_analysis: bool = False):
        """
        Initialize cache manager.

        Args:
            output_dir: Output directory for cached files
            frames_dir: Directory for cached frames
            video_name: Name of the video (stem)
            use_cache: Whether to use caching globally
            skip_cache_frames: Skip cached frames specifically
            skip_cache_whisper: Skip cached whisper specifically
            skip_cache_analysis: Skip cached analysis specifically
        """
        # Coerce to Path so plain string paths are accepted as well.
        self.output_dir = Path(output_dir)
        self.frames_dir = Path(frames_dir)
        self.video_name = video_name
        self.use_cache = use_cache
        self.skip_cache_frames = skip_cache_frames
        self.skip_cache_whisper = skip_cache_whisper
        self.skip_cache_analysis = skip_cache_analysis

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _whisper_path(self) -> Path:
        """Return the location of the cached Whisper transcript."""
        return self.output_dir / f"{self.video_name}.json"

    def _analysis_path(self, analysis_type: str) -> Path:
        """Return the location of the cached results for *analysis_type*."""
        return self.output_dir / f"{self.video_name}_{analysis_type}.json"

    @staticmethod
    def _parse_timestamp(frame_path: Path) -> float:
        """Extract the timestamp from a name like ``frame_00001_12.34s.jpg``.

        Returns ``0.0`` when the last ``_``-separated part of the stem is not
        a number.
        """
        try:
            return float(frame_path.stem.split('_')[-1].rstrip('s'))
        except ValueError:
            return 0.0

    @staticmethod
    def _log_whisper_summary(cache_path: Path) -> None:
        """Best-effort debug logging of the cached transcript's segment count."""
        try:
            with open(cache_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            if isinstance(data, dict) and 'segments' in data:
                logger.debug(f"Cached transcript has {len(data['segments'])} segments")
        except (OSError, ValueError, TypeError) as e:
            # Purely informational: a parse failure must not hide the cache hit.
            logger.debug(f"Could not parse cached whisper for debug: {e}")

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    def get_whisper_cache(self) -> Optional[Path]:
        """
        Check for cached Whisper transcript.

        Returns:
            Path to cached transcript or None
        """
        if not self.use_cache or self.skip_cache_whisper:
            return None

        cache_path = self._whisper_path()
        if not cache_path.exists():
            return None

        logger.info(f"✓ Found cached Whisper transcript: {cache_path.name}")
        # Only pay for parsing the transcript when the debug line will be emitted.
        if logger.isEnabledFor(logging.DEBUG):
            self._log_whisper_summary(cache_path)
        return cache_path

    def get_frames_cache(self) -> Optional[List[Tuple[str, float]]]:
        """
        Check for cached frames.

        Returns:
            List of (frame_path, timestamp) tuples sorted by path, or None
        """
        if not self.use_cache or self.skip_cache_frames or not self.frames_dir.exists():
            return None

        frame_files = sorted(self.frames_dir.glob("frame_*.jpg"))
        if not frame_files:
            return None

        logger.info(f"✓ Found {len(frame_files)} cached frames in {self.frames_dir.name}/")
        logger.debug(f"Frame filenames: {[f.name for f in frame_files[:3]]}...")

        # Rebuild the (path, timestamp) list from the file names on disk.
        return [(str(frame_path), self._parse_timestamp(frame_path)) for frame_path in frame_files]

    def get_analysis_cache(self, analysis_type: str) -> Optional[List[Dict]]:
        """
        Check for cached analysis results.

        A cache file that cannot be read, is not valid JSON, or does not hold
        a JSON list is treated as a cache miss (a warning is logged).

        Args:
            analysis_type: 'vision' or 'ocr'

        Returns:
            List of analysis results or None
        """
        if not self.use_cache or self.skip_cache_analysis:
            return None

        cache_path = self._analysis_path(analysis_type)
        if not cache_path.exists():
            return None

        logger.info(f"✓ Found cached {analysis_type} analysis: {cache_path.name}")
        try:
            with open(cache_path, 'r', encoding='utf-8') as f:
                results = json.load(f)
        except (OSError, ValueError) as e:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            logger.warning(f"Ignoring unreadable {analysis_type} cache {cache_path.name}: {e}")
            return None

        if not isinstance(results, list):
            logger.warning(
                f"Ignoring {analysis_type} cache {cache_path.name}: "
                f"expected a list, got {type(results).__name__}"
            )
            return None

        logger.info(f"✓ Loaded {len(results)} analyzed frames from cache")

        # Debug: show the first cached result, tolerating unexpected entry shapes.
        if results and isinstance(results[0], dict):
            first = results[0]
            logger.debug(f"First cached result: timestamp={first.get('timestamp')}, text_length={len(first.get('text') or '')}")
        return results

    def save_analysis(self, analysis_type: str, results: List[Dict]):
        """
        Save analysis results to cache.

        Args:
            analysis_type: 'vision' or 'ocr'
            results: Analysis results to save

        Raises:
            TypeError: If *results* is not JSON-serializable. Nothing is
                written in that case, so an existing cache file is kept.
        """
        # Serialize before touching the file so a serialization error cannot
        # leave a truncated cache behind.
        payload = json.dumps(results, indent=2, ensure_ascii=False)

        cache_path = self._analysis_path(analysis_type)
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        with open(cache_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        logger.info(f"✓ Saved {analysis_type} analysis to: {cache_path.name}")

    def cache_exists(self, analysis_type: Optional[str] = None) -> Dict[str, bool]:
        """
        Check what caches exist.

        This only reports presence on disk; the ``use_cache`` / ``skip_cache_*``
        flags are not consulted.

        Args:
            analysis_type: Optional specific analysis type to check. When
                omitted, both 'vision' and 'ocr' are reported.

        Returns:
            Dictionary of cache status
        """
        status = {
            "whisper": self._whisper_path().exists(),
            # any() stops at the first matching frame instead of listing them all.
            "frames": self.frames_dir.exists() and any(self.frames_dir.glob("frame_*.jpg")),
        }

        analysis_types = [analysis_type] if analysis_type else ["vision", "ocr"]
        for kind in analysis_types:
            status[kind] = self._analysis_path(kind).exists()
        return status