group text

This commit is contained in:
Mariano Gabriel
2025-10-23 14:49:14 -03:00
parent cdf7ad1199
commit c871af2def
3 changed files with 111 additions and 8 deletions

View File

@@ -236,12 +236,25 @@ class ProcessingWorkflow:
logger.info("Step 2: Running vision analysis on extracted frames...")
logger.info(f"Loading vision model {self.config.vision_model} to GPU...")
# Load audio segments for context if transcript exists
audio_segments = []
transcript_path = self.config.transcript_path or self._get_cached_transcript()
if transcript_path:
transcript_file = Path(transcript_path)
if transcript_file.exists():
logger.info("Loading audio transcript for context...")
merger = TranscriptMerger()
audio_segments = merger.load_whisper_transcript(str(transcript_file))
logger.info(f"✓ Loaded {len(audio_segments)} audio segments for context")
try:
vision = VisionProcessor(model=self.config.vision_model)
screen_segments = vision.process_frames(
frames_info,
context=self.config.vision_context,
deduplicate=not self.config.no_deduplicate
deduplicate=not self.config.no_deduplicate,
audio_segments=audio_segments
)
logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
@@ -253,6 +266,11 @@ class ProcessingWorkflow:
logger.error(f"{e}")
raise
def _get_cached_transcript(self) -> Optional[str]:
    """Return the cached Whisper transcript as a string, or None.

    Asks the cache manager for its Whisper cache entry. A falsy entry
    yields None; any other value is converted with ``str()``.
    """
    cache_entry = self.cache_mgr.get_whisper_cache()
    if not cache_entry:
        return None
    return str(cache_entry)
def _run_ocr_analysis(self, frames_info):
"""Run OCR analysis on frames."""
logger.info("Step 2: Running OCR on extracted frames...")
@@ -289,7 +307,8 @@ class ProcessingWorkflow:
logger.warning(f"Transcript not found: {transcript_path}")
logger.info("Proceeding with screen content only...")
else:
audio_segments = merger.load_whisper_transcript(str(transcript_file))
# Group audio into 30-second intervals for cleaner reference timestamps
audio_segments = merger.load_whisper_transcript(str(transcript_file), group_interval=30)
logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
else:
logger.info("No transcript provided, using screen content only...")