group text
This commit is contained in:
@@ -236,12 +236,25 @@ class ProcessingWorkflow:
|
||||
logger.info("Step 2: Running vision analysis on extracted frames...")
|
||||
logger.info(f"Loading vision model {self.config.vision_model} to GPU...")
|
||||
|
||||
# Load audio segments for context if transcript exists
|
||||
audio_segments = []
|
||||
transcript_path = self.config.transcript_path or self._get_cached_transcript()
|
||||
|
||||
if transcript_path:
|
||||
transcript_file = Path(transcript_path)
|
||||
if transcript_file.exists():
|
||||
logger.info("Loading audio transcript for context...")
|
||||
merger = TranscriptMerger()
|
||||
audio_segments = merger.load_whisper_transcript(str(transcript_file))
|
||||
logger.info(f"✓ Loaded {len(audio_segments)} audio segments for context")
|
||||
|
||||
try:
|
||||
vision = VisionProcessor(model=self.config.vision_model)
|
||||
screen_segments = vision.process_frames(
|
||||
frames_info,
|
||||
context=self.config.vision_context,
|
||||
deduplicate=not self.config.no_deduplicate
|
||||
deduplicate=not self.config.no_deduplicate,
|
||||
audio_segments=audio_segments
|
||||
)
|
||||
logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model")
|
||||
|
||||
@@ -253,6 +266,11 @@ class ProcessingWorkflow:
|
||||
logger.error(f"{e}")
|
||||
raise
|
||||
|
||||
def _get_cached_transcript(self) -> Optional[str]:
    """Return the cached Whisper transcript as a string, or None.

    Queries ``self.cache_mgr.get_whisper_cache()``. A falsy result is
    reported as ``None``; any other result is converted with ``str``.
    """
    cached_entry = self.cache_mgr.get_whisper_cache()
    if not cached_entry:
        return None
    return str(cached_entry)
|
||||
|
||||
def _run_ocr_analysis(self, frames_info):
|
||||
"""Run OCR analysis on frames."""
|
||||
logger.info("Step 2: Running OCR on extracted frames...")
|
||||
@@ -289,7 +307,8 @@ class ProcessingWorkflow:
|
||||
logger.warning(f"Transcript not found: {transcript_path}")
|
||||
logger.info("Proceeding with screen content only...")
|
||||
else:
|
||||
audio_segments = merger.load_whisper_transcript(str(transcript_file))
|
||||
# Group audio into 30-second intervals for cleaner reference timestamps
|
||||
audio_segments = merger.load_whisper_transcript(str(transcript_file), group_interval=30)
|
||||
logger.info(f"✓ Loaded {len(audio_segments)} audio segments")
|
||||
else:
|
||||
logger.info("No transcript provided, using screen content only...")
|
||||
|
||||
Reference in New Issue
Block a user