"""
Merge Whisper transcripts with OCR screen content.

Creates a unified, timestamped transcript for Claude summarization.
"""
from collections import defaultdict
from typing import List, Dict, Optional
import json
from pathlib import Path
import logging
import base64
from io import BytesIO  # NOTE(review): unused in this module — presumably for image embedding; confirm before removing

logger = logging.getLogger(__name__)


class TranscriptMerger:
    """Merge audio transcripts with screen OCR text."""

    def __init__(self, embed_images: bool = False, embed_quality: int = 80):
        """
        Initialize transcript merger.

        Args:
            embed_images: Whether to embed frame images as base64
            embed_quality: JPEG quality for embedded images (0-100)
        """
        self.embed_images = embed_images
        self.embed_quality = embed_quality

    @staticmethod
    def _normalize_segment(seg: Dict) -> Dict:
        """
        Convert a raw Whisper/WhisperX segment dict into the internal shape.

        Falls back from 'start' to 'timestamp' to 0 for the time field, and
        to '' when 'text' is missing (instead of raising KeyError on
        malformed segments).
        """
        return {
            'timestamp': seg.get('start', seg.get('timestamp', 0)),
            'text': seg.get('text', '').strip(),
            'speaker': seg.get('speaker'),  # WhisperX diarization; None if absent
            'type': 'audio',
        }

    def load_whisper_transcript(self, transcript_path: str,
                                group_interval: Optional[int] = None) -> List[Dict]:
        """
        Load Whisper transcript from file.

        Supports both JSON format (with timestamps) and plain text.

        Args:
            transcript_path: Path to transcript file
            group_interval: If specified, group audio segments into intervals (in seconds)

        Returns:
            List of dicts with 'timestamp' (optional) and 'text'
        """
        path = Path(transcript_path)

        if path.suffix != '.json':
            # Plain text file - no timestamps available
            text = path.read_text(encoding='utf-8').strip()
            return [{
                'timestamp': 0,
                'text': text,
                'type': 'audio'
            }]

        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Handle different Whisper/WhisperX output formats
        if isinstance(data, dict) and 'segments' in data:
            raw_segments = data['segments']  # standard Whisper/WhisperX JSON
        elif isinstance(data, list):
            raw_segments = data  # bare list of segments
        else:
            raw_segments = []  # unrecognized shape -> empty transcript

        segments = [self._normalize_segment(seg) for seg in raw_segments]

        # Group by interval if requested, but skip when speaker diarization is
        # present (merge_transcripts will group by speaker instead).
        has_speakers = any(seg.get('speaker') for seg in segments)
        if group_interval and segments and not has_speakers:
            segments = self.group_audio_by_intervals(segments, group_interval)

        return segments

    def group_audio_by_intervals(self, segments: List[Dict],
                                 interval_seconds: int = 30) -> List[Dict]:
        """
        Group audio segments into regular time intervals.

        Instead of word-level timestamps, this creates intervals (e.g., every
        30 seconds) with all text spoken during that interval concatenated
        together.

        Args:
            segments: List of audio segments with timestamps
            interval_seconds: Duration of each interval in seconds

        Returns:
            List of grouped segments with interval timestamps, in ascending
            interval order; intervals with no text are omitted
        """
        if not segments:
            return []

        # Single pass: bucket each segment by its interval index — O(n)
        # instead of re-scanning every segment once per interval.
        buckets: Dict[int, List[str]] = defaultdict(list)
        for seg in segments:
            index = int(seg['timestamp'] // interval_seconds)
            # Segments before t=0 never fell inside any interval previously;
            # preserve that by skipping negative indices.
            if index >= 0:
                buckets[index].append(seg['text'])

        intervals = [
            {
                'timestamp': index * interval_seconds,
                'text': ' '.join(texts),
                'type': 'audio'
            }
            for index, texts in sorted(buckets.items())
        ]

        logger.info(f"Grouped {len(segments)} segments into {len(intervals)} intervals of {interval_seconds}s")
        return intervals

    def _encode_image_base64(self, image_path: str) -> tuple[str, int]:
        """
        Encode image as base64 (image already at target quality/size).

        Args:
            image_path: Path to image file

        Returns:
            Tuple of (base64_string, size_in_bytes); ("", 0) on any failure
        """
        try:
            # Read file directly (already at target quality/resolution)
            with open(image_path, 'rb') as f:
                img_bytes = f.read()

            # Encode to base64
            b64_string = base64.b64encode(img_bytes).decode('utf-8')

            logger.debug(f"Encoded {Path(image_path).name}: {len(img_bytes)} bytes")
            return b64_string, len(img_bytes)

        except Exception as e:
            # Deliberate best-effort: a missing/unreadable frame should not
            # abort the whole merge — log and return an empty sentinel.
            logger.error(f"Failed to encode image {image_path}: {e}")
            return "", 0

    def merge_transcripts(
        self,
        audio_segments: List[Dict],
        screen_segments: List[Dict]
    ) -> List[Dict]:
        """
        Merge audio and screen transcripts by timestamp.

        Groups consecutive audio from same speaker until a screen frame
        interrupts.

        Note: mutates the input segment dicts in place (sets their 'type').

        Args:
            audio_segments: List of audio transcript segments
            screen_segments: List of screen OCR segments

        Returns:
            Merged list sorted by timestamp, with audio grouped by speaker
        """
        # Mark segment types
        for seg in audio_segments:
            seg['type'] = 'audio'
        for seg in screen_segments:
            seg['type'] = 'screen'

        # Combine and sort by timestamp
        all_segments = audio_segments + screen_segments
        all_segments.sort(key=lambda x: x['timestamp'])

        # Group consecutive audio segments by speaker (screen frames break groups)
        grouped = []
        current_group = None

        for seg in all_segments:
            if seg['type'] == 'screen':
                # Screen frame: flush current group and add frame
                if current_group:
                    grouped.append(current_group)
                    current_group = None
                grouped.append(seg)
            else:
                # Audio segment
                speaker = seg.get('speaker')

                if current_group is None:
                    # Start new group (timestamp = first segment in the group)
                    current_group = {
                        'timestamp': seg['timestamp'],
                        'text': seg['text'],
                        'speaker': speaker,
                        'type': 'audio'
                    }
                elif speaker == current_group.get('speaker'):
                    # Same speaker, append text
                    current_group['text'] += ' ' + seg['text']
                else:
                    # Speaker changed, flush and start new group
                    grouped.append(current_group)
                    current_group = {
                        'timestamp': seg['timestamp'],
                        'text': seg['text'],
                        'speaker': speaker,
                        'type': 'audio'
                    }

        # Don't forget last group
        if current_group:
            grouped.append(current_group)

        return grouped

    def format_for_claude(
        self,
        merged_segments: List[Dict],
        format_style: str = "detailed"
    ) -> str:
        """
        Format merged transcript for Claude processing.

        Args:
            merged_segments: Merged transcript segments
            format_style: 'detailed' or 'compact' (anything else falls back
                to compact)

        Returns:
            Formatted transcript string
        """
        if format_style == "detailed":
            return self._format_detailed(merged_segments)
        else:
            return self._format_compact(merged_segments)

    def _format_detailed(self, segments: List[Dict]) -> str:
        """Format with clear visual separation between audio and screen content."""
        lines = []
        lines.append("=" * 80)
        lines.append("ENHANCED MEETING TRANSCRIPT")
        lines.append("Audio transcript + Screen frames")
        lines.append("=" * 80)
        lines.append("")

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])

            if seg['type'] == 'audio':
                # Use `or` (not a .get default): merge_transcripts stores an
                # explicit 'speaker': None, so the key exists and a default
                # would never apply — this previously printed "None:".
                speaker = seg.get('speaker') or 'SPEAKER'
                lines.append(f"[{timestamp}] {speaker}:")
                lines.append(f"  {seg['text']}")
                lines.append("")
            else:  # screen
                lines.append(f"[{timestamp}] SCREEN CONTENT:")

                # Show frame path if available
                if 'frame_path' in seg:
                    # Get just the filename relative to the enhanced transcript
                    frame_path = Path(seg['frame_path'])
                    relative_path = f"frames/{frame_path.name}"
                    lines.append(f"  Frame: {relative_path}")

                # Include text content if available (fallback or additional context)
                if 'text' in seg and seg['text'].strip():
                    screen_text = seg['text'].replace('\n', '\n  | ')
                    lines.append(f"  TEXT:")
                    lines.append(f"  | {screen_text}")

                lines.append("")

        return "\n".join(lines)

    def _format_compact(self, segments: List[Dict]) -> str:
        """Compact format for shorter transcripts."""
        lines = []

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])

            if seg['type'] == 'audio':
                # `or` guards against an explicit 'speaker': None (see
                # _format_detailed) — .get's default would not apply.
                prefix = seg.get('speaker') or 'SPEAKER'
            else:
                prefix = "SCREEN"

            text = seg['text'].replace('\n', ' ')[:200]  # Truncate long screen text
            lines.append(f"[{timestamp}] {prefix}: {text}")

        return "\n".join(lines)

    def _format_timestamp(self, seconds: float) -> str:
        """Format timestamp as MM:SS (minutes are not capped at 59)."""
        minutes = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{minutes:02d}:{secs:02d}"

    def save_transcript(self, formatted_text: str, output_path: str):
        """
        Save formatted transcript to file.

        Args:
            formatted_text: Formatted transcript
            output_path: Output file path
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(formatted_text)

        logger.info(f"Saved enhanced transcript to: {output_path}")