"""
Merge Whisper transcripts with OCR screen content.

Creates a unified, timestamped transcript for Claude summarization.
"""
from typing import List, Dict, Optional
import json
from pathlib import Path
import logging
import base64
from io import BytesIO

logger = logging.getLogger(__name__)


class TranscriptMerger:
    """Merge audio transcripts with screen OCR text."""

    def __init__(self, embed_images: bool = False, embed_quality: int = 80):
        """
        Initialize transcript merger.

        Args:
            embed_images: Whether to embed frame images as base64
            embed_quality: JPEG quality for embedded images (0-100)
        """
        self.embed_images = embed_images
        # Stored for callers; images are embedded as-is by this class.
        self.embed_quality = embed_quality

    def load_whisper_transcript(self, transcript_path: str, group_interval: Optional[int] = None) -> List[Dict]:
        """
        Load Whisper transcript from file.

        Supports both JSON format (with timestamps) and plain text.

        Args:
            transcript_path: Path to transcript file
            group_interval: If specified, group audio segments into intervals (in seconds)

        Returns:
            List of dicts with 'timestamp', 'text' and 'type' ('audio').
            Plain text files yield a single segment at timestamp 0.
        """
        path = Path(transcript_path)

        # Compare the suffix case-insensitively so "meeting.JSON" is still
        # parsed as JSON instead of being dumped in as raw text.
        if path.suffix.lower() != '.json':
            # Plain text file - no timestamps
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read().strip()
            return [{
                'timestamp': 0,
                'text': text,
                'type': 'audio'
            }]

        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        # Handle different Whisper output formats
        if isinstance(data, dict) and 'segments' in data:
            # Standard Whisper JSON format
            segments = [
                {
                    'timestamp': seg.get('start', 0),
                    'text': seg['text'].strip(),
                    'type': 'audio'
                }
                for seg in data['segments']
            ]
        elif isinstance(data, list):
            # List of segments; accept either 'start' or 'timestamp' keys
            segments = [
                {
                    'timestamp': seg.get('start', seg.get('timestamp', 0)),
                    'text': seg['text'].strip(),
                    'type': 'audio'
                }
                for seg in data
            ]
        else:
            # Unknown layout: keep returning an empty list, but say so
            # instead of failing silently.
            logger.warning(f"Unrecognized transcript JSON structure in {path}; no segments loaded")
            segments = []

        # Group by interval if requested
        if group_interval and segments:
            segments = self.group_audio_by_intervals(segments, group_interval)

        return segments

    def group_audio_by_intervals(self, segments: List[Dict], interval_seconds: int = 30) -> List[Dict]:
        """
        Group audio segments into regular time intervals.

        Instead of word-level timestamps, this creates intervals (e.g., every
        30 seconds) with all text spoken during that interval concatenated
        together. Intervals without any text are omitted.

        Args:
            segments: List of audio segments with timestamps
            interval_seconds: Duration of each interval in seconds

        Returns:
            List of grouped segments with interval timestamps, ordered by time

        Raises:
            ValueError: If interval_seconds is not positive
        """
        if not segments:
            return []

        if interval_seconds <= 0:
            raise ValueError(f"interval_seconds must be positive, got {interval_seconds}")

        # Single pass: drop each segment's text into its interval bucket,
        # rather than rescanning all segments once per interval.
        buckets: Dict[int, List[str]] = {}
        for seg in segments:
            ts = seg['timestamp']
            if ts < 0:
                # Intervals start at 0, so negative timestamps belong to none
                continue
            buckets.setdefault(int(ts // interval_seconds), []).append(seg['text'])

        intervals = [
            {
                'timestamp': idx * interval_seconds,
                'text': ' '.join(buckets[idx]),
                'type': 'audio'
            }
            for idx in sorted(buckets)
        ]

        logger.info(f"Grouped {len(segments)} segments into {len(intervals)} intervals of {interval_seconds}s")
        return intervals

    def _encode_image_base64(self, image_path: str) -> tuple[str, int]:
        """
        Encode image as base64 (image already at target quality/size).

        Args:
            image_path: Path to image file

        Returns:
            Tuple of (base64_string, size_in_bytes); ("", 0) if the file
            could not be read or encoded
        """
        try:
            # Read file directly (already at target quality/resolution)
            with open(image_path, 'rb') as f:
                img_bytes = f.read()

            # Encode to base64
            b64_string = base64.b64encode(img_bytes).decode('utf-8')

            logger.debug(f"Encoded {Path(image_path).name}: {len(img_bytes)} bytes")
            return b64_string, len(img_bytes)
        except Exception as e:
            # Deliberately best-effort: one unreadable frame must not abort
            # formatting of the whole transcript.
            logger.error(f"Failed to encode image {image_path}: {e}")
            return "", 0

    def merge_transcripts(
        self,
        audio_segments: List[Dict],
        screen_segments: List[Dict]
    ) -> List[Dict]:
        """
        Merge audio and screen transcripts by timestamp.

        Note: the segment dicts passed in are tagged in place with a 'type'
        key ('audio' or 'screen'); the returned list holds those same dicts.

        Args:
            audio_segments: List of audio transcript segments
            screen_segments: List of screen OCR segments

        Returns:
            Merged list sorted by timestamp (audio first on equal timestamps)
        """
        # Mark segment types
        for seg in audio_segments:
            seg['type'] = 'audio'
        for seg in screen_segments:
            seg['type'] = 'screen'

        # Combine and sort by timestamp; the sort is stable, so audio
        # segments precede screen segments that share a timestamp.
        all_segments = audio_segments + screen_segments
        all_segments.sort(key=lambda x: x['timestamp'])

        return all_segments

    def format_for_claude(
        self,
        merged_segments: List[Dict],
        format_style: str = "detailed"
    ) -> str:
        """
        Format merged transcript for Claude processing.

        Args:
            merged_segments: Merged transcript segments
            format_style: 'detailed' or 'compact' (any value other than
                'detailed' selects the compact format)

        Returns:
            Formatted transcript string
        """
        if format_style == "detailed":
            return self._format_detailed(merged_segments)
        else:
            return self._format_compact(merged_segments)

    def _format_detailed(self, segments: List[Dict]) -> str:
        """Format with clear visual separation between audio and screen content."""
        lines = []
        lines.append("=" * 80)
        lines.append("ENHANCED MEETING TRANSCRIPT")
        if self.embed_images:
            lines.append("Audio transcript + Embedded frame images (base64)")
        else:
            lines.append("Audio transcript + Screen content")
        lines.append("=" * 80)
        lines.append("")

        total_image_bytes = 0
        embedded_count = 0  # images actually embedded, for accurate logging

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])

            if seg['type'] == 'audio':
                lines.append(f"[{timestamp}] SPEAKER:")
                lines.append(f" {seg['text']}")
                lines.append("")
            else:  # screen
                lines.append(f"[{timestamp}] SCREEN CONTENT:")

                # Embed image if requested
                if self.embed_images and 'frame_path' in seg:
                    b64_img, img_size = self._encode_image_base64(seg['frame_path'])
                    total_image_bytes += img_size

                    if b64_img:
                        embedded_count += 1
                        lines.append(f" IMAGE (base64, {img_size // 1024}KB):")
                        lines.append(f" data:image/jpeg;base64,{b64_img}")
                        lines.append("")

                # Include text content if available (fallback or additional
                # context). A missing or None 'text' is treated as no text.
                raw_text = seg.get('text') or ''
                if raw_text.strip():
                    screen_text = raw_text.replace('\n', '\n | ')
                    lines.append(" TEXT:")
                    lines.append(f" | {screen_text}")

                lines.append("")

        if self.embed_images and total_image_bytes > 0:
            total_mb = total_image_bytes / (1024 * 1024)
            lines.append("")
            lines.append(f"Total embedded images size: {total_mb:.2f} MB")
            logger.info(f"Embedded {embedded_count} images, total size: {total_mb:.2f} MB")

        return "\n".join(lines)

    def _format_compact(self, segments: List[Dict]) -> str:
        """Compact format for shorter transcripts (one line per segment)."""
        lines = []
        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN"
            # Screen segments may carry only a frame image and no text;
            # treat missing/None text as empty, as the detailed format does.
            text = (seg.get('text') or '').replace('\n', ' ')[:200]  # Truncate long screen text
            lines.append(f"[{timestamp}] {prefix}: {text}")

        return "\n".join(lines)

    def _format_timestamp(self, seconds: float) -> str:
        """Format timestamp as MM:SS (minutes are not wrapped at 60)."""
        minutes = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{minutes:02d}:{secs:02d}"

    def save_transcript(self, formatted_text: str, output_path: str):
        """
        Save formatted transcript to file.

        Args:
            formatted_text: Formatted transcript
            output_path: Output file path
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(formatted_text)

        logger.info(f"Saved enhanced transcript to: {output_path}")