"""
Merge Whisper transcripts with OCR screen content.

Creates a unified, timestamped transcript for Claude summarization.
"""

from typing import List, Dict, Optional
import json
from pathlib import Path
import logging

logger = logging.getLogger(__name__)


class TranscriptMerger:
    """Merge audio transcripts with screen OCR text.

    Segments are plain dicts with the keys 'timestamp' (seconds),
    'text' and 'type' ('audio' or 'screen').
    """

    def __init__(self) -> None:
        """Initialize transcript merger (no state is kept)."""
        pass

    def load_whisper_transcript(self, transcript_path: str) -> List[Dict]:
        """
        Load Whisper transcript from file.

        Supports both JSON format (with timestamps) and plain text.
        Recognized JSON shapes:

        * a dict with a 'segments' list (each segment has 'text' and
          optionally 'start'),
        * a bare list of segments (each has 'text' and optionally
          'start' or 'timestamp'),
        * a dict with only a top-level 'text' string (no timestamps).

        Args:
            transcript_path: Path to transcript file

        Returns:
            List of dicts with 'timestamp', 'text' and 'type' ('audio')

        Raises:
            ValueError: If a JSON file has none of the recognized shapes.
        """
        path = Path(transcript_path)

        # Anything that is not JSON is treated as plain text without
        # timestamps. The suffix check is case-insensitive so '.JSON'
        # files are parsed rather than dumped in verbatim.
        if path.suffix.lower() != '.json':
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read().strip()
            return [{'timestamp': 0, 'text': text, 'type': 'audio'}]

        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)

        if isinstance(data, dict) and 'segments' in data:
            # Dict carrying a 'segments' list
            return [
                {
                    'timestamp': seg.get('start', 0),
                    'text': seg['text'].strip(),
                    'type': 'audio',
                }
                for seg in data['segments']
            ]

        if isinstance(data, list):
            # Bare list of segments
            return [
                {
                    'timestamp': seg.get('start', seg.get('timestamp', 0)),
                    'text': seg['text'].strip(),
                    'type': 'audio',
                }
                for seg in data
            ]

        if isinstance(data, dict) and isinstance(data.get('text'), str):
            # Only the full text is available: one segment at time zero,
            # mirroring the plain-text path.
            return [{
                'timestamp': 0,
                'text': data['text'].strip(),
                'type': 'audio',
            }]

        # Previously this fell through and returned None, which only
        # surfaced later as an unrelated TypeError in the caller.
        raise ValueError(
            f"Unrecognized transcript JSON structure in: {transcript_path}"
        )

    def merge_transcripts(
        self,
        audio_segments: List[Dict],
        screen_segments: List[Dict]
    ) -> List[Dict]:
        """
        Merge audio and screen transcripts by timestamp.

        The input lists and their dicts are left untouched; the result
        holds shallow copies tagged with the segment type. A missing or
        None 'timestamp' is treated as 0.

        Args:
            audio_segments: List of audio transcript segments
            screen_segments: List of screen OCR segments

        Returns:
            Merged list sorted by timestamp. The sort is stable, so at
            equal timestamps audio segments come before screen segments.
        """
        def tagged(seg: Dict, kind: str) -> Dict:
            # Copy so the caller's dicts are not modified.
            timestamp = seg.get('timestamp')
            if timestamp is None:
                timestamp = 0
            return {**seg, 'timestamp': timestamp, 'type': kind}

        all_segments = [tagged(seg, 'audio') for seg in audio_segments or []]
        all_segments.extend(
            tagged(seg, 'screen') for seg in screen_segments or []
        )
        all_segments.sort(key=lambda seg: seg['timestamp'])
        return all_segments

    def format_for_claude(
        self,
        merged_segments: List[Dict],
        format_style: str = "detailed"
    ) -> str:
        """
        Format merged transcript for Claude processing.

        Args:
            merged_segments: Merged transcript segments
            format_style: 'detailed' or 'compact'. Any other value falls
                back to 'compact' and logs a warning.

        Returns:
            Formatted transcript string
        """
        if format_style == "detailed":
            return self._format_detailed(merged_segments)

        if format_style != "compact":
            # Keep the historical fallback, but make typos visible.
            logger.warning(
                "Unknown format_style %r; falling back to 'compact'",
                format_style,
            )
        return self._format_compact(merged_segments)

    def _format_detailed(self, segments: List[Dict]) -> str:
        """Format with clear visual separation between audio and screen content."""
        lines = [
            "=" * 80,
            "ENHANCED MEETING TRANSCRIPT",
            "Audio transcript + Screen content",
            "=" * 80,
            "",
        ]

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])

            if seg['type'] == 'audio':
                lines.append(f"[{timestamp}] SPEAKER:")
                lines.append(f" {seg['text']}")
            else:
                # Any non-audio segment is rendered as screen content.
                lines.append(f"[{timestamp}] SCREEN CONTENT:")
                # Prefix every line of the screen text with a bar so
                # multi-line OCR output stays visually grouped.
                screen_text = seg['text'].replace('\n', '\n | ')
                lines.append(f" | {screen_text}")
            lines.append("")

        return "\n".join(lines)

    def _format_compact(self, segments: List[Dict]) -> str:
        """Compact format: one line per segment.

        Newlines inside a segment are flattened to spaces and the text of
        every segment (audio and screen alike) is cut to 200 characters.
        """
        lines = []

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN"
            text = seg['text'].replace('\n', ' ')[:200]
            lines.append(f"[{timestamp}] {prefix}: {text}")

        return "\n".join(lines)

    def _format_timestamp(self, seconds: float) -> str:
        """Format timestamp as MM:SS.

        Minutes are not wrapped at 60, so 3700 seconds yields '61:40'.
        """
        minutes = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{minutes:02d}:{secs:02d}"

    def save_transcript(self, formatted_text: str, output_path: str) -> None:
        """
        Save formatted transcript to file.

        Args:
            formatted_text: Formatted transcript
            output_path: Output file path
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(formatted_text)

        logger.info("Saved enhanced transcript to: %s", output_path)