# Source: mitus/meetus/transcript_merger.py
# Commit: 93e0c06d38 "init commit" — Mariano Gabriel, 2025-10-19 22:17:38 -03:00
# (174 lines, 5.3 KiB, Python)
"""
Merge Whisper transcripts with OCR screen content.
Creates a unified, timestamped transcript for Claude summarization.
"""
from typing import List, Dict, Optional
import json
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
class TranscriptMerger:
    """Merge audio transcripts with screen OCR text."""

    def __init__(self):
        """Initialize transcript merger (stateless; kept for API symmetry)."""
        pass

    def load_whisper_transcript(self, transcript_path: str) -> List[Dict]:
        """
        Load Whisper transcript from file.

        Supports JSON format (with timestamps) and plain text.

        Args:
            transcript_path: Path to transcript file

        Returns:
            List of dicts with 'timestamp', 'text', and 'type' == 'audio'

        Raises:
            ValueError: If a JSON file has an unrecognized structure.
        """
        path = Path(transcript_path)

        if path.suffix == '.json':
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle different Whisper output formats
            if isinstance(data, dict) and 'segments' in data:
                # Standard Whisper JSON format: {'segments': [{'start': ..., 'text': ...}]}
                return [
                    {
                        'timestamp': seg.get('start', 0),
                        'text': seg['text'].strip(),
                        'type': 'audio'
                    }
                    for seg in data['segments']
                ]
            elif isinstance(data, list):
                # Bare list of segments; accept either 'start' or 'timestamp' keys
                return [
                    {
                        'timestamp': seg.get('start', seg.get('timestamp', 0)),
                        'text': seg['text'].strip(),
                        'type': 'audio'
                    }
                    for seg in data
                ]
            elif isinstance(data, dict) and 'text' in data:
                # Whisper dict with only a full-text field and no segments
                return [{
                    'timestamp': 0,
                    'text': data['text'].strip(),
                    'type': 'audio'
                }]
            else:
                # Previously this fell through and returned None, breaking the
                # declared List[Dict] contract; fail loudly instead.
                raise ValueError(
                    f"Unrecognized JSON transcript structure in: {transcript_path}"
                )
        else:
            # Plain text file - no timestamps
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read().strip()
            return [{
                'timestamp': 0,
                'text': text,
                'type': 'audio'
            }]

    def merge_transcripts(
        self,
        audio_segments: List[Dict],
        screen_segments: List[Dict]
    ) -> List[Dict]:
        """
        Merge audio and screen transcripts by timestamp.

        Args:
            audio_segments: List of audio transcript segments
            screen_segments: List of screen OCR segments

        Returns:
            New merged list sorted by timestamp; input lists and their
            dicts are not mutated (each segment is shallow-copied with
            its 'type' set).
        """
        # Copy segments instead of writing 'type' into the caller's dicts
        # (the original implementation mutated its arguments in-place).
        all_segments = (
            [{**seg, 'type': 'audio'} for seg in audio_segments]
            + [{**seg, 'type': 'screen'} for seg in screen_segments]
        )
        # Default missing timestamps to 0, consistent with the loaders,
        # rather than raising KeyError mid-sort.
        all_segments.sort(key=lambda x: x.get('timestamp', 0))
        return all_segments

    def format_for_claude(
        self,
        merged_segments: List[Dict],
        format_style: str = "detailed"
    ) -> str:
        """
        Format merged transcript for Claude processing.

        Args:
            merged_segments: Merged transcript segments
            format_style: 'detailed' or 'compact' (any other value falls
                back to compact, matching historical behavior)

        Returns:
            Formatted transcript string
        """
        if format_style == "detailed":
            return self._format_detailed(merged_segments)
        else:
            return self._format_compact(merged_segments)

    def _format_detailed(self, segments: List[Dict]) -> str:
        """Format with clear visual separation between audio and screen content."""
        lines = []
        lines.append("=" * 80)
        lines.append("ENHANCED MEETING TRANSCRIPT")
        lines.append("Audio transcript + Screen content")
        lines.append("=" * 80)
        lines.append("")

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            if seg['type'] == 'audio':
                lines.append(f"[{timestamp}] SPEAKER:")
                lines.append(f" {seg['text']}")
                lines.append("")
            else:  # screen
                lines.append(f"[{timestamp}] SCREEN CONTENT:")
                # Indent screen text with a gutter for visibility
                screen_text = seg['text'].replace('\n', '\n | ')
                lines.append(f" | {screen_text}")
                lines.append("")

        return "\n".join(lines)

    def _format_compact(self, segments: List[Dict]) -> str:
        """Compact one-line-per-segment format for shorter transcripts."""
        lines = []
        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN"
            text = seg['text'].replace('\n', ' ')[:200]  # Truncate long screen text
            lines.append(f"[{timestamp}] {prefix}: {text}")
        return "\n".join(lines)

    def _format_timestamp(self, seconds: float) -> str:
        """Format a timestamp (in seconds) as MM:SS.

        NOTE: minutes are not wrapped at 60, so hour-long meetings render
        as e.g. 75:30 — unambiguous, and kept for backward compatibility.
        """
        minutes = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{minutes:02d}:{secs:02d}"

    def save_transcript(self, formatted_text: str, output_path: str):
        """
        Save formatted transcript to file.

        Args:
            formatted_text: Formatted transcript
            output_path: Output file path
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(formatted_text)
        logger.info(f"Saved enhanced transcript to: {output_path}")