init commit
This commit is contained in:
173
meetus/transcript_merger.py
Normal file
173
meetus/transcript_merger.py
Normal file
@@ -0,0 +1,173 @@
|
||||
"""
|
||||
Merge Whisper transcripts with OCR screen content.
|
||||
Creates a unified, timestamped transcript for Claude summarization.
|
||||
"""
|
||||
from typing import List, Dict, Optional
|
||||
import json
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TranscriptMerger:
    """Merge Whisper audio transcripts with screen OCR segments.

    Produces a single timestamp-ordered transcript and formats it as
    plain text suitable for Claude summarization.
    """

    def __init__(self):
        """Initialize transcript merger (stateless; nothing to set up)."""
        pass

    def load_whisper_transcript(self, transcript_path: str) -> List[Dict]:
        """
        Load a Whisper transcript from file.

        Supports both JSON format (with timestamps) and plain text.

        Args:
            transcript_path: Path to transcript file (.json or plain text)

        Returns:
            List of dicts with 'timestamp', 'text' and 'type' keys

        Raises:
            ValueError: If a .json file matches no known Whisper layout.
        """
        path = Path(transcript_path)

        if path.suffix == '.json':
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return self._parse_json_transcript(data, path)

        # Plain text file - no timestamps available.
        text = path.read_text(encoding='utf-8').strip()
        return [{
            'timestamp': 0,
            'text': text,
            'type': 'audio'
        }]

    def _parse_json_transcript(self, data, path: Path) -> List[Dict]:
        """Normalize the known Whisper JSON layouts into segment dicts."""
        if isinstance(data, dict) and 'segments' in data:
            # Standard Whisper JSON: {"text": ..., "segments": [...]}.
            segments = data['segments']
        elif isinstance(data, list):
            # Bare list of segment dicts.
            segments = data
        elif isinstance(data, dict) and 'text' in data:
            # Dict carrying only the full text: one untimed segment.
            return [{
                'timestamp': 0,
                'text': str(data['text']).strip(),
                'type': 'audio'
            }]
        else:
            # BUG FIX: the original fell through and implicitly returned
            # None here, crashing callers that expect a list. Fail loudly.
            raise ValueError(f"Unrecognized Whisper JSON format in {path}")

        return [
            {
                # Some Whisper variants use 'start', others 'timestamp'.
                'timestamp': seg.get('start', seg.get('timestamp', 0)),
                # .get() so a malformed segment yields '' not KeyError.
                'text': seg.get('text', '').strip(),
                'type': 'audio'
            }
            for seg in segments
        ]

    def merge_transcripts(
        self,
        audio_segments: List[Dict],
        screen_segments: List[Dict]
    ) -> List[Dict]:
        """
        Merge audio and screen transcripts by timestamp.

        Args:
            audio_segments: List of audio transcript segments
            screen_segments: List of screen OCR segments

        Returns:
            New merged list sorted by timestamp; inputs are not mutated.
        """
        # BUG FIX: the original tagged 'type' on the caller's dicts in
        # place; copy each segment so the input lists stay untouched.
        merged = [{**seg, 'type': 'audio'} for seg in audio_segments]
        merged += [{**seg, 'type': 'screen'} for seg in screen_segments]

        # .get() so untimed segments (plain-text loads) sort to the front
        # instead of raising KeyError. Stable sort keeps audio before
        # screen on timestamp ties, matching the original ordering.
        merged.sort(key=lambda seg: seg.get('timestamp', 0))
        return merged

    def format_for_claude(
        self,
        merged_segments: List[Dict],
        format_style: str = "detailed"
    ) -> str:
        """
        Format merged transcript for Claude processing.

        Args:
            merged_segments: Merged transcript segments
            format_style: 'detailed' or 'compact'

        Returns:
            Formatted transcript string
        """
        if format_style == "detailed":
            return self._format_detailed(merged_segments)
        return self._format_compact(merged_segments)

    def _format_detailed(self, segments: List[Dict]) -> str:
        """Format with clear visual separation between audio and screen content."""
        lines = []
        lines.append("=" * 80)
        lines.append("ENHANCED MEETING TRANSCRIPT")
        lines.append("Audio transcript + Screen content")
        lines.append("=" * 80)
        lines.append("")

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])

            if seg['type'] == 'audio':
                lines.append(f"[{timestamp}] SPEAKER:")
                lines.append(f"  {seg['text']}")
                lines.append("")
            else:  # screen
                lines.append(f"[{timestamp}] SCREEN CONTENT:")
                # Indent screen text with a gutter for visibility.
                screen_text = seg['text'].replace('\n', '\n  | ')
                lines.append(f"  | {screen_text}")
                lines.append("")

        return "\n".join(lines)

    def _format_compact(self, segments: List[Dict]) -> str:
        """Compact format for shorter transcripts."""
        lines = []

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN"
            # Flatten newlines and truncate long screen text.
            text = seg['text'].replace('\n', ' ')[:200]
            lines.append(f"[{timestamp}] {prefix}: {text}")

        return "\n".join(lines)

    def _format_timestamp(self, seconds: float) -> str:
        """Format timestamp as MM:SS."""
        minutes = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{minutes:02d}:{secs:02d}"

    def save_transcript(self, formatted_text: str, output_path: str):
        """
        Save formatted transcript to file.

        Args:
            formatted_text: Formatted transcript
            output_path: Output file path
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(formatted_text)

        logger.info(f"Saved enhanced transcript to: {output_path}")
Reference in New Issue
Block a user