# Source: mitus/meetus/transcript_merger.py
# Commit: 93e0c06d38 "init commit" — Mariano Gabriel, 2025-10-19 22:17:38 -03:00
# (174 lines, 5.3 KiB, Python)
"""
Merge Whisper transcripts with OCR screen content.
Creates a unified, timestamped transcript for Claude summarization.
"""
from typing import List, Dict, Optional
import json
from pathlib import Path
import logging
logger = logging.getLogger(__name__)
class TranscriptMerger:
    """Merge audio transcripts with screen OCR text."""

    def __init__(self):
        """Initialize transcript merger (stateless; kept for API symmetry)."""
        pass

    def load_whisper_transcript(self, transcript_path: str) -> List[Dict]:
        """
        Load Whisper transcript from file.

        Supports JSON format (with timestamps) and plain text.

        Args:
            transcript_path: Path to transcript file

        Returns:
            List of dicts with 'timestamp', 'text', and 'type' == 'audio'

        Raises:
            ValueError: If a JSON file has an unrecognized structure.
        """
        path = Path(transcript_path)

        if path.suffix == '.json':
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle different Whisper output formats
            if isinstance(data, dict) and 'segments' in data:
                # Standard Whisper JSON format: {'segments': [{'start': ..., 'text': ...}]}
                return [
                    {
                        'timestamp': seg.get('start', 0),
                        'text': seg['text'].strip(),
                        'type': 'audio'
                    }
                    for seg in data['segments']
                ]
            elif isinstance(data, list):
                # Bare list of segments; accept either 'start' or 'timestamp' keys
                return [
                    {
                        'timestamp': seg.get('start', seg.get('timestamp', 0)),
                        'text': seg['text'].strip(),
                        'type': 'audio'
                    }
                    for seg in data
                ]
            elif isinstance(data, dict) and 'text' in data:
                # Whisper dict with only a full-text field and no segments
                return [{
                    'timestamp': 0,
                    'text': data['text'].strip(),
                    'type': 'audio'
                }]
            else:
                # Previously this fell through and returned None, breaking the
                # declared List[Dict] contract; fail loudly instead.
                raise ValueError(
                    f"Unrecognized JSON transcript structure in: {transcript_path}"
                )
        else:
            # Plain text file - no timestamps
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read().strip()
            return [{
                'timestamp': 0,
                'text': text,
                'type': 'audio'
            }]

    def merge_transcripts(
        self,
        audio_segments: List[Dict],
        screen_segments: List[Dict]
    ) -> List[Dict]:
        """
        Merge audio and screen transcripts by timestamp.

        Args:
            audio_segments: List of audio transcript segments
            screen_segments: List of screen OCR segments

        Returns:
            New merged list sorted by timestamp; input lists and their
            dicts are not mutated (each segment is shallow-copied with
            its 'type' set).
        """
        # Copy segments instead of writing 'type' into the caller's dicts
        # (the original implementation mutated its arguments in-place).
        all_segments = (
            [{**seg, 'type': 'audio'} for seg in audio_segments]
            + [{**seg, 'type': 'screen'} for seg in screen_segments]
        )
        # Default missing timestamps to 0, consistent with the loaders,
        # rather than raising KeyError mid-sort.
        all_segments.sort(key=lambda x: x.get('timestamp', 0))
        return all_segments

    def format_for_claude(
        self,
        merged_segments: List[Dict],
        format_style: str = "detailed"
    ) -> str:
        """
        Format merged transcript for Claude processing.

        Args:
            merged_segments: Merged transcript segments
            format_style: 'detailed' or 'compact' (any other value falls
                back to compact, matching historical behavior)

        Returns:
            Formatted transcript string
        """
        if format_style == "detailed":
            return self._format_detailed(merged_segments)
        else:
            return self._format_compact(merged_segments)

    def _format_detailed(self, segments: List[Dict]) -> str:
        """Format with clear visual separation between audio and screen content."""
        lines = []
        lines.append("=" * 80)
        lines.append("ENHANCED MEETING TRANSCRIPT")
        lines.append("Audio transcript + Screen content")
        lines.append("=" * 80)
        lines.append("")

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            if seg['type'] == 'audio':
                lines.append(f"[{timestamp}] SPEAKER:")
                lines.append(f" {seg['text']}")
                lines.append("")
            else:  # screen
                lines.append(f"[{timestamp}] SCREEN CONTENT:")
                # Indent screen text with a gutter for visibility
                screen_text = seg['text'].replace('\n', '\n | ')
                lines.append(f" | {screen_text}")
                lines.append("")

        return "\n".join(lines)

    def _format_compact(self, segments: List[Dict]) -> str:
        """Compact one-line-per-segment format for shorter transcripts."""
        lines = []
        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN"
            text = seg['text'].replace('\n', ' ')[:200]  # Truncate long screen text
            lines.append(f"[{timestamp}] {prefix}: {text}")
        return "\n".join(lines)

    def _format_timestamp(self, seconds: float) -> str:
        """Format a timestamp (in seconds) as MM:SS.

        NOTE: minutes are not wrapped at 60, so hour-long meetings render
        as e.g. 75:30 — unambiguous, and kept for backward compatibility.
        """
        minutes = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{minutes:02d}:{secs:02d}"

    def save_transcript(self, formatted_text: str, output_path: str):
        """
        Save formatted transcript to file.

        Args:
            formatted_text: Formatted transcript
            output_path: Output file path
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(formatted_text)
        logger.info(f"Saved enhanced transcript to: {output_path}")