mitus/meetus/transcript_merger.py

"""
Merge Whisper transcripts with OCR screen content.
Creates a unified, timestamped transcript for Claude summarization.
"""
from typing import List, Dict, Optional
import json
from pathlib import Path
import logging
import base64
from io import BytesIO
logger = logging.getLogger(__name__)


class TranscriptMerger:
    """Merge audio transcripts with screen OCR text."""

    def __init__(self, embed_images: bool = False, embed_quality: int = 80):
        """
        Initialize transcript merger.

        Args:
            embed_images: Whether to embed frame images as base64
            embed_quality: JPEG quality for embedded images (0-100)
        """
        self.embed_images = embed_images
        self.embed_quality = embed_quality

    def load_whisper_transcript(self, transcript_path: str, group_interval: Optional[int] = None) -> List[Dict]:
        """
        Load Whisper transcript from file.

        Supports both JSON format (with timestamps) and plain text.

        Args:
            transcript_path: Path to transcript file
            group_interval: If specified, group audio segments into intervals (in seconds)

        Returns:
            List of dicts with 'timestamp' (optional) and 'text'
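
        Example (illustrative; assumes a WhisperX-style JSON file whose
        'segments' entries carry 'start', 'text', and optionally 'speaker'):

            merger = TranscriptMerger()
            segments = merger.load_whisper_transcript("meeting.json")
            # -> [{'timestamp': 0.0, 'text': 'Hello', 'speaker': 'SPEAKER_00',
            #      'type': 'audio'}, ...]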
"""
path = Path(transcript_path)
if path.suffix == '.json':
with open(path, 'r', encoding='utf-8') as f:
data = json.load(f)
# Handle different Whisper/WhisperX output formats
segments = []
if isinstance(data, dict) and 'segments' in data:
# Standard Whisper/WhisperX JSON format
segments = [
{
'timestamp': seg.get('start', 0),
'text': seg['text'].strip(),
'speaker': seg.get('speaker'), # WhisperX diarization
'type': 'audio'
}
for seg in data['segments']
]
elif isinstance(data, list):
# List of segments
segments = [
{
'timestamp': seg.get('start', seg.get('timestamp', 0)),
'text': seg['text'].strip(),
'speaker': seg.get('speaker'), # WhisperX diarization
'type': 'audio'
}
for seg in data
]
# Group by interval if requested, but skip if we have speaker diarization
# (merge_transcripts will group by speaker instead)
has_speakers = any(seg.get('speaker') for seg in segments)
if group_interval and segments and not has_speakers:
segments = self.group_audio_by_intervals(segments, group_interval)
return segments
else:
# Plain text file - no timestamps
with open(path, 'r', encoding='utf-8') as f:
text = f.read().strip()
return [{
'timestamp': 0,
'text': text,
'type': 'audio'
}]

    def group_audio_by_intervals(self, segments: List[Dict], interval_seconds: int = 30) -> List[Dict]:
        """
        Group audio segments into regular time intervals.

        Instead of word-level timestamps, this creates intervals (e.g., every
        30 seconds) with all text spoken during that interval concatenated
        together.

        Args:
            segments: List of audio segments with timestamps
            interval_seconds: Duration of each interval in seconds

        Returns:
            List of grouped segments with interval timestamps
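
        Example (illustrative input/output shapes):

            segments = [{'timestamp': 2.0, 'text': 'Hi'},
                        {'timestamp': 31.5, 'text': 'Next point'}]
            merger.group_audio_by_intervals(segments, interval_seconds=30)
            # -> [{'timestamp': 0, 'text': 'Hi', 'type': 'audio'},
            #     {'timestamp': 30, 'text': 'Next point', 'type': 'audio'}]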
"""
if not segments:
return []
# Find the max timestamp to determine how many intervals we need
max_timestamp = max(seg['timestamp'] for seg in segments)
num_intervals = int(max_timestamp / interval_seconds) + 1
# Create interval buckets
intervals = []
for i in range(num_intervals):
interval_start = i * interval_seconds
interval_end = (i + 1) * interval_seconds
# Collect all text in this interval
texts = []
for seg in segments:
if interval_start <= seg['timestamp'] < interval_end:
texts.append(seg['text'])
# Only create interval if there's text
if texts:
intervals.append({
'timestamp': interval_start,
'text': ' '.join(texts),
'type': 'audio'
})
logger.info(f"Grouped {len(segments)} segments into {len(intervals)} intervals of {interval_seconds}s")
return intervals

    def _encode_image_base64(self, image_path: str) -> Tuple[str, int]:
        """
        Encode image as base64 (image already at target quality/size).

        Args:
            image_path: Path to image file

        Returns:
            Tuple of (base64_string, size_in_bytes)
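
        Example (illustrative; the frame path is a hypothetical placeholder):

            b64, size = merger._encode_image_base64("frames/frame_0001.jpg")
            # b64 can be used in a data URI: f"data:image/jpeg;base64,{b64}"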
"""
try:
# Read file directly (already at target quality/resolution)
with open(image_path, 'rb') as f:
img_bytes = f.read()
# Encode to base64
b64_string = base64.b64encode(img_bytes).decode('utf-8')
logger.debug(f"Encoded {Path(image_path).name}: {len(img_bytes)} bytes")
return b64_string, len(img_bytes)
except Exception as e:
logger.error(f"Failed to encode image {image_path}: {e}")
return "", 0

    def merge_transcripts(
        self,
        audio_segments: List[Dict],
        screen_segments: List[Dict]
    ) -> List[Dict]:
        """
        Merge audio and screen transcripts by timestamp.

        Groups consecutive audio from the same speaker until a screen frame
        interrupts.

        Args:
            audio_segments: List of audio transcript segments
            screen_segments: List of screen OCR segments

        Returns:
            Merged list sorted by timestamp, with audio grouped by speaker
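
        Example (illustrative; a screen frame splits two segments from the
        same speaker into separate groups):

            audio = [{'timestamp': 0.0, 'text': 'Hi', 'speaker': 'A'},
                     {'timestamp': 5.0, 'text': 'there', 'speaker': 'A'}]
            screen = [{'timestamp': 3.0, 'text': 'Slide 1'}]
            merger.merge_transcripts(audio, screen)
            # -> audio group at 0.0, screen frame at 3.0, audio group at 5.0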
"""
# Mark segment types
for seg in audio_segments:
seg['type'] = 'audio'
for seg in screen_segments:
seg['type'] = 'screen'
# Combine and sort by timestamp
all_segments = audio_segments + screen_segments
all_segments.sort(key=lambda x: x['timestamp'])
# Group consecutive audio segments by speaker (screen frames break groups)
grouped = []
current_group = None
for seg in all_segments:
if seg['type'] == 'screen':
# Screen frame: flush current group and add frame
if current_group:
grouped.append(current_group)
current_group = None
grouped.append(seg)
else:
# Audio segment
speaker = seg.get('speaker')
if current_group is None:
# Start new group
current_group = {
'timestamp': seg['timestamp'],
'text': seg['text'],
'speaker': speaker,
'type': 'audio'
}
elif speaker == current_group.get('speaker'):
# Same speaker, append text
current_group['text'] += ' ' + seg['text']
else:
# Speaker changed, flush and start new group
grouped.append(current_group)
current_group = {
'timestamp': seg['timestamp'],
'text': seg['text'],
'speaker': speaker,
'type': 'audio'
}
# Don't forget last group
if current_group:
grouped.append(current_group)
return grouped

    def format_for_claude(
        self,
        merged_segments: List[Dict],
        format_style: str = "detailed"
    ) -> str:
        """
        Format merged transcript for Claude processing.

        Args:
            merged_segments: Merged transcript segments
            format_style: 'detailed' or 'compact'

        Returns:
            Formatted transcript string
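
        Example output lines (illustrative):

            detailed: "[00:05] SPEAKER_00:" followed by an indented text line
            compact:  "[00:05] SPEAKER_00: Let's get started"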
"""
if format_style == "detailed":
return self._format_detailed(merged_segments)
else:
return self._format_compact(merged_segments)

    def _format_detailed(self, segments: List[Dict]) -> str:
        """Format with clear visual separation between audio and screen content."""
        lines = []
        lines.append("=" * 80)
        lines.append("ENHANCED MEETING TRANSCRIPT")
        lines.append("Audio transcript + Screen frames")
        lines.append("=" * 80)
        lines.append("")

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            if seg['type'] == 'audio':
                # 'speaker' may be present but None when diarization is off,
                # so fall back with `or` rather than a .get() default
                speaker = seg.get('speaker') or 'SPEAKER'
                lines.append(f"[{timestamp}] {speaker}:")
                lines.append(f" {seg['text']}")
                lines.append("")
            else:  # screen
                lines.append(f"[{timestamp}] SCREEN CONTENT:")
                # Show frame path if available
                if 'frame_path' in seg:
                    # Use just the filename, relative to the enhanced transcript
                    frame_path = Path(seg['frame_path'])
                    relative_path = f"frames/{frame_path.name}"
                    lines.append(f" Frame: {relative_path}")
                # Include text content if available (fallback or additional context)
                if 'text' in seg and seg['text'].strip():
                    screen_text = seg['text'].replace('\n', '\n | ')
                    lines.append(" TEXT:")
                    lines.append(f" | {screen_text}")
                lines.append("")

        return "\n".join(lines)

    def _format_compact(self, segments: List[Dict]) -> str:
        """Compact format for shorter transcripts."""
        lines = []
        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            if seg['type'] == 'audio':
                # 'speaker' may be None, so fall back with `or`
                prefix = seg.get('speaker') or 'SPEAKER'
            else:
                prefix = "SCREEN"
            text = seg['text'].replace('\n', ' ')[:200]  # Truncate long screen text
            lines.append(f"[{timestamp}] {prefix}: {text}")
        return "\n".join(lines)

    def _format_timestamp(self, seconds: float) -> str:
        """Format timestamp as MM:SS."""
        minutes = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{minutes:02d}:{secs:02d}"

    def save_transcript(self, formatted_text: str, output_path: str):
        """
        Save formatted transcript to file.

        Args:
            formatted_text: Formatted transcript
            output_path: Output file path
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(formatted_text)
        logger.info(f"Saved enhanced transcript to: {output_path}")
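

# A minimal usage sketch (illustrative, not part of the module's API):
# the segment values and frame path below are hypothetical placeholders.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    merger = TranscriptMerger()

    audio = [
        {'timestamp': 0.0, 'text': "Let's get started", 'speaker': 'SPEAKER_00'},
        {'timestamp': 4.2, 'text': 'Here is the roadmap', 'speaker': 'SPEAKER_01'},
    ]
    screen = [
        {'timestamp': 3.0, 'text': 'Q3 Roadmap', 'frame_path': 'frames/frame_0003.jpg'},
    ]

    merged = merger.merge_transcripts(audio, screen)
    print(merger.format_for_claude(merged, format_style="compact"))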