286 lines
9.5 KiB
Python
286 lines
9.5 KiB
Python
"""
|
|
Merge Whisper transcripts with OCR screen content.
|
|
Creates a unified, timestamped transcript for Claude summarization.
|
|
"""
|
|
from typing import List, Dict, Optional
|
|
import json
|
|
from pathlib import Path
|
|
import logging
|
|
import base64
|
|
from io import BytesIO
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class TranscriptMerger:
    """Merge audio transcripts with screen OCR text.

    Combines Whisper audio segments and screen-capture OCR segments into a
    single timestamp-ordered list, then renders it as a text transcript
    suitable for Claude summarization (optionally with base64-embedded
    frame images).
    """

    def __init__(self, embed_images: bool = False, embed_quality: int = 80):
        """
        Initialize transcript merger.

        Args:
            embed_images: Whether to embed frame images as base64 in the
                detailed output format.
            embed_quality: JPEG quality for embedded images (0-100).
                NOTE(review): currently unused — _encode_image_base64 reads
                frames as-is (they are assumed pre-encoded at target
                quality). Kept for interface compatibility; confirm before
                removing.
        """
        self.embed_images = embed_images
        self.embed_quality = embed_quality

    def load_whisper_transcript(self, transcript_path: str, group_interval: Optional[int] = None) -> List[Dict]:
        """
        Load Whisper transcript from file.

        Supports both JSON format (with timestamps) and plain text.

        Args:
            transcript_path: Path to transcript file.
            group_interval: If specified, group audio segments into
                intervals of this many seconds.

        Returns:
            List of dicts with 'timestamp', 'text' and 'type' keys.
        """
        path = Path(transcript_path)

        # Case-insensitive suffix check so '.JSON' files are parsed as JSON
        # rather than silently treated as plain text.
        if path.suffix.lower() == '.json':
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Handle different Whisper output formats.
            segments = []
            if isinstance(data, dict) and 'segments' in data:
                # Standard Whisper JSON: {"segments": [{"start":..., "text":...}]}
                segments = [
                    {
                        'timestamp': seg.get('start', 0),
                        'text': seg['text'].strip(),
                        'type': 'audio'
                    }
                    for seg in data['segments']
                ]
            elif isinstance(data, list):
                # Bare list of segments; accept either 'start' or 'timestamp'.
                segments = [
                    {
                        'timestamp': seg.get('start', seg.get('timestamp', 0)),
                        'text': seg['text'].strip(),
                        'type': 'audio'
                    }
                    for seg in data
                ]

            # Group by interval if requested.
            if group_interval and segments:
                segments = self.group_audio_by_intervals(segments, group_interval)

            return segments

        else:
            # Plain text file - no timestamps available, so the whole text
            # lands in a single segment at t=0.
            with open(path, 'r', encoding='utf-8') as f:
                text = f.read().strip()

            return [{
                'timestamp': 0,
                'text': text,
                'type': 'audio'
            }]

    def group_audio_by_intervals(self, segments: List[Dict], interval_seconds: int = 30) -> List[Dict]:
        """
        Group audio segments into regular time intervals.

        Instead of word-level timestamps, this creates intervals (e.g. every
        30 seconds) with all text spoken during that interval concatenated
        together. Intervals containing no text are omitted.

        Args:
            segments: List of audio segments with timestamps.
            interval_seconds: Duration of each interval in seconds.

        Returns:
            List of grouped segments with interval-start timestamps.
        """
        if not segments:
            return []

        # Single pass: bucket each segment by its interval index. The
        # previous implementation re-scanned every segment for every
        # interval (O(intervals * segments)).
        buckets: Dict[int, List[str]] = {}
        for seg in segments:
            index = int(seg['timestamp'] // interval_seconds)
            # Negative timestamps never matched any interval before; keep
            # that behavior by dropping them.
            if index >= 0:
                buckets.setdefault(index, []).append(seg['text'])

        # Emit non-empty intervals in chronological order.
        intervals = [
            {
                'timestamp': index * interval_seconds,
                'text': ' '.join(texts),
                'type': 'audio'
            }
            for index, texts in sorted(buckets.items())
        ]

        logger.info(f"Grouped {len(segments)} segments into {len(intervals)} intervals of {interval_seconds}s")

        return intervals

    def _encode_image_base64(self, image_path: str) -> tuple[str, int]:
        """
        Encode image as base64 (image already at target quality/size).

        Args:
            image_path: Path to image file.

        Returns:
            Tuple of (base64_string, size_in_bytes). Returns ("", 0) on
            any read/encode failure (best-effort: callers skip the image).
        """
        try:
            # Read file directly (already at target quality/resolution).
            with open(image_path, 'rb') as f:
                img_bytes = f.read()

            # Encode to base64.
            b64_string = base64.b64encode(img_bytes).decode('utf-8')

            logger.debug(f"Encoded {Path(image_path).name}: {len(img_bytes)} bytes")

            return b64_string, len(img_bytes)

        except Exception as e:
            # Deliberate best-effort: a missing/corrupt frame should not
            # abort transcript generation.
            logger.error(f"Failed to encode image {image_path}: {e}")
            return "", 0

    def merge_transcripts(
        self,
        audio_segments: List[Dict],
        screen_segments: List[Dict]
    ) -> List[Dict]:
        """
        Merge audio and screen transcripts by timestamp.

        Note: mutates the input segment dicts in place by setting their
        'type' key ('audio' or 'screen').

        Args:
            audio_segments: List of audio transcript segments.
            screen_segments: List of screen OCR segments.

        Returns:
            Merged list sorted by timestamp (stable: audio before screen
            at equal timestamps, preserving input order).
        """
        # Mark segment types so formatters can tell the streams apart.
        for seg in audio_segments:
            seg['type'] = 'audio'
        for seg in screen_segments:
            seg['type'] = 'screen'

        # Combine and sort by timestamp (sort is stable).
        all_segments = audio_segments + screen_segments
        all_segments.sort(key=lambda x: x['timestamp'])

        return all_segments

    def format_for_claude(
        self,
        merged_segments: List[Dict],
        format_style: str = "detailed"
    ) -> str:
        """
        Format merged transcript for Claude processing.

        Args:
            merged_segments: Merged transcript segments.
            format_style: 'detailed' or 'compact' (anything other than
                'detailed' falls back to compact).

        Returns:
            Formatted transcript string.
        """
        if format_style == "detailed":
            return self._format_detailed(merged_segments)
        else:
            return self._format_compact(merged_segments)

    def _format_detailed(self, segments: List[Dict]) -> str:
        """Format with clear visual separation between audio and screen content."""
        lines = []
        lines.append("=" * 80)
        lines.append("ENHANCED MEETING TRANSCRIPT")
        if self.embed_images:
            lines.append("Audio transcript + Embedded frame images (base64)")
        else:
            lines.append("Audio transcript + Screen content")
        lines.append("=" * 80)
        lines.append("")

        total_image_bytes = 0
        embedded_count = 0  # count only frames that actually got embedded

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])

            if seg['type'] == 'audio':
                lines.append(f"[{timestamp}] SPEAKER:")
                lines.append(f"  {seg['text']}")
                lines.append("")

            else:  # screen
                lines.append(f"[{timestamp}] SCREEN CONTENT:")

                # Embed image if requested and a frame file is referenced.
                if self.embed_images and 'frame_path' in seg:
                    b64_img, img_size = self._encode_image_base64(seg['frame_path'])
                    total_image_bytes += img_size

                    if b64_img:
                        embedded_count += 1
                        lines.append(f"  IMAGE (base64, {img_size // 1024}KB):")
                        lines.append(f"  <image>data:image/jpeg;base64,{b64_img}</image>")
                        lines.append("")

                # Include text content if available (fallback or additional context).
                if 'text' in seg and seg['text'].strip():
                    screen_text = seg['text'].replace('\n', '\n  | ')
                    lines.append(f"  TEXT:")
                    lines.append(f"  | {screen_text}")

                lines.append("")

        if self.embed_images and total_image_bytes > 0:
            total_mb = total_image_bytes / (1024 * 1024)
            lines.append("")
            lines.append(f"Total embedded images size: {total_mb:.2f} MB")
            # Log the number of successfully embedded images, not the raw
            # screen-segment count (segments may lack frame_path or fail
            # to encode).
            logger.info(f"Embedded {embedded_count} images, total size: {total_mb:.2f} MB")

        return "\n".join(lines)

    def _format_compact(self, segments: List[Dict]) -> str:
        """Compact format for shorter transcripts."""
        lines = []

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN"
            text = seg['text'].replace('\n', ' ')[:200]  # Truncate long screen text
            lines.append(f"[{timestamp}] {prefix}: {text}")

        return "\n".join(lines)

    def _format_timestamp(self, seconds: float) -> str:
        """Format timestamp as MM:SS (minutes may exceed 59 for long meetings)."""
        minutes = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{minutes:02d}:{secs:02d}"

    def save_transcript(self, formatted_text: str, output_path: str):
        """
        Save formatted transcript to file.

        Args:
            formatted_text: Formatted transcript.
            output_path: Output file path.
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(formatted_text)

        logger.info(f"Saved enhanced transcript to: {output_path}")