init commit
This commit is contained in:
173
meetus/transcript_merger.py
Normal file
173
meetus/transcript_merger.py
Normal file
@@ -0,0 +1,173 @@
|
||||
"""
|
||||
Merge Whisper transcripts with OCR screen content.
|
||||
Creates a unified, timestamped transcript for Claude summarization.
|
||||
"""
|
||||
from typing import List, Dict, Optional
|
||||
import json
|
||||
from pathlib import Path
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TranscriptMerger:
    """Merge Whisper audio transcripts with screen OCR segments.

    Produces a single timestamp-ordered transcript and formats it as
    plain text suitable for Claude summarization.
    """

    def __init__(self):
        """Initialize transcript merger (stateless; nothing to set up)."""
        pass

    def load_whisper_transcript(self, transcript_path: str) -> List[Dict]:
        """
        Load a Whisper transcript from file.

        Supports both JSON format (with timestamps) and plain text.

        Args:
            transcript_path: Path to transcript file (.json or plain text)

        Returns:
            List of dicts with 'timestamp', 'text' and 'type' keys

        Raises:
            ValueError: If a .json file matches no known Whisper layout.
        """
        path = Path(transcript_path)

        if path.suffix == '.json':
            with open(path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            return self._parse_json_transcript(data, path)

        # Plain text file - no timestamps available.
        text = path.read_text(encoding='utf-8').strip()
        return [{
            'timestamp': 0,
            'text': text,
            'type': 'audio'
        }]

    def _parse_json_transcript(self, data, path: Path) -> List[Dict]:
        """Normalize the known Whisper JSON layouts into segment dicts."""
        if isinstance(data, dict) and 'segments' in data:
            # Standard Whisper JSON: {"text": ..., "segments": [...]}.
            segments = data['segments']
        elif isinstance(data, list):
            # Bare list of segment dicts.
            segments = data
        elif isinstance(data, dict) and 'text' in data:
            # Dict carrying only the full text: one untimed segment.
            return [{
                'timestamp': 0,
                'text': str(data['text']).strip(),
                'type': 'audio'
            }]
        else:
            # BUG FIX: the original fell through and implicitly returned
            # None here, crashing callers that expect a list. Fail loudly.
            raise ValueError(f"Unrecognized Whisper JSON format in {path}")

        return [
            {
                # Some Whisper variants use 'start', others 'timestamp'.
                'timestamp': seg.get('start', seg.get('timestamp', 0)),
                # .get() so a malformed segment yields '' not KeyError.
                'text': seg.get('text', '').strip(),
                'type': 'audio'
            }
            for seg in segments
        ]

    def merge_transcripts(
        self,
        audio_segments: List[Dict],
        screen_segments: List[Dict]
    ) -> List[Dict]:
        """
        Merge audio and screen transcripts by timestamp.

        Args:
            audio_segments: List of audio transcript segments
            screen_segments: List of screen OCR segments

        Returns:
            New merged list sorted by timestamp; inputs are not mutated.
        """
        # BUG FIX: the original tagged 'type' on the caller's dicts in
        # place; copy each segment so the input lists stay untouched.
        merged = [{**seg, 'type': 'audio'} for seg in audio_segments]
        merged += [{**seg, 'type': 'screen'} for seg in screen_segments]

        # .get() so untimed segments (plain-text loads) sort to the front
        # instead of raising KeyError. Stable sort keeps audio before
        # screen on timestamp ties, matching the original ordering.
        merged.sort(key=lambda seg: seg.get('timestamp', 0))
        return merged

    def format_for_claude(
        self,
        merged_segments: List[Dict],
        format_style: str = "detailed"
    ) -> str:
        """
        Format merged transcript for Claude processing.

        Args:
            merged_segments: Merged transcript segments
            format_style: 'detailed' or 'compact'

        Returns:
            Formatted transcript string
        """
        if format_style == "detailed":
            return self._format_detailed(merged_segments)
        return self._format_compact(merged_segments)

    def _format_detailed(self, segments: List[Dict]) -> str:
        """Format with clear visual separation between audio and screen content."""
        lines = []
        lines.append("=" * 80)
        lines.append("ENHANCED MEETING TRANSCRIPT")
        lines.append("Audio transcript + Screen content")
        lines.append("=" * 80)
        lines.append("")

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])

            if seg['type'] == 'audio':
                lines.append(f"[{timestamp}] SPEAKER:")
                lines.append(f"  {seg['text']}")
                lines.append("")
            else:  # screen
                lines.append(f"[{timestamp}] SCREEN CONTENT:")
                # Indent screen text with a gutter for visibility.
                screen_text = seg['text'].replace('\n', '\n  | ')
                lines.append(f"  | {screen_text}")
                lines.append("")

        return "\n".join(lines)

    def _format_compact(self, segments: List[Dict]) -> str:
        """Compact format for shorter transcripts."""
        lines = []

        for seg in segments:
            timestamp = self._format_timestamp(seg['timestamp'])
            prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN"
            # Flatten newlines and truncate long screen text.
            text = seg['text'].replace('\n', ' ')[:200]
            lines.append(f"[{timestamp}] {prefix}: {text}")

        return "\n".join(lines)

    def _format_timestamp(self, seconds: float) -> str:
        """Format timestamp as MM:SS."""
        minutes = int(seconds // 60)
        secs = int(seconds % 60)
        return f"{minutes:02d}:{secs:02d}"

    def save_transcript(self, formatted_text: str, output_path: str):
        """
        Save formatted transcript to file.

        Args:
            formatted_text: Formatted transcript
            output_path: Output file path
        """
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(formatted_text)

        logger.info(f"Saved enhanced transcript to: {output_path}")
Reference in New Issue
Block a user