embed images

This commit is contained in:
Mariano Gabriel
2025-10-28 08:02:45 -03:00
parent b1e1daf278
commit 118ef04223
12 changed files with 1016 additions and 61 deletions

View File

@@ -6,6 +6,8 @@ from typing import List, Dict, Optional
import json
from pathlib import Path
import logging
import base64
from io import BytesIO
logger = logging.getLogger(__name__)
@@ -13,9 +15,16 @@ logger = logging.getLogger(__name__)
class TranscriptMerger:
"""Merge audio transcripts with screen OCR text."""
def __init__(self):
"""Initialize transcript merger."""
pass
def __init__(self, embed_images: bool = False, embed_quality: int = 80):
"""
Initialize transcript merger.
Args:
embed_images: Whether to embed frame images as base64
embed_quality: JPEG quality for embedded images (0-100)
"""
self.embed_images = embed_images
self.embed_quality = embed_quality
def load_whisper_transcript(self, transcript_path: str, group_interval: Optional[int] = None) -> List[Dict]:
"""
@@ -120,6 +129,32 @@ class TranscriptMerger:
logger.info(f"Grouped {len(segments)} segments into {len(intervals)} intervals of {interval_seconds}s")
return intervals
def _encode_image_base64(self, image_path: str) -> tuple[str, int]:
    """
    Encode an image file as base64.

    The file's bytes are encoded exactly as stored on disk; no
    re-compression or resizing happens here (the image is expected to
    already be at its target quality/size).

    Args:
        image_path: Path to image file

    Returns:
        Tuple of (base64_string, size_in_bytes), where size_in_bytes is
        the size of the raw file, not of the base64 text. If the file
        cannot be read, the failure is logged and ("", 0) is returned.
    """
    # Only the file read is guarded: it is the one step expected to fail
    # at runtime (missing/unreadable file, invalid path value). Keeping
    # the try narrow stops unrelated bugs being reported as encode errors.
    try:
        img_bytes = Path(image_path).read_bytes()
    except (OSError, TypeError, ValueError) as e:
        logger.error("Failed to encode image %s: %s", image_path, e)
        return "", 0

    # Base64 output is pure ASCII, so decoding cannot fail.
    b64_string = base64.b64encode(img_bytes).decode('ascii')

    logger.debug("Encoded %s: %d bytes", Path(image_path).name, len(img_bytes))

    return b64_string, len(img_bytes)
def merge_transcripts(
self,
audio_segments: List[Dict],
@@ -172,10 +207,15 @@ class TranscriptMerger:
lines = []
lines.append("=" * 80)
lines.append("ENHANCED MEETING TRANSCRIPT")
lines.append("Audio transcript + Screen content")
if self.embed_images:
lines.append("Audio transcript + Embedded frame images (base64)")
else:
lines.append("Audio transcript + Screen content")
lines.append("=" * 80)
lines.append("")
total_image_bytes = 0
for seg in segments:
timestamp = self._format_timestamp(seg['timestamp'])
@@ -186,11 +226,31 @@ class TranscriptMerger:
else: # screen
lines.append(f"[{timestamp}] SCREEN CONTENT:")
# Indent screen text for visibility
screen_text = seg['text'].replace('\n', '\n | ')
lines.append(f" | {screen_text}")
# Embed image if requested
if self.embed_images and 'frame_path' in seg:
b64_img, img_size = self._encode_image_base64(seg['frame_path'])
total_image_bytes += img_size
if b64_img:
lines.append(f" IMAGE (base64, {img_size // 1024}KB):")
lines.append(f" <image>data:image/jpeg;base64,{b64_img}</image>")
lines.append("")
# Include text content if available (fallback or additional context)
if 'text' in seg and seg['text'].strip():
screen_text = seg['text'].replace('\n', '\n | ')
lines.append(f" TEXT:")
lines.append(f" | {screen_text}")
lines.append("")
if self.embed_images and total_image_bytes > 0:
total_mb = total_image_bytes / (1024 * 1024)
lines.append("")
lines.append(f"Total embedded images size: {total_mb:.2f} MB")
logger.info(f"Embedded {len([s for s in segments if s['type'] == 'screen'])} images, total size: {total_mb:.2f} MB")
return "\n".join(lines)
def _format_compact(self, segments: List[Dict]) -> str: