diff --git a/def/05-reference-frames-instead-of-embedding.md b/def/05-reference-frames-instead-of-embedding.md new file mode 100644 index 0000000..7169956 --- /dev/null +++ b/def/05-reference-frames-instead-of-embedding.md @@ -0,0 +1,124 @@ +# 05 - Reference Frame Files Instead of Embedding + +## Date +2025-10-28 + +## Context +Embedding base64 images made the enhanced transcript files very large (3.7MB for ~40 frames). This made them harder to work with and slower to process. + +## Problem +- Enhanced transcript with embedded base64 images was 3.7MB +- Large file size makes it slow to read/process +- Difficult to inspect individual frames +- Harder to share and version control + +## Solution: Reference Frame Paths +Instead of embedding base64 image data, reference the frame files by their relative paths. + +### Before (Embedded): +``` +[00:08] SCREEN CONTENT: + IMAGE (base64, 85KB): + data:image/jpeg;base64,/9j/4AAQSkZJRg... +``` +File size: 3.7MB + +### After (Referenced): +``` +[00:08] SCREEN CONTENT: + Frame: frames/zaca-run-scrapers_00257.jpg +``` +File size: ~50KB + +## Implementation + +**Directory Structure:** +``` +output/20251028-003-zaca-run-scrapers/ +├── frames/ +│ ├── zaca-run-scrapers_00257.jpg +│ ├── zaca-run-scrapers_00487.jpg +│ └── ... +├── zaca-run-scrapers.json (whisper transcript) +└── zaca-run-scrapers_enhanced.txt (references frames/ directory) +``` + +**Enhanced Transcript Format:** +``` +================================================================================ +ENHANCED MEETING TRANSCRIPT +Audio transcript + Screen frames +================================================================================ + +[00:30] SPEAKER: + Bueno, te dio un tour para el proyecto... + +[00:08] SCREEN CONTENT: + Frame: frames/zaca-run-scrapers_00257.jpg + +[01:00] SPEAKER: + Mayormente en Scrapping lo que tenemos... + +[01:15] SCREEN CONTENT: + Frame: frames/zaca-run-scrapers_00487.jpg + TEXT: + | Code snippet from screen (if OCR was used) +``` + +## Benefits + +✓ **Much smaller files**: ~50KB vs 3.7MB (74x smaller!) +✓ **Easier to inspect**: Can view individual frames directly +✓ **LLM can access images**: Frame paths allow LLM to load images on demand +✓ **Better version control**: Text files are small and diffable +✓ **Cleaner structure**: Frames organized in dedicated directory +✓ **Flexible**: Can still do OCR/vision analysis if needed (adds TEXT section) + +## Flags + +**`--embed-images`**: Skip OCR/vision analysis, just reference frame files +- Faster (no analysis needed) +- Lets LLM analyze raw images +- Enhanced transcript only contains frame references + +**Without `--embed-images`**: Run OCR/vision analysis +- Extracts text from frames +- Enhanced transcript includes both frame reference AND extracted text +- Useful for code/dashboard analysis + +## Usage + +```bash +# Reference frames only (no OCR, faster) +python process_meeting.py samples/video.mkv --run-whisper --embed-images --scene-detection -v + +# Reference frames + OCR text extraction +python process_meeting.py samples/video.mkv --run-whisper --use-hybrid --scene-detection -v + +# Adjust frame quality (smaller files) +python process_meeting.py samples/video.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection -v +``` + +## Files Modified + +- `meetus/transcript_merger.py` - Modified `_format_detailed()` to output frame paths instead of base64 +- `process_meeting.py` - Updated help text and examples to reflect frame referencing +- All processors (OCR, vision, hybrid) already include `frame_path` in results (no changes needed) + +## Workflow Example + +```bash +# First run: Generate everything +python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection -v + +# Result: +# - output/20251028-004-meeting/ +# - frames/ (40 frames, ~80KB each) +# - meeting.json (whisper transcript) +# - meeting_enhanced.txt (~50KB, references frames/) + +# LLM can now: +# 1. Read enhanced transcript +# 2. See timeline of audio + screen changes +# 3. Load individual frames as needed from frames/ directory +``` diff --git a/meetus/transcript_merger.py b/meetus/transcript_merger.py index 30b5b40..dda4ff3 100644 --- a/meetus/transcript_merger.py +++ b/meetus/transcript_merger.py @@ -45,14 +45,15 @@ class TranscriptMerger: with open(path, 'r', encoding='utf-8') as f: data = json.load(f) - # Handle different Whisper output formats + # Handle different Whisper/WhisperX output formats segments = [] if isinstance(data, dict) and 'segments' in data: - # Standard Whisper JSON format + # Standard Whisper/WhisperX JSON format segments = [ { 'timestamp': seg.get('start', 0), 'text': seg['text'].strip(), + 'speaker': seg.get('speaker'), # WhisperX diarization 'type': 'audio' } for seg in data['segments'] @@ -63,6 +64,7 @@ class TranscriptMerger: { 'timestamp': seg.get('start', seg.get('timestamp', 0)), 'text': seg['text'].strip(), + 'speaker': seg.get('speaker'), # WhisperX diarization 'type': 'audio' } for seg in data @@ -207,35 +209,28 @@ class TranscriptMerger: lines = [] lines.append("=" * 80) lines.append("ENHANCED MEETING TRANSCRIPT") - if self.embed_images: - lines.append("Audio transcript + Embedded frame images (base64)") - else: - lines.append("Audio transcript + Screen content") + lines.append("Audio transcript + Screen frames") lines.append("=" * 80) lines.append("") - total_image_bytes = 0 - for seg in segments: timestamp = self._format_timestamp(seg['timestamp']) if seg['type'] == 'audio': - lines.append(f"[{timestamp}] SPEAKER:") + speaker = seg.get('speaker', 'SPEAKER') + lines.append(f"[{timestamp}] {speaker}:") lines.append(f" {seg['text']}") lines.append("") else: # screen lines.append(f"[{timestamp}] SCREEN CONTENT:") - # Embed image if requested - if self.embed_images and 'frame_path' in seg: - b64_img, img_size = self._encode_image_base64(seg['frame_path']) - total_image_bytes += img_size - - if b64_img: - lines.append(f" IMAGE (base64, {img_size // 1024}KB):") - lines.append(f" data:image/jpeg;base64,{b64_img}") - lines.append("") + # Show frame path if available + if 'frame_path' in seg: + # Get just the filename relative to the enhanced transcript + frame_path = Path(seg['frame_path']) + relative_path = f"frames/{frame_path.name}" + lines.append(f" Frame: {relative_path}") # Include text content if available (fallback or additional context) if 'text' in seg and seg['text'].strip(): @@ -245,12 +240,6 @@ class TranscriptMerger: lines.append("") - if self.embed_images and total_image_bytes > 0: - total_mb = total_image_bytes / (1024 * 1024) - lines.append("") - lines.append(f"Total embedded images size: {total_mb:.2f} MB") - logger.info(f"Embedded {len([s for s in segments if s['type'] == 'screen'])} images, total size: {total_mb:.2f} MB") - return "\n".join(lines) def _format_compact(self, segments: List[Dict]) -> str: @@ -259,7 +248,10 @@ class TranscriptMerger: for seg in segments: timestamp = self._format_timestamp(seg['timestamp']) - prefix = "SPEAKER" if seg['type'] == 'audio' else "SCREEN" + if seg['type'] == 'audio': + prefix = seg.get('speaker', 'SPEAKER') + else: + prefix = "SCREEN" text = seg['text'].replace('\n', ' ')[:200] # Truncate long screen text lines.append(f"[{timestamp}] {prefix}: {text}") diff --git a/meetus/workflow.py b/meetus/workflow.py index b1e5f46..3c2b0be 100644 --- a/meetus/workflow.py +++ b/meetus/workflow.py @@ -184,10 +184,10 @@ class ProcessingWorkflow: logger.info("STEP 0: Running Whisper Transcription") logger.info("=" * 80) - # Check if whisper is installed - if not shutil.which("whisper"): - logger.error("Whisper is not installed. Install it with: pip install openai-whisper") - raise RuntimeError("Whisper not installed") + # Check if whisperx is installed + if not shutil.which("whisperx"): + logger.error("WhisperX is not installed. Install it with: pip install whisperx") + raise RuntimeError("WhisperX not installed") # Unload Ollama model to free GPU memory for Whisper (if using vision) if self.config.use_vision: @@ -199,16 +199,17 @@ class ProcessingWorkflow: except Exception as e: logger.warning(f"Could not unload Ollama model: {e}") - logger.info(f"Running Whisper transcription (model: {self.config.whisper_model})...") + logger.info(f"Running WhisperX transcription with diarization (model: {self.config.whisper_model})...") logger.info("This may take a few minutes depending on video length...") - # Run whisper command + # Run whisperx command with diarization cmd = [ - "whisper", + "whisperx", str(self.config.video_path), "--model", self.config.whisper_model, "--output_format", "json", - "--output_dir", str(self.output_mgr.output_dir) + "--output_dir", str(self.output_mgr.output_dir), + "--diarize", ] try: diff --git a/process_meeting.py b/process_meeting.py index 5b9d499..1e9d526 100644 --- a/process_meeting.py +++ b/process_meeting.py @@ -32,13 +32,13 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Embed images for LLM analysis (recommended - let LLM analyze actual frames) + # Reference frames for LLM analysis (recommended - transcript includes frame paths) python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection - # Embed with custom quality (lower = smaller file size) + # Adjust frame extraction quality (lower = smaller files) python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection - # Hybrid approach: OpenCV + OCR (extracts text, no images) + # Hybrid approach: OpenCV + OCR (extracts text from frames) python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --scene-detection # Hybrid + LLM cleanup (best for code formatting) @@ -183,12 +183,12 @@ Examples: parser.add_argument( '--embed-images', action='store_true', - help='Embed frame images (as base64) in enhanced transcript for LLM analysis' + help='Skip OCR/vision analysis and reference frame files directly (faster, lets LLM analyze images)' ) parser.add_argument( '--embed-quality', type=int, - help='JPEG quality for embedded images (default: 80, lower = smaller file)', + help='JPEG quality for extracted frames (default: 80, lower = smaller files)', default=80 )