From 118ef04223e257f5ac77a611f7f91265365d3887 Mon Sep 17 00:00:00 2001 From: Mariano Gabriel Date: Tue, 28 Oct 2025 08:02:45 -0300 Subject: [PATCH] embed images --- def/02-hybrid-opencv-ocr-llm.md | 111 +++++++++ def/03-embed-images-for-llm.md | 100 ++++++++ def/04-fix-whisper-cache-loading.md | 78 ++++++ meetus/cache_manager.py | 17 ++ meetus/frame_extractor.py | 28 ++- meetus/hybrid_processor.py | 355 ++++++++++++++++++++++++++++ meetus/ocr_processor.py | 54 ++++- meetus/output_manager.py | 28 ++- meetus/transcript_merger.py | 74 +++++- meetus/vision_processor.py | 9 +- meetus/workflow.py | 174 ++++++++++++-- process_meeting.py | 49 +++- 12 files changed, 1016 insertions(+), 61 deletions(-) create mode 100644 def/02-hybrid-opencv-ocr-llm.md create mode 100644 def/03-embed-images-for-llm.md create mode 100644 def/04-fix-whisper-cache-loading.md create mode 100644 meetus/hybrid_processor.py diff --git a/def/02-hybrid-opencv-ocr-llm.md b/def/02-hybrid-opencv-ocr-llm.md new file mode 100644 index 0000000..5fd75d9 --- /dev/null +++ b/def/02-hybrid-opencv-ocr-llm.md @@ -0,0 +1,111 @@ +# 02 - Hybrid OpenCV + OCR + LLM Approach + +## Date +2025-10-28 + +## Context +Vision models (llava) were hallucinating text content badly - showing HTML code when there was none, inventing text that didn't exist. Pure OCR was fast and accurate but lost code formatting and structure. 
+ +## Problem +- **Vision models**: Hallucinate text content, can't be trusted for accurate extraction +- **Pure OCR**: Accurate text but messy output, lost indentation/formatting +- **Need**: Accurate text extraction + preserved code structure + +## Solution: Three-Stage Hybrid Approach + +### Stage 1: OpenCV Text Detection +Use morphological operations to find text regions: +- Adaptive thresholding (handles varying lighting) +- Dilation with horizontal kernel to connect text lines +- Contour detection to find bounding boxes +- Filter by area and minimum width/height +- Merge overlapping regions + +### Stage 2: Region-Based OCR +- Sort regions by reading order (top-to-bottom, left-to-right) +- Crop each region from original image +- Run OCR on cropped regions (more accurate than full frame) +- Tesseract with PSM 6 mode to preserve layout +- Preserve indentation in cleaning step + +### Stage 3: Optional LLM Cleanup +- Take accurate OCR output (no hallucination) +- Use lightweight LLM (llama3.2:3b for speed) to: + - Fix obvious OCR errors (l→1, O→0) + - Restore code indentation and structure + - Preserve exact text content + - No added explanations or hallucinated content + +## Benefits +✓ **Accurate**: OCR reads actual pixels, no hallucination +✓ **Fast**: OpenCV detection is instant, focused OCR is quick +✓ **Structured**: Regions separated with headers showing position +✓ **Formatted**: Optional LLM cleanup preserves/restores code structure +✓ **Deterministic**: Same input = same output (unlike vision models) + +## Implementation + +**New file:** `meetus/hybrid_processor.py` +- `HybridProcessor` class with OpenCV detection + OCR + optional LLM +- Region sorting for proper reading order +- Visual separators between regions + +**CLI flags:** +```bash +--use-hybrid # Enable hybrid mode +--hybrid-llm-cleanup # Add LLM post-processing (optional) +--hybrid-llm-model MODEL # LLM model (default: llama3.2:3b) +``` + +**OCR improvements:** +- Tesseract PSM 6 mode for better layout 
preservation +- Modified text cleaning to keep indentation +- `preserve_layout` parameter + +## Usage + +```bash +# Basic hybrid (OpenCV + OCR) +python process_meeting.py samples/video.mkv --use-hybrid --scene-detection + +# With LLM cleanup for best code formatting +python process_meeting.py samples/video.mkv --use-hybrid --hybrid-llm-cleanup --scene-detection -v + +# Iterate on threshold +python process_meeting.py samples/video.mkv --use-hybrid --scene-detection --scene-threshold 5 --skip-cache-frames --skip-cache-analysis +``` + +## Output Format + +``` +[Region 1 at y=120] +function calculateTotal(items) { + return items.reduce((sum, item) => sum + item.price, 0); +} + +============================================================ + +[Region 2 at y=450] +const result = calculateTotal(cartItems); +console.log('Total:', result); +``` + +## Performance +- **Without LLM cleanup**: Very fast (~2-3s per frame) +- **With LLM cleanup**: Slower but still faster than vision models (~5-8s per frame) +- **Accuracy**: Much better than vision model hallucinations + +## When to Use What + +| Method | Best For | Pros | Cons | +|--------|----------|------|------| +| **Hybrid** | Code/terminal text extraction | Accurate, fast, no hallucination | Formatting may be messy | +| **Hybrid + LLM** | Code with preserved structure | Accurate + formatted | Slower, needs Ollama | +| **Vision** | Understanding layout/context | Semantic understanding | Hallucinates text | +| **Pure OCR** | Simple text, no structure needed | Fast, simple | Full-frame, no region detection | + +## Files Modified +- `meetus/hybrid_processor.py` - New hybrid processor +- `meetus/ocr_processor.py` - Layout preservation +- `meetus/workflow.py` - Hybrid mode integration +- `process_meeting.py` - CLI flags and examples diff --git a/def/03-embed-images-for-llm.md b/def/03-embed-images-for-llm.md new file mode 100644 index 0000000..a22c273 --- /dev/null +++ b/def/03-embed-images-for-llm.md @@ -0,0 +1,100 @@ +# 03 - 
Embed Images for LLM Analysis + +## Date +2025-10-28 + +## Context +Hybrid OCR approach was fast and accurate but formatting was messy. Vision models hallucinated text. Rather than fighting with text extraction, a better approach is to embed the actual frame images in the enhanced transcript and let the end-user's LLM analyze them with full audio context. + +## Problem +- OCR/vision models either hallucinate or produce messy text +- Code formatting/indentation is hard to preserve +- User wants to analyze frames with their own LLM (Claude, GPT, etc.) +- Need to keep file size reasonable (~200KB per image is too big) + +## Solution: Image Embedding + +Instead of extracting text, embed the actual frame images as base64 in the enhanced transcript. The LLM can then: +- See the actual screen content (no hallucination) +- Understand code structure, layout, and formatting visually +- Have full audio transcript context for each frame +- Analyze dashboards, terminals, editors with perfect accuracy + +## Implementation + +**Quality Optimization:** +- Default JPEG quality: 80 (good tradeoff between size and readability) +- Configurable via `--embed-quality` (0-100) +- Typical sizes at quality 80: ~40-80KB per image (vs 200KB original) + +**Format:** +``` +[MM:SS] SPEAKER: + Audio transcript text here + +[MM:SS] SCREEN CONTENT: + IMAGE (base64, 52KB): + data:image/jpeg;base64,/9j/4AAQSkZJRg... 
+ + TEXT: + | Optional OCR text for reference +``` + +**Features:** +- Base64 encoding for easy embedding +- Size tracking and reporting +- Optional text content alongside images +- Works with scene detection for smart frame selection + +## Usage + +```bash +# Basic: Embed images at quality 80 (default) +python process_meeting.py samples/video.mkv --run-whisper --embed-images --scene-detection --no-cache -v + +# Lower quality for smaller files (still readable) +python process_meeting.py samples/video.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection --no-cache -v + +# Higher quality for detailed code +python process_meeting.py samples/video.mkv --run-whisper --embed-images --embed-quality 90 --scene-detection --no-cache -v + +# Iterate on scene threshold (reuse whisper) +python process_meeting.py samples/video.mkv --embed-images --scene-detection --scene-threshold 5 --skip-cache-frames --skip-cache-analysis -v +``` + +## File Sizes + +**Example for 20 frames:** +- Quality 60: ~30-50KB per image = 0.6-1MB total +- Quality 80: ~40-80KB per image = 0.8-1.6MB total (recommended) +- Quality 90: ~80-120KB per image = 1.6-2.4MB total +- Original: ~200KB per image = 4MB total + +## Benefits + +✓ **No hallucination**: LLM sees actual pixels +✓ **Perfect formatting**: Code structure preserved visually +✓ **Full context**: Audio transcript + visual frame together +✓ **User's choice**: Use your preferred LLM (Claude, GPT, etc.) 
+✓ **Reasonable size**: Quality 80 gives roughly 2.5-5x smaller files vs original +✓ **Simple workflow**: One file contains everything + +## Use Cases + +**Code walkthroughs:** LLM can see actual code structure and indentation +**Dashboard analysis:** Charts, graphs, metrics visible to LLM +**Terminal sessions:** Commands and output in proper context +**UI reviews:** Actual interface visible with audio commentary + +## Files Modified + +- `meetus/transcript_merger.py` - Image encoding and embedding +- `meetus/workflow.py` - Wire through config +- `process_meeting.py` - CLI flags +- `meetus/output_manager.py` - Cleaner directory naming (date + increment) + +## Output Directory Naming + +Also changed output directory format for clarity: +- Old: `20251028_054553-video` (confusing timestamps) +- New: `20251028-001-video` (clear date + run number) diff --git a/def/04-fix-whisper-cache-loading.md b/def/04-fix-whisper-cache-loading.md new file mode 100644 index 0000000..8900267 --- /dev/null +++ b/def/04-fix-whisper-cache-loading.md @@ -0,0 +1,78 @@ +# 04 - Fix Whisper Cache Loading + +## Date +2025-10-28 + +## Problem +Enhanced transcript was not including the audio segments from cached whisper transcripts when running without the `--run-whisper` flag. + +Example command that failed: +```bash +python process_meeting.py samples/zaca-run-scrapers.mkv --embed-images --scene-detection --scene-threshold 10 --skip-cache-frames -v +``` + +Result: Enhanced transcript only contained embedded images, no audio segments (0 "SPEAKER" entries). 
+ +## Root Cause +In `workflow.py`, the `_run_whisper()` method was checking the `run_whisper` flag **before** checking the cache: + +```python +def _run_whisper(self) -> Optional[str]: + if not self.config.run_whisper: + return self.config.transcript_path # Returns None if --transcript not specified + + # Cache check NEVER REACHED if run_whisper is False + cached = self.cache_mgr.get_whisper_cache() + if cached: + return str(cached) +``` + +This meant: +- User runs command without `--run-whisper` +- Method returns None immediately +- Cached whisper transcript is never discovered +- No audio segments in enhanced output + +## Solution +Reorder the logic to check cache **first**, regardless of flags: + +```python +def _run_whisper(self) -> Optional[str]: + """Run Whisper transcription if requested, or use cached/provided transcript.""" + # First, check cache (regardless of run_whisper flag) + cached = self.cache_mgr.get_whisper_cache() + if cached: + return str(cached) + + # If no cache and not running whisper, use provided transcript path (if any) + if not self.config.run_whisper: + return self.config.transcript_path + + # If no cache and run_whisper is True, run whisper transcription + # ... rest of whisper code +``` + +## New Behavior +1. Cache is checked first (regardless of `--run-whisper` flag) +2. If cached whisper exists, use it +3. If no cache and `--run-whisper` not specified, use `--transcript` path (or None) +4. 
If no cache and `--run-whisper` specified, run whisper + +## Benefits +✓ Cached whisper transcripts are always discovered and used +✓ User can iterate on frame extraction/analysis without re-running whisper +✓ Enhanced transcripts now properly include both audio + visual content +✓ Granular cache flags (`--skip-cache-frames`, `--skip-cache-whisper`) work as expected + +## Use Case +```bash +# First run: Generate whisper transcript + extract frames +python process_meeting.py samples/video.mkv --run-whisper --embed-images --scene-detection -v + +# Second run: Iterate on scene threshold without re-running whisper +python process_meeting.py samples/video.mkv --embed-images --scene-detection --scene-threshold 10 --skip-cache-frames -v +# Now correctly includes cached whisper transcript in enhanced output! +``` + +## Files Modified +- `meetus/workflow.py` - Reordered logic in `_run_whisper()` method (lines 172-181) diff --git a/meetus/cache_manager.py b/meetus/cache_manager.py index 85c2c99..0f5cfb6 100644 --- a/meetus/cache_manager.py +++ b/meetus/cache_manager.py @@ -48,6 +48,17 @@ class CacheManager: cache_path = self.output_dir / f"{self.video_name}.json" if cache_path.exists(): logger.info(f"✓ Found cached Whisper transcript: {cache_path.name}") + + # Debug: Show cached transcript info + try: + import json + with open(cache_path, 'r', encoding='utf-8') as f: + data = json.load(f) + if 'segments' in data: + logger.debug(f"Cached transcript has {len(data['segments'])} segments") + except Exception as e: + logger.debug(f"Could not parse cached whisper for debug: {e}") + return cache_path return None @@ -68,6 +79,7 @@ class CacheManager: return None logger.info(f"✓ Found {len(existing_frames)} cached frames in {self.frames_dir.name}/") + logger.debug(f"Frame filenames: {[f.name for f in sorted(existing_frames)[:3]]}...") # Build frames_info from existing files frames_info = [] @@ -102,6 +114,11 @@ class CacheManager: with open(cache_path, 'r', encoding='utf-8') as f: 
results = json.load(f) logger.info(f"✓ Loaded {len(results)} analyzed frames from cache") + + # Debug: Show first cached result + if results: + logger.debug(f"First cached result: timestamp={results[0].get('timestamp')}, text_length={len(results[0].get('text', ''))}") + return results return None diff --git a/meetus/frame_extractor.py b/meetus/frame_extractor.py index 6b71676..75fa66c 100644 --- a/meetus/frame_extractor.py +++ b/meetus/frame_extractor.py @@ -16,17 +16,19 @@ logger = logging.getLogger(__name__) class FrameExtractor: """Extract frames from video files.""" - def __init__(self, video_path: str, output_dir: str = "frames"): + def __init__(self, video_path: str, output_dir: str = "frames", quality: int = 75): """ Initialize frame extractor. Args: video_path: Path to video file output_dir: Directory to save extracted frames + quality: JPEG quality for saved frames (0-100) """ self.video_path = video_path self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) + self.quality = quality def extract_by_interval(self, interval_seconds: int = 5) -> List[Tuple[str, float]]: """ @@ -56,8 +58,16 @@ class FrameExtractor: frame_filename = f"frame_{saved_count:05d}_{timestamp:.2f}s.jpg" frame_path = self.output_dir / frame_filename - # Use high quality for text readability (95 = high quality JPEG) - cv2.imwrite(str(frame_path), frame, [cv2.IMWRITE_JPEG_QUALITY, 95]) + # Downscale to 1600px width for smaller file size (but still readable) + height, width = frame.shape[:2] + if width > 1600: + ratio = 1600 / width + new_width = 1600 + new_height = int(height * ratio) + frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_LANCZOS4) + + # Save with configured quality (matches embed quality) + cv2.imwrite(str(frame_path), frame, [cv2.IMWRITE_JPEG_QUALITY, self.quality]) frames_info.append((str(frame_path), timestamp)) saved_count += 1 @@ -90,16 +100,24 @@ class FrameExtractor: output_pattern = self.output_dir / 
f"{video_name}_%05d.jpg" try: - # Use FFmpeg's scene detection filter with high quality output + # Use FFmpeg's scene detection filter with downscaling stream = ffmpeg.input(self.video_path) stream = ffmpeg.filter(stream, 'select', f'gt(scene,{threshold/100})') stream = ffmpeg.filter(stream, 'showinfo') + # Scale to 1600px width (maintains aspect ratio, still readable) + # Use simple conditional: if width > 1600, scale to 1600, else keep original + stream = ffmpeg.filter(stream, 'scale', w='min(1600,iw)', h=-1) + + # Convert JPEG quality (0-100) to FFmpeg qscale (2-31, lower=better) + # Rough mapping: qscale ≈ (100 - quality) / 10, clamped to 2-31 + qscale = max(2, min(31, int((100 - self.quality) / 10 + 2))) + stream = ffmpeg.output( stream, str(output_pattern), vsync='vfr', frame_pts=1, - **{'q:v': '2'} # High quality JPEG + **{'q:v': str(qscale)} # Matches configured quality ) # Run with stderr capture to get showinfo output diff --git a/meetus/hybrid_processor.py b/meetus/hybrid_processor.py new file mode 100644 index 0000000..21932b3 --- /dev/null +++ b/meetus/hybrid_processor.py @@ -0,0 +1,355 @@ +""" +Hybrid frame analysis: OpenCV text detection + OCR for accurate extraction. +Better than pure vision models which tend to hallucinate text content. +""" +from typing import List, Tuple, Dict, Optional +from pathlib import Path +import logging +import cv2 +import numpy as np +from difflib import SequenceMatcher + +logger = logging.getLogger(__name__) + + +class HybridProcessor: + """Combine OpenCV text detection with OCR for accurate text extraction.""" + + def __init__(self, ocr_engine: str = "tesseract", min_confidence: float = 0.5, + use_llm_cleanup: bool = False, llm_model: Optional[str] = None): + """ + Initialize hybrid processor. 
+ + Args: + ocr_engine: OCR engine to use ('tesseract', 'easyocr', 'paddleocr') + min_confidence: Minimum confidence for text detection (0-1) + use_llm_cleanup: Use LLM to clean up OCR output and preserve formatting + llm_model: Ollama model for cleanup (default: llama3.2:3b for speed) + """ + from .ocr_processor import OCRProcessor + + self.ocr = OCRProcessor(engine=ocr_engine) + self.min_confidence = min_confidence + self.use_llm_cleanup = use_llm_cleanup + self.llm_model = llm_model or "llama3.2:3b" + self._llm_client = None + + if use_llm_cleanup: + self._init_llm() + + def _init_llm(self): + """Initialize Ollama client for LLM cleanup.""" + try: + import ollama + self._llm_client = ollama + logger.info(f"LLM cleanup enabled using {self.llm_model}") + except ImportError: + logger.warning("ollama package not installed. LLM cleanup disabled.") + self.use_llm_cleanup = False + + def _cleanup_with_llm(self, raw_text: str) -> str: + """ + Use LLM to clean up OCR output and preserve code formatting. + + Args: + raw_text: Raw OCR output + + Returns: + Cleaned up text with proper formatting + """ + if not self.use_llm_cleanup or not self._llm_client: + return raw_text + + prompt = """You are cleaning up OCR output from a code editor screenshot. + +Your task: +1. Fix any obvious OCR errors (l→1, O→0, etc.) +2. Preserve or restore code indentation and structure +3. Keep the exact text content - don't add explanations or comments +4. If it's code, maintain proper spacing and formatting +5. 
Return ONLY the cleaned text, nothing else + +OCR Text: +""" + + try: + response = self._llm_client.generate( + model=self.llm_model, + prompt=prompt + raw_text, + options={"temperature": 0.1} # Low temperature for accuracy + ) + cleaned = response['response'].strip() + logger.debug(f"LLM cleanup: {len(raw_text)} → {len(cleaned)} chars") + return cleaned + except Exception as e: + logger.warning(f"LLM cleanup failed: {e}, using raw OCR output") + return raw_text + + def detect_text_regions(self, image_path: str, min_area: int = 100) -> List[Tuple[int, int, int, int]]: + """ + Detect text regions in image using OpenCV. + + Args: + image_path: Path to image file + min_area: Minimum area for text region (pixels) + + Returns: + List of bounding boxes (x, y, w, h) + """ + # Read image + img = cv2.imread(image_path) + if img is None: + logger.warning(f"Could not read image: {image_path}") + return [] + + gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) + + # Method 1: Morphological operations to find text regions + # Works well for solid text blocks + regions = self._detect_by_morphology(gray, min_area) + + if not regions: + logger.debug(f"No text regions detected in {Path(image_path).name}") + + return regions + + def _detect_by_morphology(self, gray: np.ndarray, min_area: int) -> List[Tuple[int, int, int, int]]: + """ + Detect text regions using morphological operations. + Fast and works well for solid text blocks (code editors, terminals). 
+ + Args: + gray: Grayscale image + min_area: Minimum area for region + + Returns: + List of bounding boxes (x, y, w, h) + """ + # Apply adaptive threshold to handle varying lighting + binary = cv2.adaptiveThreshold( + gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, + cv2.THRESH_BINARY_INV, 11, 2 + ) + + # Morphological operations to connect text regions + kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 3)) # Horizontal kernel for text lines + dilated = cv2.dilate(binary, kernel, iterations=2) + + # Find contours + contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + # Filter and extract bounding boxes + regions = [] + for contour in contours: + x, y, w, h = cv2.boundingRect(contour) + area = w * h + + # Filter by area and aspect ratio + if area > min_area and w > 20 and h > 10: # Reasonable text dimensions + regions.append((x, y, w, h)) + + # Merge overlapping regions + regions = self._merge_overlapping_regions(regions) + + logger.debug(f"Detected {len(regions)} text regions using morphology") + return regions + + def _merge_overlapping_regions( + self, regions: List[Tuple[int, int, int, int]], + overlap_threshold: float = 0.3 + ) -> List[Tuple[int, int, int, int]]: + """ + Merge overlapping bounding boxes. 
+ + Args: + regions: List of (x, y, w, h) tuples + overlap_threshold: Minimum overlap ratio to merge + + Returns: + Merged regions + """ + if not regions: + return [] + + # Sort by y-coordinate (top to bottom) + regions = sorted(regions, key=lambda r: r[1]) + + merged = [] + current = list(regions[0]) + + for region in regions[1:]: + x, y, w, h = region + cx, cy, cw, ch = current + + # Check for overlap + x_overlap = max(0, min(cx + cw, x + w) - max(cx, x)) + y_overlap = max(0, min(cy + ch, y + h) - max(cy, y)) + overlap_area = x_overlap * y_overlap + + current_area = cw * ch + region_area = w * h + min_area = min(current_area, region_area) + + if overlap_area / min_area > overlap_threshold: + # Merge regions + new_x = min(cx, x) + new_y = min(cy, y) + new_x2 = max(cx + cw, x + w) + new_y2 = max(cy + ch, y + h) + current = [new_x, new_y, new_x2 - new_x, new_y2 - new_y] + else: + merged.append(tuple(current)) + current = list(region) + + merged.append(tuple(current)) + return merged + + def extract_text_from_region(self, image_path: str, region: Tuple[int, int, int, int]) -> str: + """ + Extract text from a specific region using OCR. + + Args: + image_path: Path to image file + region: Bounding box (x, y, w, h) + + Returns: + Extracted text + """ + from PIL import Image + + # Load image and crop region + img = Image.open(image_path) + x, y, w, h = region + cropped = img.crop((x, y, x + w, y + h)) + + # Save to temp file for OCR (or use in-memory) + import tempfile + with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp: + cropped.save(tmp.name) + text = self.ocr.extract_text(tmp.name) + + # Clean up temp file + Path(tmp.name).unlink() + + return text + + def analyze_frame(self, image_path: str) -> str: + """ + Analyze a frame: detect text regions and OCR them. 
+ + Args: + image_path: Path to image file + + Returns: + Combined text from all detected regions + """ + # Detect text regions + regions = self.detect_text_regions(image_path) + + if not regions: + # Fallback to full-frame OCR if no regions detected + logger.debug(f"No regions detected, using full-frame OCR for {Path(image_path).name}") + raw_text = self.ocr.extract_text(image_path) + return self._cleanup_with_llm(raw_text) if self.use_llm_cleanup else raw_text + + # Sort regions by reading order (top-to-bottom, left-to-right) + regions = self._sort_regions_by_reading_order(regions) + + # Extract text from each region + texts = [] + for idx, region in enumerate(regions): + x, y, w, h = region + text = self.extract_text_from_region(image_path, region) + if text.strip(): + # Add visual separator with region info + section_header = f"[Region {idx+1} at y={y}]" + texts.append(f"{section_header}\n{text.strip()}") + logger.debug(f"Region {idx+1}/{len(regions)} (y={y}): Extracted {len(text)} chars") + + combined = ("\n\n" + "="*60 + "\n\n").join(texts) + logger.debug(f"Total extracted from {len(regions)} regions: {len(combined)} chars") + + # Apply LLM cleanup if enabled + if self.use_llm_cleanup: + combined = self._cleanup_with_llm(combined) + + return combined + + def _sort_regions_by_reading_order(self, regions: List[Tuple[int, int, int, int]]) -> List[Tuple[int, int, int, int]]: + """ + Sort regions in reading order (top-to-bottom, left-to-right). + + Args: + regions: List of (x, y, w, h) tuples + + Returns: + Sorted regions + """ + # Sort primarily by y (top to bottom), secondarily by x (left to right) + # Group regions that are on roughly the same line (within 20px) + sorted_regions = sorted(regions, key=lambda r: (r[1] // 20, r[0])) + return sorted_regions + + def process_frames( + self, + frames_info: List[Tuple[str, float]], + deduplicate: bool = True, + similarity_threshold: float = 0.85 + ) -> List[Dict]: + """ + Process multiple frames with hybrid analysis. 
+ + Args: + frames_info: List of (frame_path, timestamp) tuples + deduplicate: Whether to remove similar consecutive analyses + similarity_threshold: Threshold for considering analyses as duplicates (0-1) + + Returns: + List of dicts with 'timestamp', 'text', and 'frame_path' + """ + results = [] + prev_text = "" + + total = len(frames_info) + logger.info(f"Starting hybrid analysis of {total} frames...") + + for idx, (frame_path, timestamp) in enumerate(frames_info, 1): + logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...") + + text = self.analyze_frame(frame_path) + + if not text: + logger.warning(f"No content extracted from frame at {timestamp:.2f}s") + continue + + # Debug: Show what was extracted + logger.debug(f"Frame {idx} ({timestamp:.2f}s): Extracted {len(text)} chars") + logger.debug(f"Content preview: {text[:150]}{'...' if len(text) > 150 else ''}") + + # Deduplicate similar consecutive frames + if deduplicate and prev_text: + similarity = self._text_similarity(prev_text, text) + logger.debug(f"Similarity to previous frame: {similarity:.2f} (threshold: {similarity_threshold})") + if similarity > similarity_threshold: + logger.debug(f"⊘ Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})") + continue + + results.append({ + 'timestamp': timestamp, + 'text': text, + 'frame_path': frame_path + }) + + prev_text = text + + logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})") + return results + + def _text_similarity(self, text1: str, text2: str) -> float: + """ + Calculate similarity between two texts. 
+ + Returns: + Similarity score between 0 and 1 + """ + return SequenceMatcher(None, text1, text2).ratio() diff --git a/meetus/ocr_processor.py b/meetus/ocr_processor.py index 6844f28..36606d7 100644 --- a/meetus/ocr_processor.py +++ b/meetus/ocr_processor.py @@ -53,20 +53,25 @@ class OCRProcessor: else: raise ValueError(f"Unknown OCR engine: {self.engine}") - def extract_text(self, image_path: str) -> str: + def extract_text(self, image_path: str, preserve_layout: bool = True) -> str: """ Extract text from a single image. Args: image_path: Path to image file + preserve_layout: Try to preserve whitespace and layout Returns: Extracted text """ if self.engine == "tesseract": from PIL import Image + import pytesseract image = Image.open(image_path) - text = self._ocr_engine.image_to_string(image) + + # Use PSM 6 (uniform block of text) to preserve layout better + config = '--psm 6' if preserve_layout else '' + text = pytesseract.image_to_string(image, config=config) elif self.engine == "easyocr": result = self._ocr_engine.readtext(image_path, detail=0) @@ -81,12 +86,31 @@ class OCRProcessor: return self._clean_text(text) - def _clean_text(self, text: str) -> str: - """Clean up OCR output.""" - # Remove excessive whitespace - text = re.sub(r'\n\s*\n', '\n', text) - text = re.sub(r' +', ' ', text) - return text.strip() + def _clean_text(self, text: str, preserve_indentation: bool = True) -> str: + """ + Clean up OCR output. 
+ + Args: + text: Raw OCR text + preserve_indentation: Keep leading whitespace on lines + + Returns: + Cleaned text + """ + if preserve_indentation: + # Remove excessive blank lines but preserve indentation + lines = text.split('\n') + cleaned_lines = [] + for line in lines: + # Keep line if it has content or is single empty line + if line.strip() or (cleaned_lines and cleaned_lines[-1].strip()): + cleaned_lines.append(line) + return '\n'.join(cleaned_lines).strip() + else: + # Original aggressive cleaning + text = re.sub(r'\n\s*\n', '\n', text) + text = re.sub(r' +', ' ', text) + return text.strip() def process_frames( self, @@ -108,18 +132,24 @@ class OCRProcessor: results = [] prev_text = "" - for frame_path, timestamp in frames_info: - logger.debug(f"Processing frame at {timestamp:.2f}s...") + for idx, (frame_path, timestamp) in enumerate(frames_info, 1): + logger.debug(f"Processing frame {idx}/{len(frames_info)} at {timestamp:.2f}s...") text = self.extract_text(frame_path) if not text: + logger.debug(f"No text extracted from frame at {timestamp:.2f}s") continue + # Debug: Show what was extracted + logger.debug(f"Frame {idx} ({timestamp:.2f}s): Extracted {len(text)} chars") + logger.debug(f"Content preview: {text[:150]}{'...' 
if len(text) > 150 else ''}") + # Deduplicate similar consecutive frames - if deduplicate: + if deduplicate and prev_text: similarity = self._text_similarity(prev_text, text) + logger.debug(f"Similarity to previous frame: {similarity:.2f} (threshold: {similarity_threshold})") if similarity > similarity_threshold: - logger.debug(f"Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})") + logger.debug(f"⊘ Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})") continue results.append({ diff --git a/meetus/output_manager.py b/meetus/output_manager.py index e923ed3..8c58a89 100644 --- a/meetus/output_manager.py +++ b/meetus/output_manager.py @@ -36,7 +36,7 @@ class OutputManager: def _get_or_create_output_dir(self) -> Path: """ - Get existing output directory or create a new timestamped one. + Get existing output directory or create a new one with incremental number. Returns: Path to output directory @@ -54,9 +54,29 @@ class OutputManager: logger.info(f"Found existing output: {existing_dirs[0].name}") return existing_dirs[0] - # Create new timestamped directory - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - dir_name = f"{timestamp}-{video_name}" + # Create new directory with date + incremental number + date_str = datetime.now().strftime("%Y%m%d") + + # Find existing runs for today + if self.base_output_dir.exists(): + existing_today = [ + d for d in self.base_output_dir.iterdir() + if d.is_dir() and d.name.startswith(date_str) and d.name.endswith(f"-{video_name}") + ] + + # Extract run numbers and find max + run_numbers = [] + for d in existing_today: + # Format: YYYYMMDD-NNN-videoname + parts = d.name.split('-') + if len(parts) >= 2 and parts[1].isdigit(): + run_numbers.append(int(parts[1])) + + next_run = max(run_numbers) + 1 if run_numbers else 1 + else: + next_run = 1 + + dir_name = f"{date_str}-{next_run:03d}-{video_name}" output_dir = self.base_output_dir / dir_name output_dir.mkdir(parents=True, 
exist_ok=True) logger.info(f"Created new output directory: {dir_name}") diff --git a/meetus/transcript_merger.py b/meetus/transcript_merger.py index 14e99c5..30b5b40 100644 --- a/meetus/transcript_merger.py +++ b/meetus/transcript_merger.py @@ -6,6 +6,8 @@ from typing import List, Dict, Optional import json from pathlib import Path import logging +import base64 +from io import BytesIO logger = logging.getLogger(__name__) @@ -13,9 +15,16 @@ logger = logging.getLogger(__name__) class TranscriptMerger: """Merge audio transcripts with screen OCR text.""" - def __init__(self): - """Initialize transcript merger.""" - pass + def __init__(self, embed_images: bool = False, embed_quality: int = 80): + """ + Initialize transcript merger. + + Args: + embed_images: Whether to embed frame images as base64 + embed_quality: JPEG quality for embedded images (0-100) + """ + self.embed_images = embed_images + self.embed_quality = embed_quality def load_whisper_transcript(self, transcript_path: str, group_interval: Optional[int] = None) -> List[Dict]: """ @@ -120,6 +129,32 @@ class TranscriptMerger: logger.info(f"Grouped {len(segments)} segments into {len(intervals)} intervals of {interval_seconds}s") return intervals + def _encode_image_base64(self, image_path: str) -> tuple[str, int]: + """ + Encode image as base64 (image already at target quality/size). 
+ + Args: + image_path: Path to image file + + Returns: + Tuple of (base64_string, size_in_bytes) + """ + try: + # Read file directly (already at target quality/resolution) + with open(image_path, 'rb') as f: + img_bytes = f.read() + + # Encode to base64 + b64_string = base64.b64encode(img_bytes).decode('utf-8') + + logger.debug(f"Encoded {Path(image_path).name}: {len(img_bytes)} bytes") + + return b64_string, len(img_bytes) + + except Exception as e: + logger.error(f"Failed to encode image {image_path}: {e}") + return "", 0 + def merge_transcripts( self, audio_segments: List[Dict], @@ -172,10 +207,15 @@ class TranscriptMerger: lines = [] lines.append("=" * 80) lines.append("ENHANCED MEETING TRANSCRIPT") - lines.append("Audio transcript + Screen content") + if self.embed_images: + lines.append("Audio transcript + Embedded frame images (base64)") + else: + lines.append("Audio transcript + Screen content") lines.append("=" * 80) lines.append("") + total_image_bytes = 0 + for seg in segments: timestamp = self._format_timestamp(seg['timestamp']) @@ -186,11 +226,31 @@ class TranscriptMerger: else: # screen lines.append(f"[{timestamp}] SCREEN CONTENT:") - # Indent screen text for visibility - screen_text = seg['text'].replace('\n', '\n | ') - lines.append(f" | {screen_text}") + + # Embed image if requested + if self.embed_images and 'frame_path' in seg: + b64_img, img_size = self._encode_image_base64(seg['frame_path']) + total_image_bytes += img_size + + if b64_img: + lines.append(f" IMAGE (base64, {img_size // 1024}KB):") + lines.append(f" data:image/jpeg;base64,{b64_img}") + lines.append("") + + # Include text content if available (fallback or additional context) + if 'text' in seg and seg['text'].strip(): + screen_text = seg['text'].replace('\n', '\n | ') + lines.append(f" TEXT:") + lines.append(f" | {screen_text}") + lines.append("") + if self.embed_images and total_image_bytes > 0: + total_mb = total_image_bytes / (1024 * 1024) + lines.append("") + 
lines.append(f"Total embedded images size: {total_mb:.2f} MB") + logger.info(f"Embedded {len([s for s in segments if s['type'] == 'screen'])} images, total size: {total_mb:.2f} MB") + return "\n".join(lines) def _format_compact(self, segments: List[Dict]) -> str: diff --git a/meetus/vision_processor.py b/meetus/vision_processor.py index ff42a43..2e83ff5 100644 --- a/meetus/vision_processor.py +++ b/meetus/vision_processor.py @@ -161,11 +161,16 @@ class VisionProcessor: logger.warning(f"No content extracted from frame at {timestamp:.2f}s") continue + # Debug: Show what was extracted + logger.debug(f"Frame {idx} ({timestamp:.2f}s): Extracted {len(text)} chars") + logger.debug(f"Content preview: {text[:150]}{'...' if len(text) > 150 else ''}") + # Deduplicate similar consecutive frames - if deduplicate: + if deduplicate and prev_text: similarity = self._text_similarity(prev_text, text) + logger.debug(f"Similarity to previous frame: {similarity:.2f} (threshold: {similarity_threshold})") if similarity > similarity_threshold: - logger.debug(f"Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})") + logger.debug(f"⊘ Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})") continue results.append({ diff --git a/meetus/workflow.py b/meetus/workflow.py index b60e3bd..b1e5f46 100644 --- a/meetus/workflow.py +++ b/meetus/workflow.py @@ -40,10 +40,21 @@ class WorkflowConfig: # Analysis options self.use_vision = kwargs.get('use_vision', False) + self.use_hybrid = kwargs.get('use_hybrid', False) + self.hybrid_llm_cleanup = kwargs.get('hybrid_llm_cleanup', False) + self.hybrid_llm_model = kwargs.get('hybrid_llm_model', 'llama3.2:3b') self.vision_model = kwargs.get('vision_model', 'llava:13b') self.vision_context = kwargs.get('vision_context', 'meeting') self.ocr_engine = kwargs.get('ocr_engine', 'tesseract') + # Validation: can't use both vision and hybrid + if self.use_vision and self.use_hybrid: + raise ValueError("Cannot use both 
--use-vision and --use-hybrid. Choose one.") + + # Validation: LLM cleanup requires hybrid mode + if self.hybrid_llm_cleanup and not self.use_hybrid: + raise ValueError("--hybrid-llm-cleanup requires --use-hybrid") + # Processing options self.no_deduplicate = kwargs.get('no_deduplicate', False) self.no_cache = kwargs.get('no_cache', False) @@ -52,6 +63,8 @@ class WorkflowConfig: self.skip_cache_analysis = kwargs.get('skip_cache_analysis', False) self.extract_only = kwargs.get('extract_only', False) self.format = kwargs.get('format', 'detailed') + self.embed_images = kwargs.get('embed_images', False) + self.embed_quality = kwargs.get('embed_quality', 80) def to_dict(self) -> Dict[str, Any]: """Convert config to dictionary for manifest.""" @@ -66,10 +79,10 @@ class WorkflowConfig: "scene_threshold": self.scene_threshold if self.scene_detection else None }, "analysis": { - "method": "vision" if self.use_vision else "ocr", + "method": "vision" if self.use_vision else ("hybrid" if self.use_hybrid else "ocr"), "vision_model": self.vision_model if self.use_vision else None, "vision_context": self.vision_context if self.use_vision else None, - "ocr_engine": self.ocr_engine if not self.use_vision else None, + "ocr_engine": self.ocr_engine if (not self.use_vision) else None, "deduplication": not self.no_deduplicate }, "output_format": self.format @@ -113,10 +126,19 @@ class ProcessingWorkflow: logger.info("MEETING PROCESSOR") logger.info("=" * 80) logger.info(f"Video: {self.config.video_path.name}") - logger.info(f"Analysis: {'Vision Model' if self.config.use_vision else f'OCR ({self.config.ocr_engine})'}") + + # Determine analysis method if self.config.use_vision: - logger.info(f"Vision Model: {self.config.vision_model}") + analysis_method = f"Vision Model ({self.config.vision_model})" + logger.info(f"Analysis: {analysis_method}") logger.info(f"Context: {self.config.vision_context}") + elif self.config.use_hybrid: + analysis_method = f"Hybrid (OpenCV + 
{self.config.ocr_engine})" + logger.info(f"Analysis: {analysis_method}") + else: + analysis_method = f"OCR ({self.config.ocr_engine})" + logger.info(f"Analysis: {analysis_method}") + logger.info(f"Frame extraction: {'Scene detection' if self.config.scene_detection else f'Every {self.config.interval}s'}") logger.info(f"Caching: {'Disabled' if self.config.no_cache else 'Enabled'}") logger.info("=" * 80) @@ -148,15 +170,16 @@ class ProcessingWorkflow: return self._build_result(transcript_path, screen_segments, enhanced_transcript) def _run_whisper(self) -> Optional[str]: - """Run Whisper transcription if requested.""" - if not self.config.run_whisper: - return self.config.transcript_path - - # Check cache + """Run Whisper transcription if requested, or use cached/provided transcript.""" + # First, check cache (regardless of run_whisper flag) cached = self.cache_mgr.get_whisper_cache() if cached: return str(cached) + # If no cache and not running whisper, use provided transcript path (if any) + if not self.config.run_whisper: + return self.config.transcript_path + logger.info("=" * 80) logger.info("STEP 0: Running Whisper Transcription") logger.info("=" * 80) @@ -195,6 +218,25 @@ class ProcessingWorkflow: if transcript_path.exists(): logger.info(f"✓ Whisper transcription completed: {transcript_path.name}") + + # Debug: Show transcript preview + try: + import json + with open(transcript_path, 'r', encoding='utf-8') as f: + whisper_data = json.load(f) + + if 'segments' in whisper_data: + logger.debug(f"Whisper produced {len(whisper_data['segments'])} segments") + if whisper_data['segments']: + logger.debug(f"First segment: {whisper_data['segments'][0]}") + logger.debug(f"Last segment: {whisper_data['segments'][-1]}") + + if 'text' in whisper_data: + text_preview = whisper_data['text'][:200] + "..." 
if len(whisper_data.get('text', '')) > 200 else whisper_data.get('text', '') + logger.debug(f"Transcript preview: {text_preview}") + except Exception as e: + logger.debug(f"Could not parse whisper output for debug: {e}") + logger.info("") return str(transcript_path) else: @@ -216,12 +258,24 @@ class ProcessingWorkflow: # Clean up old frames if regenerating if self.config.skip_cache_frames and self.output_mgr.frames_dir.exists(): - logger.info("Cleaning up old frames...") - for old_frame in self.output_mgr.frames_dir.glob("*.jpg"): - old_frame.unlink() + old_frames = list(self.output_mgr.frames_dir.glob("*.jpg")) + if old_frames: + logger.info(f"Cleaning up {len(old_frames)} old frames...") + for old_frame in old_frames: + old_frame.unlink() + logger.info("✓ Cleanup complete") - # Extract frames - extractor = FrameExtractor(str(self.config.video_path), str(self.output_mgr.frames_dir)) + # Extract frames (use embed quality so saved files match embedded images) + if self.config.scene_detection: + logger.info(f"Extracting frames with scene detection (threshold={self.config.scene_threshold})...") + else: + logger.info(f"Extracting frames every {self.config.interval}s...") + + extractor = FrameExtractor( + str(self.config.video_path), + str(self.output_mgr.frames_dir), + quality=self.config.embed_quality + ) if self.config.scene_detection: frames_info = extractor.extract_scene_changes(threshold=self.config.scene_threshold) @@ -232,8 +286,29 @@ class ProcessingWorkflow: return frames_info def _analyze_frames(self, frames_info): - """Analyze frames with vision or OCR.""" - analysis_type = 'vision' if self.config.use_vision else 'ocr' + """Analyze frames with vision, hybrid, or OCR.""" + # Skip analysis if just embedding images + if self.config.embed_images: + logger.info("Step 2: Skipping analysis (images will be embedded)") + # Create minimal segments with just frame paths and timestamps + screen_segments = [ + { + 'timestamp': timestamp, + 'text': '', # No text 
extraction needed + 'frame_path': frame_path + } + for frame_path, timestamp in frames_info + ] + logger.info(f"✓ Prepared {len(screen_segments)} frames for embedding") + return screen_segments + + # Determine analysis type + if self.config.use_vision: + analysis_type = 'vision' + elif self.config.use_hybrid: + analysis_type = 'hybrid' + else: + analysis_type = 'ocr' # Check cache cached_analysis = self.cache_mgr.get_analysis_cache(analysis_type) @@ -242,6 +317,8 @@ class ProcessingWorkflow: if self.config.use_vision: return self._run_vision_analysis(frames_info) + elif self.config.use_hybrid: + return self._run_hybrid_analysis(frames_info) else: return self._run_ocr_analysis(frames_info) @@ -272,6 +349,13 @@ class ProcessingWorkflow: ) logger.info(f"✓ Analyzed {len(screen_segments)} frames with vision model") + # Debug: Show sample analysis results + if screen_segments: + logger.debug(f"First analysis result: timestamp={screen_segments[0].get('timestamp')}, text_length={len(screen_segments[0].get('text', ''))}") + logger.debug(f"First analysis text preview: {screen_segments[0].get('text', '')[:200]}...") + if len(screen_segments) > 1: + logger.debug(f"Last analysis result: timestamp={screen_segments[-1].get('timestamp')}, text_length={len(screen_segments[-1].get('text', ''))}") + # Cache results self.cache_mgr.save_analysis('vision', screen_segments) return screen_segments @@ -285,6 +369,42 @@ class ProcessingWorkflow: cached = self.cache_mgr.get_whisper_cache() return str(cached) if cached else None + def _run_hybrid_analysis(self, frames_info): + """Run hybrid analysis on frames (OpenCV + OCR).""" + if self.config.hybrid_llm_cleanup: + logger.info("Step 2: Running hybrid analysis (OpenCV + OCR + LLM cleanup)...") + else: + logger.info("Step 2: Running hybrid analysis (OpenCV text detection + OCR)...") + + try: + from .hybrid_processor import HybridProcessor + + hybrid = HybridProcessor( + ocr_engine=self.config.ocr_engine, + 
use_llm_cleanup=self.config.hybrid_llm_cleanup, + llm_model=self.config.hybrid_llm_model + ) + screen_segments = hybrid.process_frames( + frames_info, + deduplicate=not self.config.no_deduplicate + ) + logger.info(f"✓ Processed {len(screen_segments)} frames with hybrid analysis") + + # Debug: Show sample hybrid results + if screen_segments: + logger.debug(f"First hybrid result: timestamp={screen_segments[0].get('timestamp')}, text_length={len(screen_segments[0].get('text', ''))}") + logger.debug(f"First hybrid text preview: {screen_segments[0].get('text', '')[:200]}...") + if len(screen_segments) > 1: + logger.debug(f"Last hybrid result: timestamp={screen_segments[-1].get('timestamp')}, text_length={len(screen_segments[-1].get('text', ''))}") + + # Cache results + self.cache_mgr.save_analysis('hybrid', screen_segments) + return screen_segments + + except ImportError as e: + logger.error(f"{e}") + raise + def _run_ocr_analysis(self, frames_info): """Run OCR analysis on frames.""" logger.info("Step 2: Running OCR on extracted frames...") @@ -297,6 +417,13 @@ class ProcessingWorkflow: ) logger.info(f"✓ Processed {len(screen_segments)} frames with OCR") + # Debug: Show sample OCR results + if screen_segments: + logger.debug(f"First OCR result: timestamp={screen_segments[0].get('timestamp')}, text_length={len(screen_segments[0].get('text', ''))}") + logger.debug(f"First OCR text preview: {screen_segments[0].get('text', '')[:200]}...") + if len(screen_segments) > 1: + logger.debug(f"Last OCR result: timestamp={screen_segments[-1].get('timestamp')}, text_length={len(screen_segments[-1].get('text', ''))}") + # Cache results self.cache_mgr.save_analysis('ocr', screen_segments) return screen_segments @@ -309,7 +436,10 @@ class ProcessingWorkflow: def _merge_transcripts(self, transcript_path, screen_segments): """Merge audio and screen transcripts.""" - merger = TranscriptMerger() + merger = TranscriptMerger( + embed_images=self.config.embed_images, + 
embed_quality=self.config.embed_quality + ) # Load audio transcript if available audio_segments = [] @@ -350,10 +480,18 @@ class ProcessingWorkflow: def _build_result(self, transcript_path=None, screen_segments=None, enhanced_transcript=None): """Build result dictionary.""" + # Determine analysis filename + if self.config.use_vision: + analysis_type = 'vision' + elif self.config.use_hybrid: + analysis_type = 'hybrid' + else: + analysis_type = 'ocr' + return { "output_dir": str(self.output_mgr.output_dir), "transcript": transcript_path, - "analysis": f"{self.config.video_path.stem}_{'vision' if self.config.use_vision else 'ocr'}.json", + "analysis": f"{self.config.video_path.stem}_{analysis_type}.json", "frames_count": len(screen_segments) if screen_segments else 0, "enhanced_transcript": enhanced_transcript, "manifest": str(self.output_mgr.get_path("manifest.json")) diff --git a/process_meeting.py b/process_meeting.py index 78dd5ca..5b9d499 100644 --- a/process_meeting.py +++ b/process_meeting.py @@ -32,23 +32,20 @@ def main(): formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: - # Run Whisper + vision analysis (recommended for code/dashboards) - python process_meeting.py samples/meeting.mkv --run-whisper --use-vision + # Embed images for LLM analysis (recommended - let LLM analyze actual frames) + python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --scene-detection - # Use vision with specific context hint - python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --vision-context code + # Embed with custom quality (lower = smaller file size) + python process_meeting.py samples/meeting.mkv --run-whisper --embed-images --embed-quality 60 --scene-detection - # Traditional OCR approach - python process_meeting.py samples/meeting.mkv --run-whisper + # Hybrid approach: OpenCV + OCR (extracts text, no images) + python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --scene-detection - # 
Re-run analysis using cached frames and transcript - python process_meeting.py samples/meeting.mkv --use-vision + # Hybrid + LLM cleanup (best for code formatting) + python process_meeting.py samples/meeting.mkv --run-whisper --use-hybrid --hybrid-llm-cleanup --scene-detection - # Force reprocessing (ignore cache) - python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --no-cache - - # Use scene detection for fewer frames - python process_meeting.py samples/meeting.mkv --run-whisper --use-vision --scene-detection + # Iterate on scene threshold (reuse whisper transcript) + python process_meeting.py samples/meeting.mkv --embed-images --scene-detection --scene-threshold 5 --skip-cache-frames --skip-cache-analysis """ ) @@ -119,6 +116,21 @@ Examples: action='store_true', help='Use local vision model (Ollama) instead of OCR for better context understanding' ) + parser.add_argument( + '--use-hybrid', + action='store_true', + help='Use hybrid approach: OpenCV text detection + OCR (more accurate than vision models)' + ) + parser.add_argument( + '--hybrid-llm-cleanup', + action='store_true', + help='Use LLM to clean up OCR output and preserve code formatting (requires --use-hybrid)' + ) + parser.add_argument( + '--hybrid-llm-model', + help='LLM model for cleanup (default: llama3.2:3b)', + default='llama3.2:3b' + ) parser.add_argument( '--vision-model', help='Vision model to use with Ollama (default: llava:13b)', @@ -168,6 +180,17 @@ Examples: help='Output format style (default: detailed)', default='detailed' ) + parser.add_argument( + '--embed-images', + action='store_true', + help='Embed frame images (as base64) in enhanced transcript for LLM analysis' + ) + parser.add_argument( + '--embed-quality', + type=int, + help='JPEG quality for embedded images (default: 80, lower = smaller file)', + default=80 + ) # Logging parser.add_argument(