356 lines
12 KiB
Python
356 lines
12 KiB
Python
"""
|
|
Hybrid frame analysis: OpenCV text detection + OCR for accurate extraction.
|
|
Better than pure vision models which tend to hallucinate text content.
|
|
"""
|
|
from typing import List, Tuple, Dict, Optional
|
|
from pathlib import Path
|
|
import logging
|
|
import cv2
|
|
import numpy as np
|
|
from difflib import SequenceMatcher
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class HybridProcessor:
    """OpenCV-based text-region detection combined with OCR extraction."""

    def __init__(self, ocr_engine: str = "tesseract", min_confidence: float = 0.5,
                 use_llm_cleanup: bool = False, llm_model: Optional[str] = None):
        """
        Set up the hybrid processor.

        Args:
            ocr_engine: Which OCR backend to run ('tesseract', 'easyocr', 'paddleocr').
            min_confidence: Detection confidence floor in [0, 1].
            use_llm_cleanup: Whether to post-process OCR text with a local LLM.
            llm_model: Ollama model name for cleanup (defaults to llama3.2:3b for speed).
        """
        # Imported lazily so the OCR backend is only loaded when the
        # processor is actually constructed.
        from .ocr_processor import OCRProcessor

        self.min_confidence = min_confidence
        self.llm_model = llm_model or "llama3.2:3b"
        self.use_llm_cleanup = use_llm_cleanup
        self._llm_client = None
        self.ocr = OCRProcessor(engine=ocr_engine)

        if use_llm_cleanup:
            self._init_llm()
|
|
|
|
def _init_llm(self):
|
|
"""Initialize Ollama client for LLM cleanup."""
|
|
try:
|
|
import ollama
|
|
self._llm_client = ollama
|
|
logger.info(f"LLM cleanup enabled using {self.llm_model}")
|
|
except ImportError:
|
|
logger.warning("ollama package not installed. LLM cleanup disabled.")
|
|
self.use_llm_cleanup = False
|
|
|
|
def _cleanup_with_llm(self, raw_text: str) -> str:
|
|
"""
|
|
Use LLM to clean up OCR output and preserve code formatting.
|
|
|
|
Args:
|
|
raw_text: Raw OCR output
|
|
|
|
Returns:
|
|
Cleaned up text with proper formatting
|
|
"""
|
|
if not self.use_llm_cleanup or not self._llm_client:
|
|
return raw_text
|
|
|
|
prompt = """You are cleaning up OCR output from a code editor screenshot.
|
|
|
|
Your task:
|
|
1. Fix any obvious OCR errors (l→1, O→0, etc.)
|
|
2. Preserve or restore code indentation and structure
|
|
3. Keep the exact text content - don't add explanations or comments
|
|
4. If it's code, maintain proper spacing and formatting
|
|
5. Return ONLY the cleaned text, nothing else
|
|
|
|
OCR Text:
|
|
"""
|
|
|
|
try:
|
|
response = self._llm_client.generate(
|
|
model=self.llm_model,
|
|
prompt=prompt + raw_text,
|
|
options={"temperature": 0.1} # Low temperature for accuracy
|
|
)
|
|
cleaned = response['response'].strip()
|
|
logger.debug(f"LLM cleanup: {len(raw_text)} → {len(cleaned)} chars")
|
|
return cleaned
|
|
except Exception as e:
|
|
logger.warning(f"LLM cleanup failed: {e}, using raw OCR output")
|
|
return raw_text
|
|
|
|
def detect_text_regions(self, image_path: str, min_area: int = 100) -> List[Tuple[int, int, int, int]]:
|
|
"""
|
|
Detect text regions in image using OpenCV.
|
|
|
|
Args:
|
|
image_path: Path to image file
|
|
min_area: Minimum area for text region (pixels)
|
|
|
|
Returns:
|
|
List of bounding boxes (x, y, w, h)
|
|
"""
|
|
# Read image
|
|
img = cv2.imread(image_path)
|
|
if img is None:
|
|
logger.warning(f"Could not read image: {image_path}")
|
|
return []
|
|
|
|
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
|
|
|
|
# Method 1: Morphological operations to find text regions
|
|
# Works well for solid text blocks
|
|
regions = self._detect_by_morphology(gray, min_area)
|
|
|
|
if not regions:
|
|
logger.debug(f"No text regions detected in {Path(image_path).name}")
|
|
|
|
return regions
|
|
|
|
def _detect_by_morphology(self, gray: np.ndarray, min_area: int) -> List[Tuple[int, int, int, int]]:
|
|
"""
|
|
Detect text regions using morphological operations.
|
|
Fast and works well for solid text blocks (code editors, terminals).
|
|
|
|
Args:
|
|
gray: Grayscale image
|
|
min_area: Minimum area for region
|
|
|
|
Returns:
|
|
List of bounding boxes (x, y, w, h)
|
|
"""
|
|
# Apply adaptive threshold to handle varying lighting
|
|
binary = cv2.adaptiveThreshold(
|
|
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
|
|
cv2.THRESH_BINARY_INV, 11, 2
|
|
)
|
|
|
|
# Morphological operations to connect text regions
|
|
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (15, 3)) # Horizontal kernel for text lines
|
|
dilated = cv2.dilate(binary, kernel, iterations=2)
|
|
|
|
# Find contours
|
|
contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
|
|
|
|
# Filter and extract bounding boxes
|
|
regions = []
|
|
for contour in contours:
|
|
x, y, w, h = cv2.boundingRect(contour)
|
|
area = w * h
|
|
|
|
# Filter by area and aspect ratio
|
|
if area > min_area and w > 20 and h > 10: # Reasonable text dimensions
|
|
regions.append((x, y, w, h))
|
|
|
|
# Merge overlapping regions
|
|
regions = self._merge_overlapping_regions(regions)
|
|
|
|
logger.debug(f"Detected {len(regions)} text regions using morphology")
|
|
return regions
|
|
|
|
def _merge_overlapping_regions(
|
|
self, regions: List[Tuple[int, int, int, int]],
|
|
overlap_threshold: float = 0.3
|
|
) -> List[Tuple[int, int, int, int]]:
|
|
"""
|
|
Merge overlapping bounding boxes.
|
|
|
|
Args:
|
|
regions: List of (x, y, w, h) tuples
|
|
overlap_threshold: Minimum overlap ratio to merge
|
|
|
|
Returns:
|
|
Merged regions
|
|
"""
|
|
if not regions:
|
|
return []
|
|
|
|
# Sort by y-coordinate (top to bottom)
|
|
regions = sorted(regions, key=lambda r: r[1])
|
|
|
|
merged = []
|
|
current = list(regions[0])
|
|
|
|
for region in regions[1:]:
|
|
x, y, w, h = region
|
|
cx, cy, cw, ch = current
|
|
|
|
# Check for overlap
|
|
x_overlap = max(0, min(cx + cw, x + w) - max(cx, x))
|
|
y_overlap = max(0, min(cy + ch, y + h) - max(cy, y))
|
|
overlap_area = x_overlap * y_overlap
|
|
|
|
current_area = cw * ch
|
|
region_area = w * h
|
|
min_area = min(current_area, region_area)
|
|
|
|
if overlap_area / min_area > overlap_threshold:
|
|
# Merge regions
|
|
new_x = min(cx, x)
|
|
new_y = min(cy, y)
|
|
new_x2 = max(cx + cw, x + w)
|
|
new_y2 = max(cy + ch, y + h)
|
|
current = [new_x, new_y, new_x2 - new_x, new_y2 - new_y]
|
|
else:
|
|
merged.append(tuple(current))
|
|
current = list(region)
|
|
|
|
merged.append(tuple(current))
|
|
return merged
|
|
|
|
def extract_text_from_region(self, image_path: str, region: Tuple[int, int, int, int]) -> str:
|
|
"""
|
|
Extract text from a specific region using OCR.
|
|
|
|
Args:
|
|
image_path: Path to image file
|
|
region: Bounding box (x, y, w, h)
|
|
|
|
Returns:
|
|
Extracted text
|
|
"""
|
|
from PIL import Image
|
|
|
|
# Load image and crop region
|
|
img = Image.open(image_path)
|
|
x, y, w, h = region
|
|
cropped = img.crop((x, y, x + w, y + h))
|
|
|
|
# Save to temp file for OCR (or use in-memory)
|
|
import tempfile
|
|
with tempfile.NamedTemporaryFile(suffix='.png', delete=False) as tmp:
|
|
cropped.save(tmp.name)
|
|
text = self.ocr.extract_text(tmp.name)
|
|
|
|
# Clean up temp file
|
|
Path(tmp.name).unlink()
|
|
|
|
return text
|
|
|
|
def analyze_frame(self, image_path: str) -> str:
|
|
"""
|
|
Analyze a frame: detect text regions and OCR them.
|
|
|
|
Args:
|
|
image_path: Path to image file
|
|
|
|
Returns:
|
|
Combined text from all detected regions
|
|
"""
|
|
# Detect text regions
|
|
regions = self.detect_text_regions(image_path)
|
|
|
|
if not regions:
|
|
# Fallback to full-frame OCR if no regions detected
|
|
logger.debug(f"No regions detected, using full-frame OCR for {Path(image_path).name}")
|
|
raw_text = self.ocr.extract_text(image_path)
|
|
return self._cleanup_with_llm(raw_text) if self.use_llm_cleanup else raw_text
|
|
|
|
# Sort regions by reading order (top-to-bottom, left-to-right)
|
|
regions = self._sort_regions_by_reading_order(regions)
|
|
|
|
# Extract text from each region
|
|
texts = []
|
|
for idx, region in enumerate(regions):
|
|
x, y, w, h = region
|
|
text = self.extract_text_from_region(image_path, region)
|
|
if text.strip():
|
|
# Add visual separator with region info
|
|
section_header = f"[Region {idx+1} at y={y}]"
|
|
texts.append(f"{section_header}\n{text.strip()}")
|
|
logger.debug(f"Region {idx+1}/{len(regions)} (y={y}): Extracted {len(text)} chars")
|
|
|
|
combined = ("\n\n" + "="*60 + "\n\n").join(texts)
|
|
logger.debug(f"Total extracted from {len(regions)} regions: {len(combined)} chars")
|
|
|
|
# Apply LLM cleanup if enabled
|
|
if self.use_llm_cleanup:
|
|
combined = self._cleanup_with_llm(combined)
|
|
|
|
return combined
|
|
|
|
def _sort_regions_by_reading_order(self, regions: List[Tuple[int, int, int, int]]) -> List[Tuple[int, int, int, int]]:
|
|
"""
|
|
Sort regions in reading order (top-to-bottom, left-to-right).
|
|
|
|
Args:
|
|
regions: List of (x, y, w, h) tuples
|
|
|
|
Returns:
|
|
Sorted regions
|
|
"""
|
|
# Sort primarily by y (top to bottom), secondarily by x (left to right)
|
|
# Group regions that are on roughly the same line (within 20px)
|
|
sorted_regions = sorted(regions, key=lambda r: (r[1] // 20, r[0]))
|
|
return sorted_regions
|
|
|
|
def process_frames(
|
|
self,
|
|
frames_info: List[Tuple[str, float]],
|
|
deduplicate: bool = True,
|
|
similarity_threshold: float = 0.85
|
|
) -> List[Dict]:
|
|
"""
|
|
Process multiple frames with hybrid analysis.
|
|
|
|
Args:
|
|
frames_info: List of (frame_path, timestamp) tuples
|
|
deduplicate: Whether to remove similar consecutive analyses
|
|
similarity_threshold: Threshold for considering analyses as duplicates (0-1)
|
|
|
|
Returns:
|
|
List of dicts with 'timestamp', 'text', and 'frame_path'
|
|
"""
|
|
results = []
|
|
prev_text = ""
|
|
|
|
total = len(frames_info)
|
|
logger.info(f"Starting hybrid analysis of {total} frames...")
|
|
|
|
for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
|
|
logger.info(f"Analyzing frame {idx}/{total} at {timestamp:.2f}s...")
|
|
|
|
text = self.analyze_frame(frame_path)
|
|
|
|
if not text:
|
|
logger.warning(f"No content extracted from frame at {timestamp:.2f}s")
|
|
continue
|
|
|
|
# Debug: Show what was extracted
|
|
logger.debug(f"Frame {idx} ({timestamp:.2f}s): Extracted {len(text)} chars")
|
|
logger.debug(f"Content preview: {text[:150]}{'...' if len(text) > 150 else ''}")
|
|
|
|
# Deduplicate similar consecutive frames
|
|
if deduplicate and prev_text:
|
|
similarity = self._text_similarity(prev_text, text)
|
|
logger.debug(f"Similarity to previous frame: {similarity:.2f} (threshold: {similarity_threshold})")
|
|
if similarity > similarity_threshold:
|
|
logger.debug(f"⊘ Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})")
|
|
continue
|
|
|
|
results.append({
|
|
'timestamp': timestamp,
|
|
'text': text,
|
|
'frame_path': frame_path
|
|
})
|
|
|
|
prev_text = text
|
|
|
|
logger.info(f"Extracted content from {len(results)} frames (deduplication: {deduplicate})")
|
|
return results
|
|
|
|
def _text_similarity(self, text1: str, text2: str) -> float:
|
|
"""
|
|
Calculate similarity between two texts.
|
|
|
|
Returns:
|
|
Similarity score between 0 and 1
|
|
"""
|
|
return SequenceMatcher(None, text1, text2).ratio()
|