embed images

2025-10-28 08:02:45 -03:00
parent b1e1daf278
commit 118ef04223
12 changed files with 1016 additions and 61 deletions
--- a/meetus/ocr_processor.py
+++ b/meetus/ocr_processor.py
@@ -53,20 +53,25 @@ class OCRProcessor:
        else:
            raise ValueError(f"Unknown OCR engine: {self.engine}")

-    def extract_text(self, image_path: str) -> str:
+    def extract_text(self, image_path: str, preserve_layout: bool = True) -> str:
        """
        Extract text from a single image.

        Args:
            image_path: Path to image file
+            preserve_layout: Try to preserve whitespace and layout

        Returns:
            Extracted text
        """
        if self.engine == "tesseract":
            from PIL import Image
+            import pytesseract
            image = Image.open(image_path)
-            text = self._ocr_engine.image_to_string(image)
+
+            # Use PSM 6 (uniform block of text) to preserve layout better
+            config = '--psm 6' if preserve_layout else ''
+            text = pytesseract.image_to_string(image, config=config)

        elif self.engine == "easyocr":
            result = self._ocr_engine.readtext(image_path, detail=0)
@@ -81,12 +86,31 @@ class OCRProcessor:

        return self._clean_text(text)

-    def _clean_text(self, text: str) -> str:
-        """Clean up OCR output."""
-        # Remove excessive whitespace
-        text = re.sub(r'\n\s*\n', '\n', text)
-        text = re.sub(r' +', ' ', text)
-        return text.strip()
+    def _clean_text(self, text: str, preserve_indentation: bool = True) -> str:
+        """
+        Clean up OCR output.
+
+        Args:
+            text: Raw OCR text
+            preserve_indentation: Keep leading whitespace on lines. When True,
+                runs of blank lines collapse to a single blank line, leading
+                blank lines are dropped, and only trailing whitespace is
+                trimmed from the result. When False, newline/whitespace/newline
+                sequences collapse to one newline, runs of spaces collapse to
+                one space, and both ends of the result are stripped.
+
+        Returns:
+            Cleaned text
+        """
+        if not preserve_indentation:
+            # Original aggressive cleaning
+            text = re.sub(r'\n\s*\n', '\n', text)
+            text = re.sub(r' +', ' ', text)
+            return text.strip()
+
+        # Remove excessive blank lines but preserve indentation
+        cleaned_lines = []
+        for line in text.split('\n'):
+            # Keep a line if it has content, or if it is the first blank line
+            # following a content line (so leading blank lines are never kept).
+            if line.strip() or (cleaned_lines and cleaned_lines[-1].strip()):
+                cleaned_lines.append(line)
+
+        # rstrip(), not strip(): the first kept line always has content, and
+        # stripping the left side would discard that line's indentation and
+        # misalign it against the lines that follow.
+        return '\n'.join(cleaned_lines).rstrip()

    def process_frames(
        self,
@@ -108,18 +132,24 @@ class OCRProcessor:
        results = []
        prev_text = ""

-        for frame_path, timestamp in frames_info:
-            logger.debug(f"Processing frame at {timestamp:.2f}s...")
+        for idx, (frame_path, timestamp) in enumerate(frames_info, 1):
+            logger.debug(f"Processing frame {idx}/{len(frames_info)} at {timestamp:.2f}s...")
            text = self.extract_text(frame_path)

            if not text:
+                logger.debug(f"No text extracted from frame at {timestamp:.2f}s")
                continue

+            # Debug: Show what was extracted
+            logger.debug(f"Frame {idx} ({timestamp:.2f}s): Extracted {len(text)} chars")
+            logger.debug(f"Content preview: {text[:150]}{'...' if len(text) > 150 else ''}")
+
            # Deduplicate similar consecutive frames
-            if deduplicate:
+            if deduplicate and prev_text:
                similarity = self._text_similarity(prev_text, text)
+                logger.debug(f"Similarity to previous frame: {similarity:.2f} (threshold: {similarity_threshold})")
                if similarity > similarity_threshold:
-                    logger.debug(f"Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})")
+                    logger.debug(f"⊘ Skipping duplicate frame at {timestamp:.2f}s (similarity: {similarity:.2f})")
                    continue

            results.append({