This commit is contained in:
2026-03-26 06:10:19 -03:00
parent 731964ca10
commit e27cb5bcc3
41 changed files with 2079 additions and 95 deletions

117
gpu/models/preprocess.py Normal file
View File

@@ -0,0 +1,117 @@
"""
Image preprocessing pipeline for crops before OCR.
Each step is independently toggleable via config.
Operates on numpy arrays (BGR or RGB), returns processed array.
"""
from __future__ import annotations
import logging
import numpy as np
logger = logging.getLogger(__name__)
def binarize(image: np.ndarray, threshold: int = 128) -> np.ndarray:
    """Convert to grayscale and apply Otsu binarization.

    Args:
        image: HxWx3 RGB (or HxW grayscale) uint8 array.
        threshold: Nominal threshold. NOTE: because THRESH_OTSU is set,
            OpenCV computes the threshold automatically and this value is
            ignored; the parameter is kept for interface compatibility.

    Returns:
        HxWx3 array with every channel equal to the 0/255 binary mask.
    """
    import cv2  # lazy import: OpenCV is an optional heavy dependency
    if image.ndim == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    else:
        gray = image
    # Otsu picks the global threshold itself; `threshold` is a no-op here.
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    # Convert back to 3-channel so downstream stages see a consistent shape.
    return cv2.cvtColor(binary, cv2.COLOR_GRAY2RGB)
def deskew(image: np.ndarray) -> np.ndarray:
    """Correct slight rotation using the minimum-area bounding rectangle.

    Estimates skew from the orientation of the dark (ink) pixels and
    rotates the full image to compensate. Returns the input unchanged
    when there is too little foreground or the skew is negligible.
    """
    import cv2  # lazy import: OpenCV is an optional heavy dependency
    if image.ndim == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
    else:
        gray = image
    # Pixels darker than 128 are treated as foreground/ink.
    coords = np.column_stack(np.where(gray < 128))
    if len(coords) < 10:
        # Too few foreground pixels to estimate an orientation reliably.
        return image
    # BUG FIX: np.where yields int64 points, which cv2.minAreaRect rejects
    # (it requires int32 or float32); cast explicitly.
    rect = cv2.minAreaRect(coords.astype(np.float32))
    angle = rect[-1]
    # Normalize the rectangle angle into a signed correction angle.
    # NOTE(review): this follows the classic pre-OpenCV-4.5 convention
    # (angle in [-90, 0)); OpenCV >= 4.5 reports angles in [0, 90) —
    # verify against the deployed OpenCV version.
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    if abs(angle) < 0.5:
        # Skew below half a degree is not worth the resampling blur.
        return image
    h, w = image.shape[:2]
    center = (w // 2, h // 2)
    rotation_matrix = cv2.getRotationMatrix2D(center, angle, 1.0)
    # BORDER_REPLICATE avoids black wedges at the rotated corners.
    return cv2.warpAffine(
        image, rotation_matrix, (w, h),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REPLICATE,
    )
def enhance_contrast(image: np.ndarray) -> np.ndarray:
    """Apply CLAHE (adaptive histogram equalization) for contrast normalization."""
    import cv2
    clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    # Grayscale input: equalize the single channel directly.
    if image.ndim != 3:
        return clahe.apply(image)
    # Color input: equalize only the lightness (L) channel in LAB space
    # so hue and saturation are left untouched.
    lab = cv2.cvtColor(image, cv2.COLOR_RGB2LAB)
    lab[:, :, 0] = clahe.apply(lab[:, :, 0])
    return cv2.cvtColor(lab, cv2.COLOR_LAB2RGB)
def preprocess(
    image: np.ndarray,
    do_binarize: bool = False,
    do_deskew: bool = False,
    do_contrast: bool = True,
) -> np.ndarray:
    """
    Run the preprocessing pipeline on a crop image.
    Each step is independently toggleable. Order: contrast → deskew → binarize.
    Contrast first (works best on color), binarize last (destroys color info).
    """
    # (enabled?, step function, debug message) — tuple order IS the pipeline order.
    pipeline = (
        (do_contrast, enhance_contrast, "Preprocessing: contrast enhanced"),
        (do_deskew, deskew, "Preprocessing: deskewed"),
        (do_binarize, binarize, "Preprocessing: binarized"),
    )
    out = image
    for enabled, step, message in pipeline:
        if enabled:
            out = step(out)
            logger.debug(message)
    return out

View File

@@ -25,3 +25,6 @@ paddleocr>=3.0.0
# (all_tied_weights_keys API change). Also needs accelerate for device_map.
transformers>=4.40.0,<5
accelerate>=0.27.0
# Preprocessing (phase 12)
opencv-python-headless>=4.8.0

View File

@@ -73,6 +73,17 @@ class OCRResponse(BaseModel):
results: list[OCRTextResult]
class PreprocessRequest(BaseModel):
    # Request body for POST /preprocess; flags mirror models.preprocess.preprocess().
    image: str  # base64-encoded input image
    binarize: bool = False  # apply Otsu binarization (runs last; destroys color)
    deskew: bool = False  # apply rotation correction
    contrast: bool = True  # apply CLAHE contrast normalization (runs first)
class PreprocessResponse(BaseModel):
    # Response body for POST /preprocess.
    image: str  # base64 JPEG of processed image
class VLMRequest(BaseModel):
image: str
prompt: str
@@ -183,6 +194,34 @@ def ocr(req: OCRRequest):
return OCRResponse(results=[OCRTextResult(**r) for r in results])
@app.post("/preprocess", response_model=PreprocessResponse)
def preprocess_image(req: PreprocessRequest):
    """Run the toggleable preprocessing pipeline on a base64 image.

    Returns the processed image re-encoded as base64 JPEG (quality 90).
    Responds 400 if the image cannot be decoded, 500 if a pipeline step
    fails.
    """
    import io

    from PIL import Image as PILImage

    try:
        image = _decode_image(req.image)
    except Exception as e:
        # Chain the cause so the original decode error survives in logs.
        raise HTTPException(status_code=400, detail=f"Bad image: {e}") from e
    try:
        # Imported lazily so the server starts even if OpenCV is absent.
        from models.preprocess import preprocess
        processed = preprocess(
            image,
            do_binarize=req.binarize,
            do_deskew=req.deskew,
            do_contrast=req.contrast,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Preprocessing failed: {e}") from e
    # NOTE(review): assumes the pipeline returns a uint8 array PIL can wrap
    # directly — confirm against models.preprocess return contract.
    img = PILImage.fromarray(processed)
    buf = io.BytesIO()
    img.save(buf, format="JPEG", quality=90)
    result_b64 = base64.b64encode(buf.getvalue()).decode()
    return PreprocessResponse(image=result_b64)
@app.post("/vlm", response_model=VLMResponse)
def vlm(req: VLMRequest):
try: