""" Stage 3.5 — Preprocessing Runs between YOLO detection and OCR. Applies configurable image preprocessing to each detected region crop: contrast enhancement, deskewing, binarization. Operates on the crops derived from boxes_by_frame, produces preprocessed_crops keyed by (frame_sequence, box_index). """ from __future__ import annotations import logging import numpy as np from detect import emit from detect.models import BoundingBox, Frame logger = logging.getLogger(__name__) def _crop_region(frame: Frame, box: BoundingBox) -> np.ndarray: h, w = frame.image.shape[:2] x1 = max(0, box.x) y1 = max(0, box.y) x2 = min(w, box.x + box.w) y2 = min(h, box.y + box.h) return frame.image[y1:y2, x1:x2] def preprocess_regions( frames: list[Frame], boxes_by_frame: dict[int, list[BoundingBox]], do_contrast: bool = True, do_deskew: bool = False, do_binarize: bool = False, inference_url: str | None = None, job_id: str | None = None, ) -> dict[str, np.ndarray]: """ Preprocess cropped regions from YOLO detections. Returns dict keyed by "{frame_seq}_{box_idx}" → preprocessed crop. These are passed to the OCR stage instead of raw crops. """ total_regions = sum(len(boxes) for boxes in boxes_by_frame.values()) any_active = do_contrast or do_deskew or do_binarize if not any_active: emit.log(job_id, "Preprocess", "INFO", f"Preprocessing disabled, passing {total_regions} regions through") return {} mode = "remote" if inference_url else "local" emit.log(job_id, "Preprocess", "INFO", f"Preprocessing {total_regions} regions (mode={mode}, " f"contrast={do_contrast}, deskew={do_deskew}, binarize={do_binarize})") frame_map = {f.sequence: f for f in frames} preprocessed: dict[str, np.ndarray] = {} processed_count = 0 for seq, boxes in boxes_by_frame.items(): frame = frame_map.get(seq) if not frame: continue for idx, box in enumerate(boxes): crop = _crop_region(frame, box) if crop.size == 0: continue key = f"{seq}_{idx}" if inference_url: result = _preprocess_remote(crop, inference_url, do_contrast, do_deskew, do_binarize) else: result = _preprocess_local(crop, do_contrast, do_deskew, do_binarize) preprocessed[key] = result processed_count += 1 emit.log(job_id, "Preprocess", "INFO", f"Preprocessed {processed_count} regions") return preprocessed def _preprocess_remote(crop: np.ndarray, inference_url: str, do_contrast: bool, do_deskew: bool, do_binarize: bool) -> np.ndarray: """Call GPU server /preprocess endpoint.""" import base64 import io import requests from PIL import Image img = Image.fromarray(crop) buf = io.BytesIO() img.save(buf, format="JPEG", quality=85) image_b64 = base64.b64encode(buf.getvalue()).decode() resp = requests.post( f"{inference_url.rstrip('/')}/preprocess", json={ "image": image_b64, "contrast": do_contrast, "deskew": do_deskew, "binarize": do_binarize, }, timeout=30, ) resp.raise_for_status() data = resp.json() result_bytes = base64.b64decode(data["image"]) result_img = Image.open(io.BytesIO(result_bytes)).convert("RGB") return np.array(result_img) def _preprocess_local(crop: np.ndarray, do_contrast: bool, do_deskew: bool, do_binarize: bool) -> np.ndarray: """Run preprocessing in-process (requires opencv-python-headless).""" from gpu.models.preprocess import preprocess return preprocess(crop, do_binarize=do_binarize, do_deskew=do_deskew, do_contrast=do_contrast)