"""PaddleOCR 3.x text extraction wrapper."""
from __future__ import annotations

import logging

from models import registry
from config import get_config

logger = logging.getLogger(__name__)


def _registry_key(languages: list[str]) -> str:
    """Return the registry key used to cache the model for *languages*."""
    return f"ocr_{'_'.join(languages)}"


def _load(languages: list[str]):
    """Build a PaddleOCR model for *languages* and store it in the registry.

    Only ``languages[0]`` is handed to ``PaddleOCR``; the registry key is
    still derived from the full list, so each distinct list gets its own
    cache entry.

    Returns:
        The newly created PaddleOCR instance.
    """
    # Imported inside the function, so paddleocr is only loaded when a model
    # is actually built.
    from paddleocr import PaddleOCR

    if len(languages) > 1:
        logger.warning(
            "Only the first OCR language is passed to PaddleOCR: using %r, ignoring %r",
            languages[0],
            languages[1:],
        )

    model = PaddleOCR(lang=languages[0])
    registry.put(_registry_key(languages), model)
    return model


def _get(languages: list[str] | None = None):
    """Return the model for *languages*, loading it on a registry miss.

    When *languages* is ``None`` or empty, the ``"ocr_languages"`` entry of
    the config is used instead. A ``None`` result from ``registry.get`` is
    treated as a miss.
    """
    langs = languages or get_config()["ocr_languages"]
    model = registry.get(_registry_key(langs))
    if model is None:
        model = _load(langs)
    return model


def _parse_raw(raw) -> list[tuple[list, str, float]]:
    """Parse PaddleOCR output into ``(points, text, confidence)`` tuples.

    PaddleOCR 3.x changed the result format. Two known layouts:

    Layout A - dict-based (new pipeline API)::

        raw = [{'rec_texts': [...], 'rec_scores': [...], 'dt_polys': [...]}]

    Layout B - nested list (2.x compat / some 3.x builds)::

        raw = [[ [points, [text, score]], ... ]]
        raw = [[ [points, [text, score], [cls, cls_score]], ... ]]  # with angle cls

    A ``None`` *raw* and empty/``None`` pages yield no results. Layout B
    entries that do not carry a ``[text, score]`` pair are logged and skipped.
    """
    results: list[tuple[list, str, float]] = []
    if raw is None:
        return results

    for page in raw:
        if not page:
            continue

        # Layout A: dict with parallel lists. zip() stops at the shortest
        # list, so unmatched trailing entries are dropped.
        if isinstance(page, dict):
            texts = page.get("rec_texts", [])
            scores = page.get("rec_scores", [])
            polys = page.get("dt_polys", [])
            for points, text, confidence in zip(polys, texts, scores):
                results.append((points, str(text), float(confidence)))
            continue

        # Layout B: list of per-line entries.
        for line in page:
            if not line:
                continue
            # line[0] is the polygon, line[1] is [text, score]; any extra
            # elements (angle cls etc.) are ignored.
            rec = line[1] if len(line) >= 2 else None
            if not isinstance(rec, (list, tuple)) or len(rec) < 2:
                logger.warning("Unexpected OCR line format: %s", line)
                continue
            results.append((line[0], str(rec[0]), float(rec[1])))

    return results


def _bbox_xywh(points) -> list[int] | None:
    """Return ``[x, y, width, height]`` enclosing *points*, or ``None`` if empty.

    Each point is indexed as ``p[0]`` (x) and ``p[1]`` (y).
    """
    if points is None:
        return None
    xs = [p[0] for p in points]
    ys = [p[1] for p in points]
    if not xs:
        return None
    left, top = min(xs), min(ys)
    return [int(left), int(top), int(max(xs) - left), int(max(ys) - top)]


def ocr(
    image,
    languages: list[str] | None = None,
    min_confidence: float | None = None,
) -> list[dict]:
    """Run OCR on an image, return list of text result dicts.

    Args:
        image: Input forwarded unchanged to ``model.ocr``.
        languages: Languages selecting the model; falls back to the config
            when ``None`` or empty.
        min_confidence: Results scoring below this are dropped. Defaults to
            the ``"ocr_min_confidence"`` config entry when ``None``.

    Returns:
        One dict per kept result with keys ``"text"``, ``"confidence"`` and
        ``"bbox"`` (``[x, y, width, height]`` as ints). Results without any
        polygon points are logged and skipped.
    """
    cfg = get_config()
    min_conf = min_confidence if min_confidence is not None else cfg["ocr_min_confidence"]

    model = _get(languages)
    raw = model.ocr(image)
    logger.debug("OCR raw: %s", raw)

    results = []
    for points, text, confidence in _parse_raw(raw):
        if confidence < min_conf:
            continue
        bbox = _bbox_xywh(points)
        if bbox is None:
            # Without a polygon there is nothing to build a bounding box from.
            logger.warning("Skipping OCR result without polygon points: %r", text)
            continue
        results.append({
            "text": text,
            "confidence": confidence,
            "bbox": bbox,
        })
    return results