# mediaproc/gpu/models/ocr.py

"""PaddleOCR 3.x text extraction wrapper."""

from __future__ import annotations

import logging

from models import registry
from config import get_config

# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)


def _load(languages: list[str]):
    from paddleocr import PaddleOCR
    key = f"ocr_{'_'.join(languages)}"
    model = PaddleOCR(lang=languages[0])
    registry.put(key, model)
    return model


def _get(languages: list[str] | None = None):
    """Return the OCR model for *languages*, loading it on a registry miss.

    When *languages* is ``None`` or empty, the ``ocr_languages`` entry of the
    configuration is used instead. The registry is queried with the key
    ``ocr_`` followed by the language codes joined with underscores.
    """
    selected = languages if languages else get_config()["ocr_languages"]
    cached = registry.get(f"ocr_{'_'.join(selected)}")
    if cached is not None:
        return cached
    return _load(selected)


def _parse_raw(raw) -> list[tuple[list, str, float]]:
    """
    Parse PaddleOCR output into (points, text, confidence) tuples.

    PaddleOCR 3.x changed the result format. Two known layouts:

    Layout A — dict-based (new pipeline API):
        raw = [{'rec_texts': [...], 'rec_scores': [...], 'dt_polys': [...]}]

    Layout B — nested list (2.x compat / some 3.x builds):
        raw = [[  [points, [text, score]], ...  ]]
        raw = [[  [points, [text, score], [cls, cls_score]], ...  ]]  # with angle cls
    """
    results = []

    for page in raw:
        if not page:
            continue

        # Layout A: dict with parallel lists
        if isinstance(page, dict):
            texts  = page.get("rec_texts", [])
            scores = page.get("rec_scores", [])
            polys  = page.get("dt_polys", [])
            for points, text, confidence in zip(polys, texts, scores):
                results.append((points, text, float(confidence)))
            continue

        # Layout B: list of per-line entries
        for line in page:
            if not line:
                continue

            # line[0] is always the polygon points
            points = line[0]

            # line[1] is [text, score] — ignore any extra elements (angle cls etc.)
            rec = line[1]
            if isinstance(rec, (list, tuple)) and len(rec) >= 2:
                text, confidence = rec[0], rec[1]
            else:
                logger.warning("Unexpected OCR line format: %s", line)
                continue

            results.append((points, str(text), float(confidence)))

    return results


def ocr(image, languages: list[str] | None = None, min_confidence: float | None = None) -> list[dict]:
    """Run OCR on an image, return list of text result dicts.

    Args:
        image: Input handed unchanged to the model's ``ocr`` method.
        languages: Language codes used to select the model; when ``None`` or
            empty the configured ``ocr_languages`` are used.
        min_confidence: Results with a confidence below this value are
            dropped. When ``None`` the configured ``ocr_min_confidence``
            applies.

    Returns:
        One dict per kept line with the keys ``"text"``, ``"confidence"``
        and ``"bbox"``. ``bbox`` is ``[x, y, width, height]`` of the
        axis-aligned box around the polygon points, truncated to ints.
    """
    cfg = get_config()
    min_conf = min_confidence if min_confidence is not None else cfg["ocr_min_confidence"]
    model = _get(languages)

    raw = model.ocr(image)
    logger.debug("OCR raw: %s", raw)

    results = []
    for points, text, confidence in _parse_raw(raw):
        if confidence < min_conf:
            continue

        # min()/max() raise ValueError on an empty sequence; one degenerate
        # polygon should not discard every other result for the image.
        if points is None or len(points) == 0:
            logger.warning("Skipping OCR result without polygon points: %r", text)
            continue

        xs = [p[0] for p in points]
        ys = [p[1] for p in points]
        left, top = min(xs), min(ys)

        results.append({
            "text": text,
            "confidence": confidence,
            "bbox": [int(left), int(top),
                     int(max(xs) - left), int(max(ys) - top)],
        })

    return results