# lambda_studio/functions/extract_metadata/handler.py

"""Extract metadata from a single PDF in S3.

Called once per PDF by the SFN Map state — input is one S3 key, output is
that PDF's metadata. Designed for parallel invocation.

Input event:
    {"key": "2026/04/document.pdf"}     (bucket from BUCKET_NAME env var)
    {"bucket": "...", "key": "..."}     (full override)

Output:
    {"key": "...", "pages": N, "title": "..."|null, "author": "..."|null,
     "size_bytes": N}

A failed parse (corrupt PDF, unsupported encryption) returns a row with
pages=0 and an "error" field — the Map state continues with the rest;
the bad PDF shows up later in the aggregate as a parse error count.
"""
import io
import os

import boto3
from pypdf import PdfReader
from pypdf.errors import PdfReadError

# Shared S3 client, created once when the module is imported. An unset or
# empty S3_ENDPOINT_URL is coerced to None, so endpoint_url is passed as None
# rather than as an empty string.
_s3 = boto3.client("s3", endpoint_url=os.environ.get("S3_ENDPOINT_URL") or None)


def _clean(value) -> str | None:
    if value is None:
        return None
    s = str(value).strip()
    return s or None


def handler(event, context):
    """Fetch one PDF from S3 and report its page count and basic metadata.

    Args:
        event: Mapping with a required "key" and an optional "bucket". When
            "bucket" is absent or falsy, the BUCKET_NAME environment
            variable is used instead.
        context: Invocation context; not used.

    Returns:
        A dict with "key", "pages", "title", "author" and "size_bytes". If
        anything raises while the PDF is being parsed, "pages" is 0,
        "title" and "author" are None, and an extra "error" entry holds
        "<ExceptionClass>: <message>".

    Raises:
        KeyError: If no bucket is supplied and BUCKET_NAME is unset, or if
            "key" is missing from the event.

    Exceptions raised by the S3 download itself are not caught here.
    """
    source_bucket = event.get("bucket") or os.environ["BUCKET_NAME"]
    object_key = event["key"]

    payload = _s3.get_object(Bucket=source_bucket, Key=object_key)["Body"].read()

    # Start from the failure-shaped row; a successful parse overwrites the
    # placeholder fields below, a failed one only appends "error".
    row = {
        "key": object_key,
        "pages": 0,
        "title": None,
        "author": None,
        "size_bytes": len(payload),
    }

    try:
        pdf = PdfReader(io.BytesIO(payload))
        page_count = len(pdf.pages)
        doc_info = pdf.metadata or {}
        parsed = {
            "pages": page_count,
            "title": _clean(doc_info.get("/Title")),
            "author": _clean(doc_info.get("/Author")),
        }
    except (PdfReadError, Exception) as exc:
        # Deliberately broad: any failure while parsing is reported in the
        # returned row instead of being raised to the caller.
        row["error"] = f"{type(exc).__name__}: {exc}"
    else:
        row.update(parsed)

    return row