66 lines
1.8 KiB
Python
66 lines
1.8 KiB
Python
"""Extract metadata from a single PDF in S3.
|
|
|
|

Called once per PDF by the SFN Map state — input is one S3 key, output is
that PDF's metadata. Designed for parallel invocation.

Input event:
    {"key": "2026/04/document.pdf"}     (bucket from BUCKET_NAME env var)
    {"bucket": "...", "key": "..."}     (full override)

Output:
    {"key": "...", "pages": N, "title": "..."|null, "author": "..."|null,
     "size_bytes": N}

A failed parse (corrupt PDF, unsupported encryption) returns a row with
pages=0 and an "error" field — the Map state continues with the rest;
the bad PDF shows up later in the aggregate as a parse error count.
|
|
"""
|
|
import io
|
|
import os
|
|
|
|
import boto3
|
|
from pypdf import PdfReader
|
|
from pypdf.errors import PdfReadError
|
|
|
|
# Module-level S3 client, created once when the module is imported and reused
# by every handler() call. If S3_ENDPOINT_URL is set to a non-empty value it is
# passed as the endpoint; unset or empty collapses to None via `or None`.
_s3 = boto3.client("s3", endpoint_url=os.environ.get("S3_ENDPOINT_URL") or None)
|
|
|
|
|
|
def _clean(value) -> str | None:
|
|
if value is None:
|
|
return None
|
|
s = str(value).strip()
|
|
return s or None
|
|
|
|
|
|
def handler(event, context) -> dict:
    """Download one PDF from S3 and return a metadata row for it.

    Args:
        event: Mapping with a required "key" and an optional "bucket". When
            "bucket" is absent or falsy, the BUCKET_NAME environment variable
            is used instead.
        context: Invocation context; not used by this function.

    Returns:
        A dict with "key", "pages", "title", "author" and "size_bytes"
        (the length of the downloaded body). If opening or reading the PDF
        raises, "pages" is 0, "title" and "author" are None, and an extra
        "error" field holds "<ExceptionType>: <message>".

    Raises:
        KeyError: If the event has no "key", or no bucket is given and
            BUCKET_NAME is not set.

    Exceptions raised by the S3 download are not caught here and propagate
    to the caller.
    """
    bucket = event.get("bucket") or os.environ["BUCKET_NAME"]
    key = event["key"]

    body = _s3.get_object(Bucket=bucket, Key=key)["Body"].read()

    # Start from the failure-shaped row so the success and error paths share a
    # single definition of the output keys and their order.
    row = {
        "key": key,
        "pages": 0,
        "title": None,
        "author": None,
        "size_bytes": len(body),
    }

    try:
        reader = PdfReader(io.BytesIO(body))
        pages = len(reader.pages)
        info = reader.metadata or {}
        title = _clean(info.get("/Title"))
        author = _clean(info.get("/Author"))
    except Exception as exc:
        # Deliberately broad: any parse failure becomes an error row instead
        # of failing the invocation. PdfReadError derives from Exception, so
        # naming it alongside Exception added nothing.
        row["error"] = f"{type(exc).__name__}: {exc}"
        return row

    row.update(pages=pages, title=title, author=author)
    return row
|