metadata pipeline via Step Functions

2026-05-18 07:59:13 -03:00
parent d3008676e0
commit e297f97e18
6 changed files with 280 additions and 0 deletions
--- a/functions/list_pdfs/handler.py
+++ b/functions/list_pdfs/handler.py
@@ -0,0 +1,32 @@
+"""List every .pdf key under the configured bucket+prefix.
+
+Output:
+    {"keys": ["2026/04/a.pdf", ...], "count": <len(keys)>, "pages": <list pages read>}
+
+Used as the first state in the metadata-index pipeline. The "keys" array
+feeds an SFN Map state that runs ExtractMetadata in parallel per key.
+"""
+import os
+
+import boto3
+
+_s3 = boto3.client("s3", endpoint_url=os.environ.get("S3_ENDPOINT_URL") or None)  # unset or "" -> None
+
+
+def handler(event, context):
+    """Return {"keys", "count", "pages"} for every .pdf object under bucket+prefix."""
+    # Event fields take precedence; a missing or falsy one falls back to the env.
+    bucket_name = event.get("bucket") or os.environ["BUCKET_NAME"]
+    key_prefix = event.get("prefix") or os.environ["PREFIX"]
+
+    listing = _s3.get_paginator("list_objects_v2").paginate(
+        Bucket=bucket_name, Prefix=key_prefix, PaginationConfig={"PageSize": 1000}
+    )
+    pdf_keys = []
+    page_count = 0
+    for page_count, page in enumerate(listing, start=1):
+        # Tolerate a page whose "Contents" entry is missing or falsy.
+        for entry in page.get("Contents") or []:
+            if entry["Key"].lower().endswith(".pdf"):
+                pdf_keys.append(entry["Key"])
+    return {"keys": pdf_keys, "count": len(pdf_keys), "pages": page_count}