add tester ui and restructure folders

2026-05-13 17:00:00 -03:00
parent 7c5aa14409
commit 6652cb26e6
17 changed files with 2656 additions and 1251 deletions

View File

@@ -0,0 +1 @@
{}

View File

@@ -0,0 +1,81 @@
import asyncio
import json
import os
import uuid

import aioboto3
import aiofiles

BUCKET = os.environ.get("BUCKET_NAME", "my-company-reports-bucket")
PREFIX = os.environ.get("PREFIX", "2026/04/")
EXPIRY = int(os.environ.get("URL_EXPIRY_SECONDS", "900"))
ENDPOINT = os.environ.get("S3_ENDPOINT_URL") or None
QUEUE_MAX = int(os.environ.get("QUEUE_MAX", "2000"))

_DONE = object()  # sentinel: tells the consumer the producer has finished


async def _run():
    session = aioboto3.Session()
    async with session.client("s3", endpoint_url=ENDPOINT) as s3:
        queue: asyncio.Queue = asyncio.Queue(maxsize=QUEUE_MAX)
        manifest_path = f"/tmp/{uuid.uuid4()}.jsonl"

        async def producer():
            # List every .pdf key under PREFIX and feed it to the queue.
            paginator = s3.get_paginator("list_objects_v2")
            try:
                async for page in paginator.paginate(
                    Bucket=BUCKET,
                    Prefix=PREFIX,
                    PaginationConfig={"PageSize": 100},
                ):
                    for obj in page.get("Contents", []) or []:
                        key = obj["Key"]
                        if key.lower().endswith(".pdf"):
                            await queue.put(key)
            finally:
                # Always unblock the consumer, even if listing fails.
                await queue.put(_DONE)

        async def consumer():
            # Presign each queued key and append one JSON line per object.
            count = 0
            async with aiofiles.open(manifest_path, "w") as f:
                while True:
                    item = await queue.get()
                    if item is _DONE:
                        break
                    url = await s3.generate_presigned_url(
                        "get_object",
                        Params={"Bucket": BUCKET, "Key": item},
                        ExpiresIn=EXPIRY,
                    )
                    await f.write(json.dumps({"key": item, "url": url}) + "\n")
                    count += 1
            return count

        prod_task = asyncio.create_task(producer())
        count = await consumer()
        await prod_task  # re-raise any listing error from the producer

        # Upload the finished manifest, then presign it for the caller.
        manifest_key = f"manifests/{uuid.uuid4()}.jsonl"
        async with aiofiles.open(manifest_path, "rb") as f:
            body = await f.read()
        await s3.put_object(
            Bucket=BUCKET,
            Key=manifest_key,
            Body=body,
            ContentType="application/x-ndjson",
        )
        manifest_url = await s3.generate_presigned_url(
            "get_object",
            Params={"Bucket": BUCKET, "Key": manifest_key},
            ExpiresIn=EXPIRY,
        )
        os.unlink(manifest_path)
        return {
            "count": count,
            "manifest_key": manifest_key,
            "manifest_url": manifest_url,
        }


def handler(event, context):
    result = asyncio.run(_run())
    return {"statusCode": 200, "body": json.dumps(result)}

View File

@@ -0,0 +1,6 @@
# Deps for the sign_pdfs lambda. Bundled into its deployment zip when
# uploading to AWS; locally, the runner pod installs the union of all
# per-function requirements (see Dockerfile.lambda).
aioboto3>=15.0 # async S3 client used in handler.py
aiofiles>=23.2 # async file I/O for the JSONL manifest in /tmp
boto3>=1.40 # sync S3 client used by seed.py (data setup utility)
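
The "union of all per-function requirements" install could look roughly like the sketch below. This is a hypothetical helper, not the actual Dockerfile.lambda (which is not part of this diff), and the functions/*/requirements.txt layout is an assumption; handing every file to a single pip call makes pip resolve the combined set at once, so conflicting pins across functions fail fast.

# install_union.py -- hypothetical sketch of the union install.
# Assumes per-function folders like functions/<name>/requirements.txt.
import pathlib
import subprocess
import sys

req_files = sorted(pathlib.Path("functions").glob("*/requirements.txt"))
cmd = [sys.executable, "-m", "pip", "install"]
for req in req_files:
    cmd += ["-r", str(req)]
subprocess.check_call(cmd)  # one resolver run over all requirement files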

View File

@@ -0,0 +1,76 @@
import os
import sys

import boto3
from botocore.client import Config
from botocore.exceptions import ClientError

BUCKET = os.environ.get("BUCKET_NAME", "my-company-reports-bucket")
PREFIX = os.environ.get("PREFIX", "2026/04/")
ENDPOINT = os.environ.get("S3_ENDPOINT_URL", "http://localhost:9000")

# Non-PDF extensions to upload as decoys. Empty by default: str.endswith(())
# is always False, so only .pdf files are seeded until this tuple is populated.
DECOY_EXTS = ()


def _client():
    return boto3.client(
        "s3",
        endpoint_url=ENDPOINT,
        aws_access_key_id=os.environ.get("AWS_ACCESS_KEY_ID", "minioadmin"),
        aws_secret_access_key=os.environ.get("AWS_SECRET_ACCESS_KEY", "minioadmin"),
        region_name=os.environ.get("AWS_REGION", "us-east-1"),
        config=Config(signature_version="s3v4"),
    )


def _ensure_bucket(s3, name):
    # head_bucket raises ClientError when the bucket is missing; in the
    # local MinIO setup that means "create it".
    try:
        s3.head_bucket(Bucket=name)
    except ClientError:
        s3.create_bucket(Bucket=name)


def _walk(source_dir):
    for root, _, files in os.walk(source_dir):
        for name in files:
            yield os.path.join(root, name)


def main():
    source_dir = sys.argv[1] if len(sys.argv) > 1 else os.environ.get("SOURCE_DIR")
    if not source_dir:
        print("usage: SOURCE_DIR=<path> python seed.py (or pass as argv[1])", file=sys.stderr)
        sys.exit(2)
    if not os.path.isdir(source_dir):
        print(f"not a directory: {source_dir}", file=sys.stderr)
        sys.exit(2)

    s3 = _client()
    _ensure_bucket(s3, BUCKET)

    pdf_n = decoy_n = 0
    for path in _walk(source_dir):
        lower = path.lower()
        is_pdf = lower.endswith(".pdf")
        is_decoy = lower.endswith(DECOY_EXTS)
        if not (is_pdf or is_decoy):
            continue
        # Mirror the file's relative path under PREFIX, with POSIX separators.
        rel = os.path.relpath(path, source_dir).replace(os.sep, "/")
        key = f"{PREFIX}{rel}"
        try:
            s3.upload_file(path, BUCKET, key)
        except (ClientError, OSError) as exc:
            print(f" skip {path}: {exc}", file=sys.stderr)
            continue
        if is_pdf:
            pdf_n += 1
        else:
            decoy_n += 1
        if (pdf_n + decoy_n) % 100 == 0:
            print(f" uploaded {pdf_n} pdfs / {decoy_n} decoys ...")

    print(f"done: {pdf_n} pdfs and {decoy_n} decoys uploaded to s3://{BUCKET}/{PREFIX}")


if __name__ == "__main__":
    main()
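
To sanity-check the whole flow, the manifest returned by the handler can be fetched and a few presigned entries spot-checked. A sketch, not part of this commit: verify_manifest.py is a hypothetical name, it uses only the stdlib, and it must run within the URL expiry window (900 seconds by default).

# verify_manifest.py -- hypothetical end-to-end check.
import json
import urllib.request

def verify(manifest_url, sample=3):
    # The manifest is JSONL: one {"key": ..., "url": ...} object per line.
    with urllib.request.urlopen(manifest_url) as resp:
        lines = resp.read().decode().splitlines()
    for line in lines[:sample]:
        entry = json.loads(line)
        with urllib.request.urlopen(entry["url"]) as obj:
            head = obj.read(5)
        # Every listed object should be a PDF; %PDF- is the format's magic.
        print(entry["key"], "ok" if head == b"%PDF-" else f"unexpected {head!r}")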