From 116d4032e210b3d60bf80dfafc433f895144bcf9 Mon Sep 17 00:00:00 2001
From: buenosairesam <pensalo@gmail.com>
Date: Mon, 29 Dec 2025 14:40:06 -0300
Subject: [PATCH] first claude draft

---
 .gitignore                                    |   1 +
 .woodpecker.yml                               | 184 +++++++
 .woodpecker/build.yml                         |  43 ++
 .woodpecker/deploy.yml                        |  61 +++
 .woodpecker/test.yml                          |  40 ++
 CLAUDE.md                                     | 492 ++++++++++++++++++
 Tiltfile                                      | 119 +++++
 ctlptl.yaml                                   |  32 ++
 docker-compose.override.yml                   |  48 ++
 docker-compose.yml                            | 154 ++++++
 docs/architecture/01-system-overview.dot      |  78 +++
 docs/architecture/01-system-overview.svg      | 193 +++++++
 docs/architecture/02-data-flow.dot            |  83 +++
 docs/architecture/02-data-flow.svg            | 217 ++++++++
 docs/architecture/03-deployment.dot           |  95 ++++
 docs/architecture/03-deployment.svg           | 221 ++++++++
 docs/architecture/04-grpc-services.dot        |  67 +++
 docs/architecture/04-grpc-services.svg        | 171 ++++++
 docs/architecture/graph.html                  | 120 +++++
 docs/architecture/index.html                  | 207 ++++++++
 docs/architecture/styles.css                  | 343 ++++++++++++
 infra/aws/lambdas/aggregator/placeholder.txt  |   1 +
 infra/aws/lambdas/aggregator/placeholder.zip  | Bin 0 -> 192 bytes
 infra/aws/lambdas/compactor/placeholder.txt   |   1 +
 infra/aws/lambdas/compactor/placeholder.zip   | Bin 0 -> 192 bytes
 infra/aws/terraform/ec2.tf                    | 148 ++++++
 infra/aws/terraform/lambda.tf                 | 203 ++++++++
 infra/aws/terraform/main.tf                   |  58 +++
 infra/aws/terraform/outputs.tf                |  36 ++
 infra/aws/terraform/terraform.tfvars.example  |  16 +
 infra/aws/terraform/variables.tf              |  70 +++
 k8s/base/aggregator/configmap.yaml            |  15 +
 k8s/base/aggregator/deployment.yaml           |  46 ++
 k8s/base/aggregator/kustomization.yaml        |  11 +
 k8s/base/aggregator/service.yaml              |  11 +
 k8s/base/alerts/configmap.yaml                |  14 +
 k8s/base/alerts/deployment.yaml               |  33 ++
 k8s/base/alerts/kustomization.yaml            |  10 +
 k8s/base/gateway/configmap.yaml               |  16 +
 k8s/base/gateway/deployment.yaml              |  48 ++
 k8s/base/gateway/kustomization.yaml           |  11 +
 k8s/base/gateway/service.yaml                 |  11 +
 k8s/base/kustomization.yaml                   |  17 +
 k8s/base/namespace.yaml                       |   6 +
 k8s/base/redis/deployment.yaml                |  37 ++
 k8s/base/redis/kustomization.yaml             |  10 +
 k8s/base/redis/service.yaml                   |  11 +
 k8s/base/timescaledb/configmap.yaml           |  94 ++++
 k8s/base/timescaledb/kustomization.yaml       |  11 +
 k8s/base/timescaledb/service.yaml             |  12 +
 k8s/base/timescaledb/statefulset.yaml         |  65 +++
 k8s/overlays/local/kustomization.yaml         |  22 +
 .../local/patches/reduce-resources.yaml       |  50 ++
 k8s/overlays/local/secrets.yaml               |   8 +
 proto/metrics.proto                           | 159 ++++++
 scripts/generate-diagrams.sh                  |  22 +
 scripts/init-db.sql                           | 158 ++++++
 services/aggregator/Dockerfile                |  47 ++
 services/aggregator/requirements.txt          |   9 +
 services/alerts/Dockerfile                    |  35 ++
 services/alerts/requirements.txt              |   6 +
 services/collector/Dockerfile                 |  55 ++
 services/collector/requirements.txt           |   7 +
 services/gateway/Dockerfile                   |  44 ++
 services/gateway/requirements.txt             |  13 +
 shared/events/__init__.py                     |  34 ++
 shared/events/base.py                         | 117 +++++
 shared/events/factory.py                      | 101 ++++
 shared/events/redis_pubsub.py                 | 142 +++++
 69 files changed, 5020 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .woodpecker.yml
 create mode 100644 .woodpecker/build.yml
 create mode 100644 .woodpecker/deploy.yml
 create mode 100644 .woodpecker/test.yml
 create mode 100644 CLAUDE.md
 create mode 100644 Tiltfile
 create mode 100644 ctlptl.yaml
 create mode 100644 docker-compose.override.yml
 create mode 100644 docker-compose.yml
 create mode 100644 docs/architecture/01-system-overview.dot
 create mode 100644 docs/architecture/01-system-overview.svg
 create mode 100644 docs/architecture/02-data-flow.dot
 create mode 100644 docs/architecture/02-data-flow.svg
 create mode 100644 docs/architecture/03-deployment.dot
 create mode 100644 docs/architecture/03-deployment.svg
 create mode 100644 docs/architecture/04-grpc-services.dot
 create mode 100644 docs/architecture/04-grpc-services.svg
 create mode 100644 docs/architecture/graph.html
 create mode 100644 docs/architecture/index.html
 create mode 100644 docs/architecture/styles.css
 create mode 100644 infra/aws/lambdas/aggregator/placeholder.txt
 create mode 100644 infra/aws/lambdas/aggregator/placeholder.zip
 create mode 100644 infra/aws/lambdas/compactor/placeholder.txt
 create mode 100644 infra/aws/lambdas/compactor/placeholder.zip
 create mode 100644 infra/aws/terraform/ec2.tf
 create mode 100644 infra/aws/terraform/lambda.tf
 create mode 100644 infra/aws/terraform/main.tf
 create mode 100644 infra/aws/terraform/outputs.tf
 create mode 100644 infra/aws/terraform/terraform.tfvars.example
 create mode 100644 infra/aws/terraform/variables.tf
 create mode 100644 k8s/base/aggregator/configmap.yaml
 create mode 100644 k8s/base/aggregator/deployment.yaml
 create mode 100644 k8s/base/aggregator/kustomization.yaml
 create mode 100644 k8s/base/aggregator/service.yaml
 create mode 100644 k8s/base/alerts/configmap.yaml
 create mode 100644 k8s/base/alerts/deployment.yaml
 create mode 100644 k8s/base/alerts/kustomization.yaml
 create mode 100644 k8s/base/gateway/configmap.yaml
 create mode 100644 k8s/base/gateway/deployment.yaml
 create mode 100644 k8s/base/gateway/kustomization.yaml
 create mode 100644 k8s/base/gateway/service.yaml
 create mode 100644 k8s/base/kustomization.yaml
 create mode 100644 k8s/base/namespace.yaml
 create mode 100644 k8s/base/redis/deployment.yaml
 create mode 100644 k8s/base/redis/kustomization.yaml
 create mode 100644 k8s/base/redis/service.yaml
 create mode 100644 k8s/base/timescaledb/configmap.yaml
 create mode 100644 k8s/base/timescaledb/kustomization.yaml
 create mode 100644 k8s/base/timescaledb/service.yaml
 create mode 100644 k8s/base/timescaledb/statefulset.yaml
 create mode 100644 k8s/overlays/local/kustomization.yaml
 create mode 100644 k8s/overlays/local/patches/reduce-resources.yaml
 create mode 100644 k8s/overlays/local/secrets.yaml
 create mode 100644 proto/metrics.proto
 create mode 100755 scripts/generate-diagrams.sh
 create mode 100644 scripts/init-db.sql
 create mode 100644 services/aggregator/Dockerfile
 create mode 100644 services/aggregator/requirements.txt
 create mode 100644 services/alerts/Dockerfile
 create mode 100644 services/alerts/requirements.txt
 create mode 100644 services/collector/Dockerfile
 create mode 100644 services/collector/requirements.txt
 create mode 100644 services/gateway/Dockerfile
 create mode 100644 services/gateway/requirements.txt
 create mode 100644 shared/events/__init__.py
 create mode 100644 shared/events/base.py
 create mode 100644 shared/events/factory.py
 create mode 100644 shared/events/redis_pubsub.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..24c5735
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+def
diff --git a/.woodpecker.yml b/.woodpecker.yml
new file mode 100644
index 0000000..06dd3c6
--- /dev/null
+++ b/.woodpecker.yml
@@ -0,0 +1,184 @@
+# Woodpecker CI Pipeline (single-file variant: lint/test -> build -> push -> deploy -> notify)
+# https://woodpecker-ci.org/docs/usage/pipeline-syntax
+# NOTE(review): this commit also adds .woodpecker/*.yml; Woodpecker's default config lookup presumably prefers that directory, leaving this file unused — confirm.
+variables:
+  - &python_image python:3.11-slim  # anchor reused as `image: *python_image` by the lint/test steps
+  - &docker_image docker:24-dind  # anchor reused as `image: *docker_image` by the build-* and push-images steps
+
+# Clone settings
+clone:
+  git:
+    image: woodpeckerci/plugin-git
+    settings:
+      depth: 50  # shallow clone limited to the most recent 50 commits
+
+# Pipeline steps
+steps:
+  # ==========================================================================
+  # Lint and Test
+  # ==========================================================================
+
+  lint:  # ruff lint + format check over services/ and shared/
+    image: *python_image
+    commands:  # mypy is installed below but never invoked in this step
+      - pip install ruff mypy
+      - ruff check services/ shared/
+      - ruff format --check services/ shared/
+    when:
+      event: [push, pull_request]
+
+  test-shared:  # pytest over shared/; a non-zero pytest exit fails the step
+    image: *python_image
+    commands:  # `|| true` lets the step continue when shared/events/requirements.txt is missing or fails to install
+      - pip install pytest pytest-asyncio redis asyncpg
+      - pip install -r shared/events/requirements.txt || true
+      - pytest shared/ -v --tb=short
+    when:
+      event: [push, pull_request]
+
+  test-services:  # NOTE(review): `pytest ... || true` means this step can never fail on test errors
+    image: *python_image
+    commands:  # installs each service's requirements.txt when the file exists, then runs pytest over services/
+      - pip install pytest pytest-asyncio grpcio grpcio-tools
+      - |
+        for svc in collector aggregator gateway alerts; do
+          if [ -f "services/$svc/requirements.txt" ]; then
+            pip install -r "services/$svc/requirements.txt"
+          fi
+        done
+      - pytest services/ -v --tb=short || true
+    when:
+      event: [push, pull_request]
+
+  # ==========================================================================
+  # Build Docker Images
+  # ==========================================================================
+
+  build-aggregator:  # build the aggregator image (Dockerfile target `production`, repo root as context)
+    image: *docker_image
+    commands:  # first tag is the 7-character commit SHA; the second command aliases it as :latest
+      - docker build -t sysmonstm/aggregator:${CI_COMMIT_SHA:0:7} -f services/aggregator/Dockerfile --target production .
+      - docker tag sysmonstm/aggregator:${CI_COMMIT_SHA:0:7} sysmonstm/aggregator:latest
+    volumes:  # host Docker socket is mounted, so the build runs on the host daemon; NOTE(review): volumes presumably need a trusted repo — confirm
+      - /var/run/docker.sock:/var/run/docker.sock
+    when:
+      event: push
+      branch: main
+
+  build-gateway:  # same recipe as build-aggregator, for services/gateway
+    image: *docker_image
+    commands:
+      - docker build -t sysmonstm/gateway:${CI_COMMIT_SHA:0:7} -f services/gateway/Dockerfile --target production .
+      - docker tag sysmonstm/gateway:${CI_COMMIT_SHA:0:7} sysmonstm/gateway:latest
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+    when:
+      event: push
+      branch: main
+
+  build-collector:  # same recipe as build-aggregator, for services/collector
+    image: *docker_image
+    commands:
+      - docker build -t sysmonstm/collector:${CI_COMMIT_SHA:0:7} -f services/collector/Dockerfile --target production .
+      - docker tag sysmonstm/collector:${CI_COMMIT_SHA:0:7} sysmonstm/collector:latest
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+    when:
+      event: push
+      branch: main
+
+  build-alerts:  # same recipe as build-aggregator, for services/alerts
+    image: *docker_image
+    commands:
+      - docker build -t sysmonstm/alerts:${CI_COMMIT_SHA:0:7} -f services/alerts/Dockerfile --target production .
+      - docker tag sysmonstm/alerts:${CI_COMMIT_SHA:0:7} sysmonstm/alerts:latest
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock
+    when:
+      event: push
+      branch: main
+
+  # ==========================================================================
+  # Push to Registry
+  # ==========================================================================
+
+  push-images:  # log in to the registry, then retag and push every service image under the short SHA and :latest
+    image: *docker_image
+    commands:  # the SHA-tagged registry image is derived from the SHA-tagged local image, not from :latest, so it always matches this commit
+      - echo "$REGISTRY_PASSWORD" | docker login -u "$REGISTRY_USER" --password-stdin "$REGISTRY_URL"
+      - |
+        for img in aggregator gateway collector alerts; do
+          docker tag sysmonstm/$img:${CI_COMMIT_SHA:0:7} $REGISTRY_URL/sysmonstm/$img:${CI_COMMIT_SHA:0:7}
+          docker tag sysmonstm/$img:latest $REGISTRY_URL/sysmonstm/$img:latest
+          docker push $REGISTRY_URL/sysmonstm/$img:${CI_COMMIT_SHA:0:7}
+          docker push $REGISTRY_URL/sysmonstm/$img:latest
+        done
+    secrets: [registry_user, registry_password, registry_url]  # read as $REGISTRY_USER / $REGISTRY_PASSWORD / $REGISTRY_URL above
+    volumes:  # shares the host Docker daemon with the build-* steps, which is where the local images live
+      - /var/run/docker.sock:/var/run/docker.sock
+    when:
+      event: push
+      branch: main
+
+  # ==========================================================================
+  # Deploy to EC2
+  # ==========================================================================
+
+  deploy-staging:  # SSH into the deploy host and refresh the docker-compose stack in place
+    image: appleboy/drone-ssh
+    settings:  # host, user and private key all come from pipeline secrets
+      host:
+        from_secret: deploy_host
+      username:
+        from_secret: deploy_user
+      key:
+        from_secret: deploy_key
+      script:  # runs on the remote host: update the checkout, pull images, restart, then prune unused Docker data
+        - cd /home/ec2-user/sysmonstm
+        - git pull origin main
+        - docker-compose pull
+        - docker-compose up -d --remove-orphans
+        - docker system prune -f
+    when:
+      event: push
+      branch: main
+
+  # ==========================================================================
+  # Notifications
+  # ==========================================================================
+
+  notify-success:  # POST a JSON summary to the webhook when a push to main succeeds
+    image: plugins/webhook
+    settings:
+      urls:
+        from_secret: webhook_url
+      content_type: application/json
+      template: |
+        {
+          "text": "✅ Build succeeded: ${CI_REPO_NAME}#${CI_BUILD_NUMBER}",
+          "commit": "${CI_COMMIT_SHA:0:7}",
+          "branch": "${CI_COMMIT_BRANCH}",
+          "author": "${CI_COMMIT_AUTHOR}"
+        }
+    when:  # all three conditions are listed together: success status, push event, main branch
+      status: success
+      event: push
+      branch: main
+
+  notify-failure:  # same payload shape as notify-success, sent when the pipeline status is failure
+    image: plugins/webhook
+    settings:
+      urls:
+        from_secret: webhook_url
+      content_type: application/json
+      template: |
+        {
+          "text": "❌ Build failed: ${CI_REPO_NAME}#${CI_BUILD_NUMBER}",
+          "commit": "${CI_COMMIT_SHA:0:7}",
+          "branch": "${CI_COMMIT_BRANCH}",
+          "author": "${CI_COMMIT_AUTHOR}"
+        }
+    when:
+      status: failure
+      event: push
+      branch: main
diff --git a/.woodpecker/build.yml b/.woodpecker/build.yml
new file mode 100644
index 0000000..7907243
--- /dev/null
+++ b/.woodpecker/build.yml
@@ -0,0 +1,43 @@
+# Woodpecker CI - Build Pipeline (runs on main branch pushes)
+
+steps:
+  build-images:  # build all four service images (production target), then alias each one as :latest
+    image: docker:24-dind
+    commands:  # every image is tagged with the 7-character commit SHA first
+      - echo "=== Building Docker images ==="
+      - docker build -t sysmonstm/aggregator:${CI_COMMIT_SHA:0:7} -f services/aggregator/Dockerfile --target production .
+      - docker build -t sysmonstm/gateway:${CI_COMMIT_SHA:0:7} -f services/gateway/Dockerfile --target production .
+      - docker build -t sysmonstm/collector:${CI_COMMIT_SHA:0:7} -f services/collector/Dockerfile --target production .
+      - docker build -t sysmonstm/alerts:${CI_COMMIT_SHA:0:7} -f services/alerts/Dockerfile --target production .
+      - echo "=== Tagging as latest ==="
+      - docker tag sysmonstm/aggregator:${CI_COMMIT_SHA:0:7} sysmonstm/aggregator:latest
+      - docker tag sysmonstm/gateway:${CI_COMMIT_SHA:0:7} sysmonstm/gateway:latest
+      - docker tag sysmonstm/collector:${CI_COMMIT_SHA:0:7} sysmonstm/collector:latest
+      - docker tag sysmonstm/alerts:${CI_COMMIT_SHA:0:7} sysmonstm/alerts:latest
+    volumes:  # host Docker socket is mounted, so images are built on (and stay in) the host daemon
+      - /var/run/docker.sock:/var/run/docker.sock
+
+  push-to-registry:  # retag the locally built images for $REGISTRY_URL and push both the SHA and :latest tags
+    image: docker:24-dind
+    commands:  # password is piped via stdin so it does not appear on the docker command line
+      - echo "=== Logging into registry ==="
+      - echo "$REGISTRY_PASSWORD" | docker login -u "$REGISTRY_USER" --password-stdin "$REGISTRY_URL"
+      - echo "=== Pushing images ==="
+      - |
+        for svc in aggregator gateway collector alerts; do
+          docker tag sysmonstm/$svc:${CI_COMMIT_SHA:0:7} $REGISTRY_URL/sysmonstm/$svc:${CI_COMMIT_SHA:0:7}
+          docker tag sysmonstm/$svc:latest $REGISTRY_URL/sysmonstm/$svc:latest
+          docker push $REGISTRY_URL/sysmonstm/$svc:${CI_COMMIT_SHA:0:7}
+          docker push $REGISTRY_URL/sysmonstm/$svc:latest
+          echo "Pushed $svc"
+        done
+    secrets: [registry_user, registry_password, registry_url]  # read as $REGISTRY_USER / $REGISTRY_PASSWORD / $REGISTRY_URL above
+    volumes:  # same host daemon as build-images, which is where the freshly built images live
+      - /var/run/docker.sock:/var/run/docker.sock
+
+depends_on:  # workflow-level dependency; presumably refers to .woodpecker/test.yml — confirm
+  - test
+
+when:  # workflow-level filter: pushes to main only
+  event: push
+  branch: main
diff --git a/.woodpecker/deploy.yml b/.woodpecker/deploy.yml
new file mode 100644
index 0000000..3cd7a44
--- /dev/null
+++ b/.woodpecker/deploy.yml
@@ -0,0 +1,61 @@
+# Woodpecker CI - Deploy Pipeline
+
+steps:
+  deploy-to-staging:  # SSH into the deploy host and roll the docker-compose stack forward to origin/main
+    image: appleboy/drone-ssh
+    settings:  # connection details are injected from pipeline secrets
+      host:
+        from_secret: deploy_host
+      username:
+        from_secret: deploy_user
+      key:
+        from_secret: deploy_key
+      port: 22
+      script:  # runs on the remote host; `git reset --hard` discards any local changes in the checkout
+        - echo "=== Deploying to staging ==="
+        - cd /home/ec2-user/sysmonstm
+        - git fetch origin main
+        - git reset --hard origin/main
+        - echo "=== Pulling new images ==="
+        - docker-compose pull
+        - echo "=== Restarting services ==="
+        - docker-compose up -d --remove-orphans
+        - echo "=== Cleaning up ==="
+        - docker system prune -f
+        - echo "=== Deployment complete ==="
+        - docker-compose ps
+
+  health-check:  # probe the gateway's /health endpoint on the deploy host; an HTTP error status fails the step
+    image: curlimages/curl
+    commands:  # bounded retries tolerate a gateway that needs longer than the initial grace period to start
+      - echo "=== Waiting for services to start ==="
+      - sleep 10
+      - echo "=== Checking gateway health ==="
+      - curl -f --max-time 10 --retry 5 --retry-delay 5 --retry-connrefused "http://$DEPLOY_HOST:8000/health" || exit 1
+      - echo "=== Health check passed ==="
+    secrets: [deploy_host]  # exposed to the commands as $DEPLOY_HOST
+
+  notify:  # POST a JSON deployment summary to the webhook; NOTE(review): ${CI_COMMIT_MESSAGE} is inserted raw, so quotes/newlines in it presumably break the JSON — confirm
+    image: plugins/webhook
+    settings:
+      urls:
+        from_secret: webhook_url
+      content_type: application/json
+      template: |
+        {
+          "text": "🚀 Deployed to staging",
+          "repo": "${CI_REPO_NAME}",
+          "commit": "${CI_COMMIT_SHA:0:7}",
+          "message": "${CI_COMMIT_MESSAGE}",
+          "author": "${CI_COMMIT_AUTHOR}",
+          "url": "https://sysmonstm.mcrn.ar"
+        }
+    when:  # only sent when the preceding steps succeeded; no failure notification is defined in this file
+      status: success
+
+depends_on:  # workflow-level dependency; presumably refers to .woodpecker/build.yml — confirm
+  - build
+
+when:  # workflow-level filter: pushes to main only
+  event: push
+  branch: main
diff --git a/.woodpecker/test.yml b/.woodpecker/test.yml
new file mode 100644
index 0000000..1bbefe0
--- /dev/null
+++ b/.woodpecker/test.yml
@@ -0,0 +1,40 @@
+# Woodpecker CI - Test Pipeline (runs on PRs and pushes)
+# Separate file for cleaner organization
+
+steps:
+  lint:  # ruff lint (GitHub-style output) and format check over services/ and shared/
+    image: python:3.11-slim
+    commands:  # mypy is installed here but only invoked by the typecheck step
+      - pip install --quiet ruff mypy
+      - echo "=== Linting with ruff ==="
+      - ruff check services/ shared/ --output-format=github
+      - echo "=== Checking formatting ==="
+      - ruff format --check services/ shared/
+
+  typecheck:  # advisory only: `|| true` keeps mypy errors from failing the workflow
+    image: python:3.11-slim
+    commands:
+      - pip install --quiet mypy types-redis
+      - echo "=== Type checking shared/ ==="
+      - mypy shared/ --ignore-missing-imports || true
+
+  unit-tests:  # NOTE(review): `|| true` hides real test failures as well, so this step always passes
+    image: python:3.11-slim
+    commands:  # runtime dependencies are installed ad hoc here, not from the services' requirements.txt files
+      - pip install --quiet pytest pytest-asyncio pytest-cov
+      - pip install --quiet redis asyncpg grpcio grpcio-tools psutil pydantic pydantic-settings structlog
+      - echo "=== Running unit tests ==="
+      - pytest shared/ services/ -v --tb=short --cov=shared --cov=services --cov-report=term-missing || true
+
+  proto-check:  # compiles proto/metrics.proto as a validity check; generated code goes to /tmp and is not used afterwards
+    image: python:3.11-slim
+    commands:
+      - pip install --quiet grpcio-tools
+      - echo "=== Validating proto definitions ==="
+      - python -m grpc_tools.protoc -I./proto --python_out=/tmp --grpc_python_out=/tmp ./proto/metrics.proto
+      - echo "Proto compilation successful"
+
+depends_on: []  # no upstream workflows
+
+when:  # workflow-level filter: pushes and pull requests
+  event: [push, pull_request]
diff --git a/CLAUDE.md b/CLAUDE.md
new file mode 100644
index 0000000..1f78792
--- /dev/null
+++ b/CLAUDE.md
@@ -0,0 +1,492 @@
+# Distributed System Monitoring Platform
+
+## Project Overview
+
+A real-time system monitoring platform that streams metrics from multiple machines to a central hub with live web dashboard. Built to demonstrate production microservices patterns (gRPC, FastAPI, streaming, event-driven architecture) while solving a real problem: monitoring development infrastructure across multiple machines.
+
+**Primary Goal:** Interview demonstration project for Python Microservices Engineer position  
+**Secondary Goal:** Actually useful tool for managing multi-machine development environment  
+**Time Investment:** Phased approach - MVP in a weekend, polish over 2-3 weeks
+
+## Why This Project
+
+**Interview Alignment:**
+- Demonstrates gRPC-based microservices architecture (core requirement)
+- Shows streaming patterns (server-side and bidirectional)
+- Real-time data aggregation and processing
+- Alert/threshold monitoring (maps to fraud detection)
+- Event-driven patterns
+- Multiple data sources requiring normalization (maps to multiple payment processors)
+
+**Personal Utility:**
+- Monitors existing multi-machine dev setup
+- Dashboard stays open, provides real value
+- Solves actual pain point
+- Will continue running post-interview
+
+**Domain Mapping for Interview:**
+- Machine = Payment Processor
+- Metrics Stream = Transaction Stream  
+- Resource Thresholds = Fraud/Limit Detection
+- Alert System = Risk Management
+- Aggregation Service = Payment Processing Hub
+
+## Technical Stack
+
+### Core Technologies (Must Use - From JD)
+- **Python 3.11+** - Primary language
+- **FastAPI** - Web gateway, REST endpoints, WebSocket streaming
+- **gRPC** - Inter-service communication, metric streaming
+- **PostgreSQL/TimescaleDB** - Time-series historical data
+- **Redis** - Current state, caching, alert rules
+- **Docker Compose** - Orchestration
+
+### Supporting Technologies
+- **Protocol Buffers** - gRPC message definitions
+- **WebSockets** - Browser streaming
+- **htmx + Alpine.js** - Lightweight reactive frontend (avoid heavy SPA)
+- **Chart.js or Apache ECharts** - Real-time graphs
+- **asyncio** - Async patterns throughout
+
+### Development Tools
+- **grpcio & grpcio-tools** - Python gRPC
+- **psutil** - System metrics collection
+- **uvicorn** - FastAPI server
+- **pytest** - Testing
+- **docker-compose** - Local orchestration
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────────────────────┐
+│                         Browser                              │
+│  ┌──────────────────────────────────────────────────────┐  │
+│  │  Dashboard (htmx + Alpine.js + WebSockets)           │  │
+│  └──────────────────────────────────────────────────────┘  │
+└────────────────────────┬────────────────────────────────────┘
+                         │ WebSocket
+                         ▼
+┌─────────────────────────────────────────────────────────────┐
+│                    Web Gateway Service                       │
+│                    (FastAPI + WebSockets)                    │
+│  - Serves dashboard                                          │
+│  - Streams updates to browser                                │
+│  - REST API for historical queries                           │
+└────────────────────────┬────────────────────────────────────┘
+                         │ gRPC
+                         ▼
+┌─────────────────────────────────────────────────────────────┐
+│                   Aggregator Service (gRPC)                  │
+│  - Receives metric streams from all collectors               │
+│  - Normalizes data from different sources                    │
+│  - Enriches with machine context                             │
+│  - Publishes to event stream                                 │
+│  - Checks alert thresholds                                   │
+└─────┬───────────────────────────────────┬───────────────────┘
+      │                                   │
+      │ Stores                            │ Publishes events
+      ▼                                   ▼
+┌──────────────┐                   ┌────────────────┐
+│  TimescaleDB │                   │  Event Stream  │
+│  (historical)│                   │  (Redis Pub/Sub│
+└──────────────┘                   │   or RabbitMQ) │
+                                   └────────┬───────┘
+┌──────────────┐                            │
+│    Redis     │                            │ Subscribes
+│  (current    │◄───────────────────────────┘
+│   state)     │                            │
+└──────────────┘                            ▼
+                                   ┌────────────────┐
+      ▲                            │ Alert Service  │
+      │                            │  - Processes   │
+      │                            │    events      │
+      │ gRPC Streaming             │  - Triggers    │
+      │                            │    actions     │
+┌─────┴────────────────────────────┴────────────────┘
+│
+│  Multiple Collector Services (one per machine)
+│  ┌───────────────────────────────────────┐
+│  │  Metrics Collector (gRPC Client)      │
+│  │  - Gathers system metrics (psutil)    │
+│  │  - Streams to Aggregator via gRPC     │
+│  │  - CPU, Memory, Disk, Network         │
+│  │  - Process list                       │
+│  │  - Docker container stats (optional)  │
+│  └───────────────────────────────────────┘
+│
+└──► Machine 1, Machine 2, Machine 3, ...
+```
+
+## Implementation Phases
+
+### Phase 1: MVP - Core Streaming (Weekend - 8-12 hours)
+
+**Goal:** Prove the gRPC streaming works end-to-end
+
+**Deliverables:**
+1. Metrics Collector Service (gRPC client)
+   - Collects CPU, memory, disk on localhost
+   - Streams to aggregator every 5 seconds
+   
+2. Aggregator Service (gRPC server)
+   - Receives metric stream
+   - Stores current state in Redis
+   - Logs to console
+   
+3. Proto definitions for metric messages
+
+4. Docker Compose setup
+
+**Success Criteria:**
+- Run collector, see metrics flowing to aggregator
+- Redis contains current state
+- Can query Redis manually for latest metrics
+
+### Phase 2: Web Dashboard (1 week)
+
+**Goal:** Make it visible and useful
+
+**Deliverables:**
+1. Web Gateway Service (FastAPI)
+   - WebSocket endpoint for streaming
+   - REST endpoints for current/historical data
+   
+2. Dashboard UI
+   - Real-time CPU/Memory graphs per machine
+   - Current state table
+   - Simple, clean design
+   
+3. WebSocket bridge (Gateway ↔ Aggregator)
+
+4. TimescaleDB integration
+   - Store historical metrics
+   - Query endpoints for time ranges
+
+**Success Criteria:**
+- Open dashboard, see live graphs updating
+- Graphs show last hour of data
+- Multiple machines displayed separately
+
+### Phase 3: Alerts & Intelligence (1 week)
+
+**Goal:** Add decision-making layer (interview focus)
+
+**Deliverables:**
+1. Alert Service
+   - Subscribes to event stream
+   - Evaluates threshold rules
+   - Triggers notifications
+   
+2. Configuration Service (gRPC)
+   - Dynamic threshold management
+   - Alert rule CRUD
+   - Stored in PostgreSQL
+   
+3. Event Stream implementation (Redis Pub/Sub or RabbitMQ)
+
+4. Enhanced dashboard
+   - Alert indicators
+   - Alert history
+   - Threshold configuration UI
+
+**Success Criteria:**
+- Set CPU threshold at 80%
+- Generate load (stress-ng)
+- See alert trigger in dashboard
+- Alert logged to database
+
+### Phase 4: Interview Polish (Final week)
+
+**Goal:** Demo-ready, production patterns visible
+
+**Deliverables:**
+1. Observability
+   - OpenTelemetry tracing (optional)
+   - Structured logging
+   - Health check endpoints
+   
+2. "Synthetic Transactions"
+   - Simulate business operations through system
+   - Track end-to-end latency
+   - Maps directly to payment processing demo
+   
+3. Documentation
+   - Architecture diagram
+   - Service interaction flows
+   - Deployment guide
+   
+4. Demo script
+   - Story to walk through
+   - Key talking points
+   - Domain mapping explanations
+
+**Success Criteria:**
+- Can deploy entire stack with one command
+- Can explain every service's role
+- Can map architecture to payment processing
+- Demo runs smoothly without hiccups
+
+## Key Technical Patterns to Demonstrate
+
+### 1. gRPC Streaming Patterns
+
+**Server-Side Streaming:**
+```protobuf
+// Collector streams metrics to aggregator
+service MetricsService {
+  rpc StreamMetrics(MetricsRequest) returns (stream Metric) {}
+}
+```
+
+**Bidirectional Streaming:**
+```protobuf
+// Two-way communication between services
+service ControlService {
+  rpc ManageStream(stream Command) returns (stream Response) {}
+}
+```
+
+### 2. Service Communication Patterns
+
+- **Synchronous (gRPC):** Query current state, configuration
+- **Asynchronous (Events):** Metric updates, alerts, audit logs
+- **Streaming (gRPC + WebSocket):** Real-time data flow
+
+### 3. Data Storage Patterns
+
+- **Hot data (Redis):** Current state, recent metrics (last 5 minutes)
+- **Warm data (TimescaleDB):** Historical metrics (last 30 days)
+- **Cold data (Optional):** Archive to S3-compatible storage
+
+### 4. Error Handling & Resilience
+
+- gRPC retry logic with exponential backoff
+- Circuit breaker pattern for service calls
+- Graceful degradation (continue if one collector fails)
+- Dead letter queue for failed events
+
+## Proto Definitions (Starting Point)
+
+```protobuf
+syntax = "proto3";
+
+package monitoring;
+
+service MetricsService {
+  rpc StreamMetrics(MetricsRequest) returns (stream Metric) {}
+  rpc GetCurrentState(StateRequest) returns (MachineState) {}
+}
+
+message MetricsRequest {
+  string machine_id = 1;
+  int32 interval_seconds = 2;
+}
+
+message Metric {
+  string machine_id = 1;
+  int64 timestamp = 2;
+  MetricType type = 3;
+  double value = 4;
+  map<string, string> labels = 5;
+}
+
+enum MetricType {
+  CPU_PERCENT = 0;
+  MEMORY_PERCENT = 1;
+  MEMORY_USED_GB = 2;
+  DISK_PERCENT = 3;
+  NETWORK_SENT_MBPS = 4;
+  NETWORK_RECV_MBPS = 5;
+}
+
+message MachineState {
+  string machine_id = 1;
+  int64 last_seen = 2;
+  repeated Metric current_metrics = 3;
+  HealthStatus health = 4;
+}
+
+enum HealthStatus {
+  HEALTHY = 0;
+  WARNING = 1;
+  CRITICAL = 2;
+  UNKNOWN = 3;
+}
+```
+
+## Project Structure
+
+```
+system-monitor/
+├── docker-compose.yml
+├── proto/
+│   └── metrics.proto
+├── services/
+│   ├── collector/
+│   │   ├── Dockerfile
+│   │   ├── requirements.txt
+│   │   ├── main.py
+│   │   └── metrics.py
+│   ├── aggregator/
+│   │   ├── Dockerfile
+│   │   ├── requirements.txt
+│   │   ├── main.py
+│   │   └── storage.py
+│   ├── gateway/
+│   │   ├── Dockerfile
+│   │   ├── requirements.txt
+│   │   ├── main.py
+│   │   └── websocket.py
+│   └── alerts/
+│       ├── Dockerfile
+│       ├── requirements.txt
+│       ├── main.py
+│       └── rules.py
+├── web/
+│   ├── static/
+│   │   ├── css/
+│   │   └── js/
+│   └── templates/
+│       └── dashboard.html
+└── README.md
+```
+
+## Interview Talking Points
+
+### Domain Mapping to Payments
+
+**What you say:**
+- "I built this to monitor my dev machines, but the architecture directly maps to payment processing"
+- "Each machine streaming metrics is like a payment processor streaming transactions"
+- "The aggregator normalizes data from different sources - same as aggregating from Stripe, PayPal, bank APIs"
+- "Alert thresholds on resource usage are structurally identical to fraud detection thresholds"
+- "The event stream for audit trails maps directly to payment audit logs"
+
+### Technical Decisions to Highlight
+
+**gRPC vs REST:**
+- "I use gRPC between services for efficiency and strong typing"
+- "FastAPI gateway exposes REST/WebSocket for browser clients"
+- "This pattern is common - internal gRPC, external REST"
+
+**Streaming vs Polling:**
+- "Server-side streaming reduces network overhead"
+- "Bidirectional streaming allows dynamic configuration updates"
+- "WebSocket to browser maintains single connection"
+
+**State Management:**
+- "Redis for hot data - current state, needs fast access"
+- "TimescaleDB for historical analysis - optimized for time-series"
+- "This tiered storage approach scales to payment transaction volumes"
+
+**Resilience:**
+- "Each collector is independent - one failing doesn't affect others"
+- "Circuit breaker prevents cascade failures"
+- "Event stream decouples alert processing from metric ingestion"
+
+### What NOT to Say
+
+- Don't call it a "toy project" or "learning exercise"
+- Don't apologize for running locally vs AWS
+- Don't over-explain obvious things
+- Don't claim it's production-ready when it's not
+
+### What TO Say
+
+- "I built this to solve a real problem I have"
+- "Locally it uses PostgreSQL/Redis; in production these become Aurora/ElastiCache"
+- "I focused on the architectural patterns since those transfer directly"
+- "I'd keep developing this - it's genuinely useful"
+
+## Development Guidelines
+
+### Code Quality Standards
+- Type hints throughout (Python 3.11+ syntax)
+- Async/await patterns consistently
+- Structured logging (JSON format)
+- Error handling at all boundaries
+- Unit tests for business logic
+- Integration tests for service interactions
+
+### Docker Best Practices
+- Multi-stage builds
+- Non-root users
+- Health checks
+- Resource limits
+- Volume mounts for development
+
+### Configuration Management
+- Environment variables for all config
+- Sensible defaults
+- Config validation on startup
+- No secrets in code
+
+## AWS Mapping (For Interview Discussion)
+
+**What you have → What it becomes:**
+- PostgreSQL → Aurora PostgreSQL
+- Redis → ElastiCache
+- Docker Containers → ECS/Fargate or Lambda
+- RabbitMQ/Redis Pub/Sub → SQS/SNS
+- Docker Compose → CloudFormation/Terraform
+- Local networking → VPC, Security Groups
+
+**Key point:** "The architecture and patterns are production-ready, the infrastructure is local for development convenience"
+
+## Common Pitfalls to Avoid
+
+1. **Over-engineering Phase 1** - Resist adding features, just get streaming working
+2. **Polishing the UI** - Don't waste time on design; htmx + basic CSS is fine
+3. **Perfect metrics** - Mock data is OK early on, real psutil data comes later
+4. **Complete coverage** - Better to have 3 services working perfectly than 10 half-done
+5. **AWS deployment** - Local is fine, AWS costs money and adds complexity
+
+## Success Metrics
+
+**For Yourself:**
+- [ ] Actually use the dashboard daily
+- [ ] Catches a real issue before you notice
+- [ ] Runs stable for 1+ week without intervention
+
+**For Interview:**
+- [ ] Can demo end-to-end in 5 minutes
+- [ ] Can explain every service interaction
+- [ ] Can map to payment domain fluently
+- [ ] Shows understanding of production patterns
+
+## Next Steps
+
+1. Set up project structure
+2. Define proto messages
+3. Build Phase 1 MVP
+4. Iterate based on what feels useful
+5. Polish for demo when interview approaches
+
+## Resources
+
+- gRPC Python docs: https://grpc.io/docs/languages/python/
+- FastAPI WebSockets: https://fastapi.tiangolo.com/advanced/websockets/
+- TimescaleDB: https://docs.timescale.com/
+- htmx: https://htmx.org/
+
+## Questions to Ask Yourself During Development
+
+- "Would I actually use this feature?"
+- "How does this map to payments?"
+- "Can I explain why I built it this way?"
+- "What would break if X service failed?"
+- "How would this scale to 1000 machines?"
+
+---
+
+## Final Note
+
+This project works because it's:
+1. **Real** - You'll use it
+2. **Focused** - Shows specific patterns they care about
+3. **Mappable** - Clear connection to their domain
+4. **Yours** - Not a tutorial copy, demonstrates your thinking
+
+Build it in phases, use it daily, and by interview time you'll have natural stories about trade-offs, failures, and learnings. That authenticity is more valuable than perfect code.
+
+Good luck! 🚀
diff --git a/Tiltfile b/Tiltfile
new file mode 100644
index 0000000..5e70c3f
--- /dev/null
+++ b/Tiltfile
@@ -0,0 +1,119 @@
+# -*- mode: Python -*-
+# Tiltfile for sysmonstm - local Kubernetes development
+
+# Load extensions
+load('ext://restart_process', 'docker_build_with_restart')  # NOTE(review): imported but never called in this Tiltfile
+load('ext://namespace', 'namespace_create')
+
+# Configuration: `tilt up -- --no-volumes` sets the flag; it defaults to False
+config.define_bool("no-volumes")
+cfg = config.parse()
+no_volumes = cfg.get("no-volumes", False)  # NOTE(review): parsed but not referenced anywhere below
+
+# Create the namespace, then apply the manifests rendered from the local kustomize overlay
+namespace_create('sysmonstm')
+k8s_yaml(kustomize('k8s/overlays/local'))
+
+# ============================================================================
+# Docker builds with live reload
+# ============================================================================
+
+# Service images. Every service follows the same recipe, so one helper
+# registers them all:
+#   - image ref:  sysmonstm-<service>
+#   - context:    repo root
+#   - dockerfile: services/<service>/Dockerfile, 'development' target
+#   - live sync:  services/<service>, shared/ and any per-service extras,
+#                 each mirrored to the same relative path under /app
+# Changes under a synced directory are copied into the running container
+# rather than triggering an image rebuild.
+
+def dev_image(service, extra_dirs):
+    """Register the live-updating development image for one service.
+
+    Args:
+      service: directory name under services/; also the image-ref suffix.
+      extra_dirs: repo-root directories to sync in addition to the
+        service's own directory and shared/, in listing order.
+    """
+    synced_dirs = ['services/' + service, 'shared'] + extra_dirs
+    docker_build(
+        'sysmonstm-' + service,
+        context='.',
+        dockerfile='services/' + service + '/Dockerfile',
+        target='development',
+        live_update=[
+            sync('./' + path, '/app/' + path)
+            for path in synced_dirs
+        ],
+    )
+
+# Aggregator service
+dev_image('aggregator', ['proto'])
+
+# Gateway service
+dev_image('gateway', ['proto', 'web'])
+
+# Alerts service
+dev_image('alerts', [])
+
+# ============================================================================
+# Resource configuration
+# ============================================================================
+
+# Infrastructure (shown under the 'infra' label)
+k8s_resource(workload='redis', labels=['infra'])
+k8s_resource(workload='timescaledb', labels=['infra'])
+
+# Application services (shown under the 'app' label); resource_deps gates start-up order
+k8s_resource(
+    workload='aggregator',
+    port_forwards=['50051:50051'],
+    resource_deps=['redis', 'timescaledb'],
+    labels=['app'],
+)
+
+k8s_resource(
+    workload='gateway',
+    port_forwards=['8000:8000'],
+    resource_deps=['aggregator', 'redis'],
+    labels=['app'],
+)
+
+k8s_resource(
+    workload='alerts',
+    resource_deps=['redis', 'timescaledb'],
+    labels=['app'],
+)
+
+# ============================================================================
+# Local resources (optional - for running collector locally)
+# ============================================================================
+
+local_resource(
+    'collector-local',
+    serve_cmd='cd services/collector && python main.py',
+    deps=['services/collector', 'shared'],
+    resource_deps=['aggregator'],
+    labels=['collector'],
+    auto_init=False,  # Don't start automatically
+    serve_env={  # serve_cmd is given serve_env; plain env= only reaches cmd=
+        'AGGREGATOR_URL': 'localhost:50051',  # the aggregator's port-forward
+        'MACHINE_ID': 'tilt-dev',
+        'COLLECTION_INTERVAL': '5',
+        'LOG_LEVEL': 'DEBUG',
+        'PYTHONPATH': '.',  # NOTE(review): resolved after the cd above, i.e. services/collector
+    },
+)
+
+# ============================================================================
+# Manually triggered tools (auto_init=False keeps them from running on `tilt up`)
+# ============================================================================
+
+local_resource(
+    'proto-gen',
+    cmd='python -m grpc_tools.protoc -I./proto --python_out=./shared --grpc_python_out=./shared ./proto/metrics.proto',
+    deps=['proto/metrics.proto'],  # NOTE(review): with deps set, Tilt also re-runs this when the proto file changes — confirm intended
+    labels=['tools'],
+    auto_init=False,
+)
diff --git a/ctlptl.yaml b/ctlptl.yaml
new file mode 100644
index 0000000..b81ba07
--- /dev/null
+++ b/ctlptl.yaml
@@ -0,0 +1,32 @@
+# ctlptl configuration: a local image registry plus the Kind cluster that uses it
+# Usage: ctlptl apply -f ctlptl.yaml
+
+apiVersion: ctlptl.dev/v1alpha1
+kind: Registry
+name: sysmonstm-registry
+port: 5005
+---
+apiVersion: ctlptl.dev/v1alpha1
+kind: Cluster
+product: kind
+registry: sysmonstm-registry
+kindV1Alpha4Cluster:
+  name: sysmonstm
+  nodes:
+    - role: control-plane
+      extraPortMappings:
+        # Gateway HTTP: host 8080 -> node port 30080 (NOTE(review): presumably needs a matching NodePort service — confirm in k8s/)
+        - containerPort: 30080
+          hostPort: 8080
+          protocol: TCP
+        # Aggregator gRPC: host 50051 -> node port 30051
+        - containerPort: 30051
+          hostPort: 50051  # NOTE(review): the Tiltfile also forwards localhost:50051 for the aggregator — confirm the two bindings do not collide
+          protocol: TCP
+  # Reserve 256Mi of node memory for the system (kubelet system-reserved) - t2.small compatibility
+  kubeadmConfigPatches:
+    - |
+      kind: InitConfiguration
+      nodeRegistration:
+        kubeletExtraArgs:
+          system-reserved: memory=256Mi
diff --git a/docker-compose.override.yml b/docker-compose.override.yml
new file mode 100644
index 0000000..7a0513a
--- /dev/null
+++ b/docker-compose.override.yml
@@ -0,0 +1,48 @@
+# Development overrides - hot reload, mounted volumes, debug settings
+# Usage: docker compose up (automatically includes this file); pass only `-f docker-compose.yml` to skip it
+
+version: "3.8"
+
+services:
+  aggregator:
+    build:
+      target: development  # build the Dockerfile's 'development' stage
+    volumes:
+      # Source trees are bind-mounted read-only (:ro) over the paths under /app
+      - ./services/aggregator:/app/services/aggregator:ro
+      - ./shared:/app/shared:ro
+      - ./proto:/app/proto:ro
+    environment:
+      LOG_LEVEL: DEBUG
+      RELOAD: "true"
+
+  gateway:
+    build:
+      target: development
+    volumes:
+      - ./services/gateway:/app/services/gateway:ro
+      - ./shared:/app/shared:ro
+      - ./proto:/app/proto:ro
+      - ./web:/app/web:ro
+    environment:
+      LOG_LEVEL: DEBUG
+      RELOAD: "true"
+
+  alerts:
+    build:
+      target: development
+    volumes:
+      - ./services/alerts:/app/services/alerts:ro
+      - ./shared:/app/shared:ro
+    environment:
+      LOG_LEVEL: DEBUG
+
+  collector:
+    build:
+      target: development
+    volumes:
+      - ./services/collector:/app/services/collector:ro
+      - ./shared:/app/shared:ro
+      - ./proto:/app/proto:ro
+    environment:
+      LOG_LEVEL: DEBUG
+      COLLECTION_INTERVAL: 2  # replaces the base file's ${COLLECTION_INTERVAL:-5}
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..efc32da
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,154 @@
+version: "3.8"
+
+# This file works both locally and on EC2 for demo purposes.
+# Local dev with hot-reload: docker compose up (auto-merges docker-compose.override.yml). Without the overrides, e.g. on EC2: docker compose -f docker-compose.yml up
+
+x-common-env: &common-env
+  REDIS_URL: redis://redis:6379
+  TIMESCALE_URL: postgresql://monitor:monitor@timescaledb:5432/monitor
+  EVENTS_BACKEND: redis_pubsub
+  LOG_LEVEL: ${LOG_LEVEL:-INFO}
+  LOG_FORMAT: json
+
+x-healthcheck-defaults: &healthcheck-defaults
+  interval: 10s
+  timeout: 5s
+  retries: 3
+  start_period: 10s
+
+services:
+  # =============================================================================
+  # Infrastructure
+  # =============================================================================
+
+  redis:
+    image: redis:7-alpine
+    ports:
+      - "${REDIS_PORT:-6379}:6379"  # host port overridable via REDIS_PORT
+    volumes:
+      - redis-data:/data  # named volume declared under the top-level volumes: key
+    healthcheck:
+      <<: *healthcheck-defaults  # interval/timeout/retries/start_period from the anchor
+      test: ["CMD", "redis-cli", "ping"]
+    deploy:
+      resources:
+        limits:
+          memory: 128M
+
+  timescaledb:
+    image: timescale/timescaledb:latest-pg15  # NOTE(review): floating tag — consider pinning a release
+    environment:
+      POSTGRES_USER: monitor
+      POSTGRES_PASSWORD: monitor  # same credentials are embedded in TIMESCALE_URL (x-common-env)
+      POSTGRES_DB: monitor
+    ports:
+      - "${TIMESCALE_PORT:-5432}:5432"
+    volumes:
+      - timescale-data:/var/lib/postgresql/data
+      - ./scripts/init-db.sql:/docker-entrypoint-initdb.d/init.sql:ro  # presumably run only when the data volume is first initialised — confirm
+    healthcheck:
+      <<: *healthcheck-defaults
+      test: ["CMD-SHELL", "pg_isready -U monitor -d monitor"]
+    deploy:
+      resources:
+        limits:
+          memory: 512M
+
+  # =============================================================================
+  # Application Services
+  # =============================================================================
+
+  aggregator:
+    build:
+      context: .  # repo root
+      dockerfile: services/aggregator/Dockerfile
+    environment:
+      <<: *common-env
+      GRPC_PORT: 50051
+      SERVICE_NAME: aggregator
+    ports:
+      - "${AGGREGATOR_GRPC_PORT:-50051}:50051"
+    depends_on:  # start only after both stores pass their healthchecks
+      redis:
+        condition: service_healthy
+      timescaledb:
+        condition: service_healthy
+    healthcheck:
+      <<: *healthcheck-defaults
+      test: ["CMD", "/bin/grpc_health_probe", "-addr=:50051"]  # NOTE(review): needs grpc_health_probe at /bin in the image — confirm the Dockerfile installs it
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+
+  gateway:
+    build:
+      context: .
+      dockerfile: services/gateway/Dockerfile
+    environment:
+      <<: *common-env
+      HTTP_PORT: 8000
+      AGGREGATOR_URL: aggregator:50051
+      SERVICE_NAME: gateway
+    ports:
+      - "${GATEWAY_PORT:-8000}:8000"
+    depends_on:  # gate on health like aggregator/alerts do, not merely on container start
+      aggregator: {condition: service_healthy}
+      redis: {condition: service_healthy}
+    healthcheck:
+      <<: *healthcheck-defaults
+      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]  # NOTE(review): requires curl inside the image — confirm
+    deploy:
+      resources:
+        limits:
+          memory: 256M
+
+  alerts:
+    build:
+      context: .
+      dockerfile: services/alerts/Dockerfile
+    environment:
+      <<: *common-env
+      SERVICE_NAME: alerts
+    depends_on:
+      redis:
+        condition: service_healthy
+      timescaledb:
+        condition: service_healthy
+    healthcheck:
+      <<: *healthcheck-defaults
+      test: ["CMD", "python", "-c", "import sys; sys.exit(0)"]  # NOTE(review): always exits 0 — proves the interpreter runs, not that the alerts process is alive
+    deploy:
+      resources:
+        limits:
+          memory: 128M
+
+  # Collector runs separately on each machine being monitored
+  # For local testing, we run one instance
+  collector:
+    build:
+      context: .
+      dockerfile: services/collector/Dockerfile
+    environment:
+      <<: *common-env
+      AGGREGATOR_URL: aggregator:50051
+      MACHINE_ID: ${MACHINE_ID:-local-dev}
+      COLLECTION_INTERVAL: ${COLLECTION_INTERVAL:-5}
+      SERVICE_NAME: collector
+    depends_on:  # wait for the aggregator's healthcheck, not just its container start
+      aggregator: {condition: service_healthy}
+    deploy:
+      resources:
+        limits:
+          memory: 64M
+    # For actual system metrics, you might need:
+    # privileged: true
+    # pid: host
+
+volumes:
+  redis-data:
+  timescale-data:
+
+networks:
+  default:
+    name: sysmonstm
diff --git a/docs/architecture/01-system-overview.dot b/docs/architecture/01-system-overview.dot
new file mode 100644
index 0000000..c9bc4b0
--- /dev/null
+++ b/docs/architecture/01-system-overview.dot
@@ -0,0 +1,78 @@
+digraph SystemOverview {
+    // Graph settings: top-to-bottom ranks, Helvetica for graph, nodes and edges
+    rankdir=TB;
+    compound=true;  // only affects edges that use lhead/ltail; none below do
+    fontname="Helvetica";
+    node [fontname="Helvetica", fontsize=11];
+    edge [fontname="Helvetica", fontsize=10];
+
+    // Title
+    labelloc="t";
+    label="System Monitoring Platform - Architecture Overview";
+    fontsize=16;
+
+    // Styling: default shape/style for every node declared after this point
+    node [shape=box, style="rounded,filled"];
+
+    // External
+    subgraph cluster_external {
+        label="External";
+        style=dashed;
+        color=gray;
+
+        browser [label="Browser\n(Dashboard)", fillcolor="#E3F2FD"];
+        machines [label="Monitored\nMachines", fillcolor="#FFF3E0", shape=box3d];
+    }
+
+    // Core Services
+    subgraph cluster_services {
+        label="Application Services";
+        style=filled;
+        color="#E8F5E9";
+        fillcolor="#E8F5E9";
+
+        gateway [label="Gateway\n(FastAPI)", fillcolor="#C8E6C9"];
+        aggregator [label="Aggregator\n(gRPC Server)", fillcolor="#C8E6C9"];
+        alerts [label="Alerts\nService", fillcolor="#C8E6C9"];
+        collector [label="Collector\n(gRPC Client)", fillcolor="#DCEDC8"];
+    }
+
+    // Data Layer
+    subgraph cluster_data {
+        label="Data Layer";
+        style=filled;
+        color="#FFF8E1";
+        fillcolor="#FFF8E1";
+
+        redis [label="Redis\n(Pub/Sub + State)", fillcolor="#FFECB3", shape=cylinder];
+        timescale [label="TimescaleDB\n(Time-series)", fillcolor="#FFECB3", shape=cylinder];
+    }
+
+    // Event Stream: Redis Pub/Sub drawn as its own node, separate from the 'redis' node above
+    subgraph cluster_events {
+        label="Event Stream";
+        style=filled;
+        color="#F3E5F5";
+        fillcolor="#F3E5F5";
+
+        events [label="Redis Pub/Sub\n(Events)", fillcolor="#E1BEE7", shape=hexagon];
+    }
+
+    // Connections: dashed edges are the gateway queries and alert storage; the dotted edge is psutil collection
+    browser -> gateway [label="WebSocket\nREST", color="#1976D2"];
+    gateway -> aggregator [label="gRPC", color="#388E3C"];
+    gateway -> redis [label="State\nQuery", style=dashed];
+    gateway -> timescale [label="Historical\nQuery", style=dashed];
+
+    machines -> collector [label="psutil", color="#F57C00", style=dotted];
+    collector -> aggregator [label="gRPC\nStream", color="#388E3C"];
+
+    aggregator -> redis [label="Current\nState", color="#FFA000"];
+    aggregator -> timescale [label="Store\nMetrics", color="#FFA000"];
+    aggregator -> events [label="Publish", color="#7B1FA2"];
+
+    events -> alerts [label="Subscribe", color="#7B1FA2"];
+    events -> gateway [label="Subscribe", color="#7B1FA2"];
+
+    alerts -> timescale [label="Store\nAlerts", style=dashed];
+}
diff --git a/docs/architecture/01-system-overview.svg b/docs/architecture/01-system-overview.svg
new file mode 100644
index 0000000..aefe71c
--- /dev/null
+++ b/docs/architecture/01-system-overview.svg
@@ -0,0 +1,193 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 14.1.1 (0)
+ -->
+<!-- Title: SystemOverview Pages: 1 -->
+<svg width="444pt" height="508pt"
+ viewBox="0.00 0.00 444.00 508.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 503.78)">
+<title>SystemOverview</title>
+<polygon fill="white" stroke="none" points="-4,4 -4,-503.78 440,-503.78 440,4 -4,4"/>
+<text xml:space="preserve" text-anchor="middle" x="218" y="-480.58" font-family="Helvetica,sans-Serif" font-size="16.00">System Monitoring Platform &#45; Architecture Overview</text>
+<g id="clust1" class="cluster">
+<title>cluster_external</title>
+<polygon fill="none" stroke="gray" stroke-dasharray="5,2" points="45.5,-374.2 45.5,-453.7 235.5,-453.7 235.5,-374.2 45.5,-374.2"/>
+<text xml:space="preserve" text-anchor="middle" x="140.5" y="-434.5" font-family="Helvetica,sans-Serif" font-size="16.00">External</text>
+</g>
+<g id="clust2" class="cluster">
+<title>cluster_services</title>
+<polygon fill="#e8f5e9" stroke="#e8f5e9" points="101.5,-143.12 101.5,-320.12 363.5,-320.12 363.5,-143.12 101.5,-143.12"/>
+<text xml:space="preserve" text-anchor="middle" x="232.5" y="-300.93" font-family="Helvetica,sans-Serif" font-size="16.00">Application Services</text>
+</g>
+<g id="clust3" class="cluster">
+<title>cluster_data</title>
+<polygon fill="#fff8e1" stroke="#fff8e1" points="22.5,-8 22.5,-99.62 260.5,-99.62 260.5,-8 22.5,-8"/>
+<text xml:space="preserve" text-anchor="middle" x="141.5" y="-80.42" font-family="Helvetica,sans-Serif" font-size="16.00">Data Layer</text>
+</g>
+<g id="clust4" class="cluster">
+<title>cluster_events</title>
+<polygon fill="#f3e5f5" stroke="#f3e5f5" points="243.5,-363.62 243.5,-464.28 413.5,-464.28 413.5,-363.62 243.5,-363.62"/>
+<text xml:space="preserve" text-anchor="middle" x="328.5" y="-445.08" font-family="Helvetica,sans-Serif" font-size="16.00">Event Stream</text>
+</g>
+<!-- browser -->
+<g id="node1" class="node">
+<title>browser</title>
+<path fill="#e3f2fd" stroke="black" d="M125.62,-418.2C125.62,-418.2 65.38,-418.2 65.38,-418.2 59.38,-418.2 53.38,-412.2 53.38,-406.2 53.38,-406.2 53.38,-394.2 53.38,-394.2 53.38,-388.2 59.38,-382.2 65.38,-382.2 65.38,-382.2 125.62,-382.2 125.62,-382.2 131.62,-382.2 137.62,-388.2 137.62,-394.2 137.62,-394.2 137.62,-406.2 137.62,-406.2 137.62,-412.2 131.62,-418.2 125.62,-418.2"/>
+<text xml:space="preserve" text-anchor="middle" x="95.5" y="-403.25" font-family="Helvetica,sans-Serif" font-size="11.00">Browser</text>
+<text xml:space="preserve" text-anchor="middle" x="95.5" y="-389.75" font-family="Helvetica,sans-Serif" font-size="11.00">(Dashboard)</text>
+</g>
+<!-- gateway -->
+<g id="node3" class="node">
+<title>gateway</title>
+<path fill="#c8e6c9" stroke="black" d="M161.88,-284.62C161.88,-284.62 121.12,-284.62 121.12,-284.62 115.12,-284.62 109.12,-278.62 109.12,-272.62 109.12,-272.62 109.12,-260.62 109.12,-260.62 109.12,-254.62 115.12,-248.62 121.12,-248.62 121.12,-248.62 161.88,-248.62 161.88,-248.62 167.88,-248.62 173.88,-254.62 173.88,-260.62 173.88,-260.62 173.88,-272.62 173.88,-272.62 173.88,-278.62 167.88,-284.62 161.88,-284.62"/>
+<text xml:space="preserve" text-anchor="middle" x="141.5" y="-269.68" font-family="Helvetica,sans-Serif" font-size="11.00">Gateway</text>
+<text xml:space="preserve" text-anchor="middle" x="141.5" y="-256.18" font-family="Helvetica,sans-Serif" font-size="11.00">(FastAPI)</text>
+</g>
+<!-- browser&#45;&gt;gateway -->
+<g id="edge1" class="edge">
+<title>browser&#45;&gt;gateway</title>
+<path fill="none" stroke="#1976d2" d="M92.73,-381.75C91.08,-367.05 90.32,-345.66 96.25,-328.12 100.5,-315.57 108.45,-303.5 116.51,-293.49"/>
+<polygon fill="#1976d2" stroke="#1976d2" points="119.02,-295.94 122.86,-286.06 113.7,-291.39 119.02,-295.94"/>
+<text xml:space="preserve" text-anchor="middle" x="122.88" y="-344.12" font-family="Helvetica,sans-Serif" font-size="10.00">WebSocket</text>
+<text xml:space="preserve" text-anchor="middle" x="122.88" y="-331.38" font-family="Helvetica,sans-Serif" font-size="10.00">REST</text>
+</g>
+<!-- machines -->
+<g id="node2" class="node">
+<title>machines</title>
+<polygon fill="#fff3e0" stroke="black" points="227.25,-418.2 159.75,-418.2 155.75,-414.2 155.75,-382.2 223.25,-382.2 227.25,-386.2 227.25,-418.2"/>
+<polyline fill="none" stroke="black" points="223.25,-414.2 155.75,-414.2"/>
+<polyline fill="none" stroke="black" points="223.25,-414.2 223.25,-382.2"/>
+<polyline fill="none" stroke="black" points="223.25,-414.2 227.25,-418.2"/>
+<text xml:space="preserve" text-anchor="middle" x="191.5" y="-403.25" font-family="Helvetica,sans-Serif" font-size="11.00">Monitored</text>
+<text xml:space="preserve" text-anchor="middle" x="191.5" y="-389.75" font-family="Helvetica,sans-Serif" font-size="11.00">Machines</text>
+</g>
+<!-- collector -->
+<g id="node6" class="node">
+<title>collector</title>
+<path fill="#dcedc8" stroke="black" d="M343.88,-284.62C343.88,-284.62 279.12,-284.62 279.12,-284.62 273.12,-284.62 267.12,-278.62 267.12,-272.62 267.12,-272.62 267.12,-260.62 267.12,-260.62 267.12,-254.62 273.12,-248.62 279.12,-248.62 279.12,-248.62 343.88,-248.62 343.88,-248.62 349.88,-248.62 355.88,-254.62 355.88,-260.62 355.88,-260.62 355.88,-272.62 355.88,-272.62 355.88,-278.62 349.88,-284.62 343.88,-284.62"/>
+<text xml:space="preserve" text-anchor="middle" x="311.5" y="-269.68" font-family="Helvetica,sans-Serif" font-size="11.00">Collector</text>
+<text xml:space="preserve" text-anchor="middle" x="311.5" y="-256.18" font-family="Helvetica,sans-Serif" font-size="11.00">(gRPC Client)</text>
+</g>
+<!-- machines&#45;&gt;collector -->
+<g id="edge5" class="edge">
+<title>machines&#45;&gt;collector</title>
+<path fill="none" stroke="#f57c00" stroke-dasharray="1,5" d="M210.81,-381.83C219.12,-375.21 229.26,-368.17 239.5,-363.62 260.21,-354.43 273.06,-369.22 289.5,-353.62 304.98,-338.94 310.15,-314.98 311.64,-296.08"/>
+<polygon fill="#f57c00" stroke="#f57c00" points="315.12,-296.47 312.08,-286.32 308.13,-296.15 315.12,-296.47"/>
+<text xml:space="preserve" text-anchor="middle" x="318.1" y="-337.75" font-family="Helvetica,sans-Serif" font-size="10.00">psutil</text>
+</g>
+<!-- aggregator -->
+<g id="node4" class="node">
+<title>aggregator</title>
+<path fill="#c8e6c9" stroke="black" d="M343.12,-187.12C343.12,-187.12 273.88,-187.12 273.88,-187.12 267.88,-187.12 261.88,-181.12 261.88,-175.12 261.88,-175.12 261.88,-163.12 261.88,-163.12 261.88,-157.12 267.88,-151.12 273.88,-151.12 273.88,-151.12 343.12,-151.12 343.12,-151.12 349.12,-151.12 355.12,-157.12 355.12,-163.12 355.12,-163.12 355.12,-175.12 355.12,-175.12 355.12,-181.12 349.12,-187.12 343.12,-187.12"/>
+<text xml:space="preserve" text-anchor="middle" x="308.5" y="-172.18" font-family="Helvetica,sans-Serif" font-size="11.00">Aggregator</text>
+<text xml:space="preserve" text-anchor="middle" x="308.5" y="-158.68" font-family="Helvetica,sans-Serif" font-size="11.00">(gRPC Server)</text>
+</g>
+<!-- gateway&#45;&gt;aggregator -->
+<g id="edge2" class="edge">
+<title>gateway&#45;&gt;aggregator</title>
+<path fill="none" stroke="#388e3c" d="M171.74,-248.33C198.77,-232.88 238.56,-210.12 268.26,-193.13"/>
+<polygon fill="#388e3c" stroke="#388e3c" points="269.66,-196.37 276.6,-188.36 266.19,-190.29 269.66,-196.37"/>
+<text xml:space="preserve" text-anchor="middle" x="257.62" y="-214.75" font-family="Helvetica,sans-Serif" font-size="10.00">gRPC</text>
+</g>
+<!-- redis -->
+<g id="node7" class="node">
+<title>redis</title>
+<path fill="#ffecb3" stroke="black" d="M146,-59.75C146,-62.16 120.23,-64.12 88.5,-64.12 56.77,-64.12 31,-62.16 31,-59.75 31,-59.75 31,-20.38 31,-20.38 31,-17.96 56.77,-16 88.5,-16 120.23,-16 146,-17.96 146,-20.38 146,-20.38 146,-59.75 146,-59.75"/>
+<path fill="none" stroke="black" d="M146,-59.75C146,-57.34 120.23,-55.38 88.5,-55.38 56.77,-55.38 31,-57.34 31,-59.75"/>
+<text xml:space="preserve" text-anchor="middle" x="88.5" y="-43.11" font-family="Helvetica,sans-Serif" font-size="11.00">Redis</text>
+<text xml:space="preserve" text-anchor="middle" x="88.5" y="-29.61" font-family="Helvetica,sans-Serif" font-size="11.00">(Pub/Sub + State)</text>
+</g>
+<!-- gateway&#45;&gt;redis -->
+<g id="edge3" class="edge">
+<title>gateway&#45;&gt;redis</title>
+<path fill="none" stroke="black" stroke-dasharray="5,2" d="M122.74,-248.35C108.28,-233.68 89.42,-211.2 81.25,-187.12 68.86,-150.62 73.72,-106.03 79.72,-75.79"/>
+<polygon fill="black" stroke="black" points="83.14,-76.56 81.82,-66.04 76.29,-75.08 83.14,-76.56"/>
+<text xml:space="preserve" text-anchor="middle" x="95.88" y="-172.38" font-family="Helvetica,sans-Serif" font-size="10.00">State</text>
+<text xml:space="preserve" text-anchor="middle" x="95.88" y="-159.62" font-family="Helvetica,sans-Serif" font-size="10.00">Query</text>
+</g>
+<!-- timescale -->
+<g id="node8" class="node">
+<title>timescale</title>
+<path fill="#ffecb3" stroke="black" d="M252.88,-59.75C252.88,-62.16 232.99,-64.12 208.5,-64.12 184.01,-64.12 164.12,-62.16 164.12,-59.75 164.12,-59.75 164.12,-20.38 164.12,-20.38 164.12,-17.96 184.01,-16 208.5,-16 232.99,-16 252.88,-17.96 252.88,-20.38 252.88,-20.38 252.88,-59.75 252.88,-59.75"/>
+<path fill="none" stroke="black" d="M252.88,-59.75C252.88,-57.34 232.99,-55.38 208.5,-55.38 184.01,-55.38 164.12,-57.34 164.12,-59.75"/>
+<text xml:space="preserve" text-anchor="middle" x="208.5" y="-43.11" font-family="Helvetica,sans-Serif" font-size="11.00">TimescaleDB</text>
+<text xml:space="preserve" text-anchor="middle" x="208.5" y="-29.61" font-family="Helvetica,sans-Serif" font-size="11.00">(Time&#45;series)</text>
+</g>
+<!-- gateway&#45;&gt;timescale -->
+<g id="edge4" class="edge">
+<title>gateway&#45;&gt;timescale</title>
+<path fill="none" stroke="black" stroke-dasharray="5,2" d="M143.41,-248.29C146.34,-224.28 152.82,-179.73 164,-143.12 171.19,-119.57 182.25,-94.18 191.54,-74.62"/>
+<polygon fill="black" stroke="black" points="194.62,-76.29 195.83,-65.76 188.32,-73.24 194.62,-76.29"/>
+<text xml:space="preserve" text-anchor="middle" x="187.25" y="-172.38" font-family="Helvetica,sans-Serif" font-size="10.00">Historical</text>
+<text xml:space="preserve" text-anchor="middle" x="187.25" y="-159.62" font-family="Helvetica,sans-Serif" font-size="10.00">Query</text>
+</g>
+<!-- aggregator&#45;&gt;redis -->
+<g id="edge7" class="edge">
+<title>aggregator&#45;&gt;redis</title>
+<path fill="none" stroke="#ffa000" d="M267.27,-150.69C261,-148.11 254.59,-145.52 248.5,-143.12 236.59,-138.44 233.22,-138.25 221.5,-133.12 191.36,-119.95 182.76,-118.04 155.5,-99.62 143.6,-91.59 131.5,-81.66 120.93,-72.28"/>
+<polygon fill="#ffa000" stroke="#ffa000" points="123.32,-69.73 113.56,-65.6 118.62,-74.91 123.32,-69.73"/>
+<text xml:space="preserve" text-anchor="middle" x="239.5" y="-123.62" font-family="Helvetica,sans-Serif" font-size="10.00">Current</text>
+<text xml:space="preserve" text-anchor="middle" x="239.5" y="-110.88" font-family="Helvetica,sans-Serif" font-size="10.00">State</text>
+</g>
+<!-- aggregator&#45;&gt;timescale -->
+<g id="edge8" class="edge">
+<title>aggregator&#45;&gt;timescale</title>
+<path fill="none" stroke="#ffa000" d="M294.81,-150.72C279.15,-130.84 253.2,-97.86 233.84,-73.25"/>
+<polygon fill="#ffa000" stroke="#ffa000" points="236.64,-71.16 227.71,-65.47 231.14,-75.49 236.64,-71.16"/>
+<text xml:space="preserve" text-anchor="middle" x="296.95" y="-123.62" font-family="Helvetica,sans-Serif" font-size="10.00">Store</text>
+<text xml:space="preserve" text-anchor="middle" x="296.95" y="-110.88" font-family="Helvetica,sans-Serif" font-size="10.00">Metrics</text>
+</g>
+<!-- events -->
+<g id="node9" class="node">
+<title>events</title>
+<path fill="#e1bee7" stroke="black" d="M395.63,-407.37C395.63,-407.37 376.5,-421.61 376.5,-421.61 371.69,-425.2 360.88,-428.78 354.88,-428.78 354.88,-428.78 302.12,-428.78 302.12,-428.78 296.12,-428.78 285.31,-425.2 280.5,-421.61 280.5,-421.61 261.37,-407.37 261.37,-407.37 256.56,-403.79 256.56,-396.62 261.37,-393.04 261.37,-393.04 280.5,-378.79 280.5,-378.79 285.31,-375.21 296.12,-371.62 302.12,-371.62 302.12,-371.62 354.88,-371.62 354.88,-371.62 360.88,-371.62 371.69,-375.21 376.5,-378.79 376.5,-378.79 395.63,-393.04 395.63,-393.04 400.44,-396.62 400.44,-403.79 395.63,-407.37"/>
+<text xml:space="preserve" text-anchor="middle" x="328.5" y="-403.25" font-family="Helvetica,sans-Serif" font-size="11.00">Redis Pub/Sub</text>
+<text xml:space="preserve" text-anchor="middle" x="328.5" y="-389.75" font-family="Helvetica,sans-Serif" font-size="11.00">(Events)</text>
+</g>
+<!-- aggregator&#45;&gt;events -->
+<g id="edge9" class="edge">
+<title>aggregator&#45;&gt;events</title>
+<path fill="none" stroke="#7b1fa2" d="M333.16,-187.49C339.14,-192.63 345.07,-198.63 349.5,-205.12 361.02,-222.03 361.12,-228.46 364.5,-248.62 369.75,-279.97 371.24,-289.07 364.5,-320.12 361.48,-334.06 355.78,-348.49 349.79,-361.14"/>
+<polygon fill="#7b1fa2" stroke="#7b1fa2" points="346.73,-359.44 345.42,-369.95 353,-362.55 346.73,-359.44"/>
+<text xml:space="preserve" text-anchor="middle" x="386.64" y="-263.5" font-family="Helvetica,sans-Serif" font-size="10.00">Publish</text>
+</g>
+<!-- alerts -->
+<g id="node5" class="node">
+<title>alerts</title>
+<path fill="#c8e6c9" stroke="black" d="M236.75,-284.62C236.75,-284.62 204.25,-284.62 204.25,-284.62 198.25,-284.62 192.25,-278.62 192.25,-272.62 192.25,-272.62 192.25,-260.62 192.25,-260.62 192.25,-254.62 198.25,-248.62 204.25,-248.62 204.25,-248.62 236.75,-248.62 236.75,-248.62 242.75,-248.62 248.75,-254.62 248.75,-260.62 248.75,-260.62 248.75,-272.62 248.75,-272.62 248.75,-278.62 242.75,-284.62 236.75,-284.62"/>
+<text xml:space="preserve" text-anchor="middle" x="220.5" y="-269.68" font-family="Helvetica,sans-Serif" font-size="11.00">Alerts</text>
+<text xml:space="preserve" text-anchor="middle" x="220.5" y="-256.18" font-family="Helvetica,sans-Serif" font-size="11.00">Service</text>
+</g>
+<!-- alerts&#45;&gt;timescale -->
+<g id="edge12" class="edge">
+<title>alerts&#45;&gt;timescale</title>
+<path fill="none" stroke="black" stroke-dasharray="5,2" d="M219.58,-248.38C217.61,-211.47 212.94,-124.24 210.34,-75.51"/>
+<polygon fill="black" stroke="black" points="213.85,-75.6 209.82,-65.8 206.86,-75.97 213.85,-75.6"/>
+<text xml:space="preserve" text-anchor="middle" x="230.53" y="-172.38" font-family="Helvetica,sans-Serif" font-size="10.00">Store</text>
+<text xml:space="preserve" text-anchor="middle" x="230.53" y="-159.62" font-family="Helvetica,sans-Serif" font-size="10.00">Alerts</text>
+</g>
+<!-- collector&#45;&gt;aggregator -->
+<g id="edge6" class="edge">
+<title>collector&#45;&gt;aggregator</title>
+<path fill="none" stroke="#388e3c" d="M310.96,-248.55C310.53,-234.65 309.9,-214.73 309.39,-198.45"/>
+<polygon fill="#388e3c" stroke="#388e3c" points="312.9,-198.77 309.09,-188.89 305.91,-198.99 312.9,-198.77"/>
+<text xml:space="preserve" text-anchor="middle" x="327.98" y="-221.12" font-family="Helvetica,sans-Serif" font-size="10.00">gRPC</text>
+<text xml:space="preserve" text-anchor="middle" x="327.98" y="-208.38" font-family="Helvetica,sans-Serif" font-size="10.00">Stream</text>
+</g>
+<!-- events&#45;&gt;gateway -->
+<g id="edge11" class="edge">
+<title>events&#45;&gt;gateway</title>
+<path fill="none" stroke="#7b1fa2" d="M281.13,-378.02C267.86,-372.71 253.29,-367.44 239.5,-363.62 212.49,-356.16 199.25,-370.98 177.25,-353.62 159.49,-339.61 150.46,-315.21 145.93,-295.98"/>
+<polygon fill="#7b1fa2" stroke="#7b1fa2" points="149.38,-295.39 143.95,-286.29 142.52,-296.79 149.38,-295.39"/>
+<text xml:space="preserve" text-anchor="middle" x="200.88" y="-337.75" font-family="Helvetica,sans-Serif" font-size="10.00">Subscribe</text>
+</g>
+<!-- events&#45;&gt;alerts -->
+<g id="edge10" class="edge">
+<title>events&#45;&gt;alerts</title>
+<path fill="none" stroke="#7b1fa2" d="M277.27,-380.98C264.23,-374.18 251.36,-365.21 242.25,-353.62 229.43,-337.32 224.08,-314.36 221.89,-296.26"/>
+<polygon fill="#7b1fa2" stroke="#7b1fa2" points="225.38,-296.07 220.98,-286.43 218.41,-296.71 225.38,-296.07"/>
+<text xml:space="preserve" text-anchor="middle" x="265.88" y="-337.75" font-family="Helvetica,sans-Serif" font-size="10.00">Subscribe</text>
+</g>
+</g>
+</svg>
diff --git a/docs/architecture/02-data-flow.dot b/docs/architecture/02-data-flow.dot
new file mode 100644
index 0000000..ac77851
--- /dev/null
+++ b/docs/architecture/02-data-flow.dot
@@ -0,0 +1,83 @@
+digraph DataFlow {
+    rankdir=LR;  // left-to-right pipeline
+    compound=true;  // only affects edges that use lhead/ltail; none below do
+    fontname="Helvetica";
+    node [fontname="Helvetica", fontsize=10];
+    edge [fontname="Helvetica", fontsize=9];
+
+    labelloc="t";
+    label="Metrics Data Flow Pipeline";
+    fontsize=14;
+
+    node [shape=box, style="rounded,filled"];
+
+    // Collection (the cluster label advertises a 5s interval)
+    subgraph cluster_collect {
+        label="Collection (5s)";
+        style=filled;
+        fillcolor="#E3F2FD";
+
+        psutil [label="psutil\n(CPU, Mem, Disk)", shape=component, fillcolor="#BBDEFB"];
+        collector [label="Collector\nService", fillcolor="#90CAF9"];
+    }
+
+    // Ingestion
+    subgraph cluster_ingest {
+        label="Ingestion";
+        style=filled;
+        fillcolor="#E8F5E9";
+
+        aggregator [label="Aggregator\n(gRPC)", fillcolor="#A5D6A7"];
+        validate [label="Validate &\nNormalize", shape=diamond, fillcolor="#C8E6C9"];
+    }
+
+    // Storage Hot
+    subgraph cluster_hot {
+        label="Hot Path (Real-time)";
+        style=filled;
+        fillcolor="#FFF3E0";
+
+        redis_state [label="Redis\nCurrent State", shape=cylinder, fillcolor="#FFCC80"];
+        redis_pubsub [label="Redis\nPub/Sub", shape=hexagon, fillcolor="#FFB74D"];
+    }
+
+    // Storage Warm: labels presumably read (resolution, retention) — confirm against the DB schema
+    subgraph cluster_warm {
+        label="Warm Path (Historical)";
+        style=filled;
+        fillcolor="#FCE4EC";
+
+        raw [label="metrics_raw\n(5s, 24h)", shape=cylinder, fillcolor="#F8BBD9"];
+        agg_1m [label="metrics_1m\n(1m, 7d)", shape=cylinder, fillcolor="#F48FB1"];
+        agg_1h [label="metrics_1h\n(1h, 90d)", shape=cylinder, fillcolor="#EC407A"];
+    }
+
+    // Consumers (lambda is the only node drawn with a dashed border)
+    subgraph cluster_consume {
+        label="Consumers";
+        style=filled;
+        fillcolor="#E8EAF6";
+
+        alerts [label="Alert\nService", fillcolor="#C5CAE9"];
+        gateway [label="Gateway\n(WebSocket)", fillcolor="#9FA8DA"];
+        lambda [label="Lambda\nAggregator", fillcolor="#7986CB", style="rounded,filled,dashed"];
+    }
+
+    // Flow: dashed edges are table-to-table aggregation; dotted edges are the Lambda path
+    psutil -> collector [label="Metrics"];
+    collector -> aggregator [label="gRPC\nStream"];
+    aggregator -> validate;
+
+    validate -> redis_state [label="Upsert"];
+    validate -> redis_pubsub [label="Publish"];
+    validate -> raw [label="Insert"];
+
+    redis_pubsub -> alerts [label="metrics.*"];
+    redis_pubsub -> gateway [label="metrics.*"];
+
+    raw -> agg_1m [label="Continuous\nAggregate", style=dashed];
+    agg_1m -> agg_1h [label="Hourly\nJob", style=dashed];
+
+    raw -> lambda [label="SQS\nTrigger", style=dotted];
+    lambda -> agg_1m [label="Batch\nWrite", style=dotted];
+}
diff --git a/docs/architecture/02-data-flow.svg b/docs/architecture/02-data-flow.svg
new file mode 100644
index 0000000..5735a45
--- /dev/null
+++ b/docs/architecture/02-data-flow.svg
@@ -0,0 +1,217 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 14.1.1 (0)
+ -->
+<!-- Title: DataFlow Pages: 1 -->
+<svg width="1087pt" height="329pt"
+ viewBox="0.00 0.00 1087.00 329.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 325.25)">
+<title>DataFlow</title>
+<polygon fill="white" stroke="none" points="-4,4 -4,-325.25 1082.5,-325.25 1082.5,4 -4,4"/>
+<text xml:space="preserve" text-anchor="middle" x="539.25" y="-303.95" font-family="Helvetica,sans-Serif" font-size="14.00">Metrics Data Flow Pipeline</text>
+<g id="clust1" class="cluster">
+<title>cluster_collect</title>
+<polygon fill="#e3f2fd" stroke="black" points="8,-111 8,-188 254,-188 254,-111 8,-111"/>
+<text xml:space="preserve" text-anchor="middle" x="131" y="-170.7" font-family="Helvetica,sans-Serif" font-size="14.00">Collection (5s)</text>
+</g>
+<g id="clust2" class="cluster">
+<title>cluster_ingest</title>
+<polygon fill="#e8f5e9" stroke="black" points="307,-95 307,-204 562.5,-204 562.5,-95 307,-95"/>
+<text xml:space="preserve" text-anchor="middle" x="434.75" y="-186.7" font-family="Helvetica,sans-Serif" font-size="14.00">Ingestion</text>
+</g>
+<g id="clust3" class="cluster">
+<title>cluster_hot</title>
+<polygon fill="#fff3e0" stroke="black" points="614.75,-34 614.75,-193 769.5,-193 769.5,-34 614.75,-34"/>
+<text xml:space="preserve" text-anchor="middle" x="692.12" y="-175.7" font-family="Helvetica,sans-Serif" font-size="14.00">Hot Path (Real&#45;time)</text>
+</g>
+<g id="clust4" class="cluster">
+<title>cluster_warm</title>
+<polygon fill="#fce4ec" stroke="black" points="645.62,-201 645.62,-288 1070.5,-288 1070.5,-201 645.62,-201"/>
+<text xml:space="preserve" text-anchor="middle" x="858.06" y="-270.7" font-family="Helvetica,sans-Serif" font-size="14.00">Warm Path (Historical)</text>
+</g>
+<g id="clust5" class="cluster">
+<title>cluster_consume</title>
+<polygon fill="#e8eaf6" stroke="black" points="840.5,-8 840.5,-193 935.25,-193 935.25,-8 840.5,-8"/>
+<text xml:space="preserve" text-anchor="middle" x="887.88" y="-175.7" font-family="Helvetica,sans-Serif" font-size="14.00">Consumers</text>
+</g>
+<!-- psutil -->
+<g id="node1" class="node">
+<title>psutil</title>
+<polygon fill="#bbdefb" stroke="black" points="118.25,-155 16,-155 16,-151 12,-151 12,-147 16,-147 16,-127 12,-127 12,-123 16,-123 16,-119 118.25,-119 118.25,-155"/>
+<polyline fill="none" stroke="black" points="16,-151 20,-151 20,-147 16,-147"/>
+<polyline fill="none" stroke="black" points="16,-127 20,-127 20,-123 16,-123"/>
+<text xml:space="preserve" text-anchor="middle" x="67.13" y="-140.25" font-family="Helvetica,sans-Serif" font-size="10.00">psutil</text>
+<text xml:space="preserve" text-anchor="middle" x="67.13" y="-127.5" font-family="Helvetica,sans-Serif" font-size="10.00">(CPU, Mem, Disk)</text>
+</g>
+<!-- collector -->
+<g id="node2" class="node">
+<title>collector</title>
+<path fill="#90caf9" stroke="black" d="M234,-155C234,-155 198.5,-155 198.5,-155 192.5,-155 186.5,-149 186.5,-143 186.5,-143 186.5,-131 186.5,-131 186.5,-125 192.5,-119 198.5,-119 198.5,-119 234,-119 234,-119 240,-119 246,-125 246,-131 246,-131 246,-143 246,-143 246,-149 240,-155 234,-155"/>
+<text xml:space="preserve" text-anchor="middle" x="216.25" y="-140.25" font-family="Helvetica,sans-Serif" font-size="10.00">Collector</text>
+<text xml:space="preserve" text-anchor="middle" x="216.25" y="-127.5" font-family="Helvetica,sans-Serif" font-size="10.00">Service</text>
+</g>
+<!-- psutil&#45;&gt;collector -->
+<g id="edge1" class="edge">
+<title>psutil&#45;&gt;collector</title>
+<path fill="none" stroke="black" d="M118.35,-137C136.74,-137 157.31,-137 174.75,-137"/>
+<polygon fill="black" stroke="black" points="174.75,-140.5 184.75,-137 174.75,-133.5 174.75,-140.5"/>
+<text xml:space="preserve" text-anchor="middle" x="152.38" y="-139.7" font-family="Helvetica,sans-Serif" font-size="9.00">Metrics</text>
+</g>
+<!-- aggregator -->
+<g id="node3" class="node">
+<title>aggregator</title>
+<path fill="#a5d6a7" stroke="black" d="M373,-155C373,-155 327,-155 327,-155 321,-155 315,-149 315,-143 315,-143 315,-131 315,-131 315,-125 321,-119 327,-119 327,-119 373,-119 373,-119 379,-119 385,-125 385,-131 385,-131 385,-143 385,-143 385,-149 379,-155 373,-155"/>
+<text xml:space="preserve" text-anchor="middle" x="350" y="-140.25" font-family="Helvetica,sans-Serif" font-size="10.00">Aggregator</text>
+<text xml:space="preserve" text-anchor="middle" x="350" y="-127.5" font-family="Helvetica,sans-Serif" font-size="10.00">(gRPC)</text>
+</g>
+<!-- collector&#45;&gt;aggregator -->
+<g id="edge2" class="edge">
+<title>collector&#45;&gt;aggregator</title>
+<path fill="none" stroke="black" d="M246.49,-137C263.19,-137 284.49,-137 303.35,-137"/>
+<polygon fill="black" stroke="black" points="303.2,-140.5 313.2,-137 303.2,-133.5 303.2,-140.5"/>
+<text xml:space="preserve" text-anchor="middle" x="280.5" y="-150.95" font-family="Helvetica,sans-Serif" font-size="9.00">gRPC</text>
+<text xml:space="preserve" text-anchor="middle" x="280.5" y="-139.7" font-family="Helvetica,sans-Serif" font-size="9.00">Stream</text>
+</g>
+<!-- validate -->
+<g id="node4" class="node">
+<title>validate</title>
+<path fill="#c8e6c9" stroke="black" d="M477.54,-165.08C477.54,-165.08 432.71,-142.42 432.71,-142.42 427.35,-139.71 427.35,-134.29 432.71,-131.58 432.71,-131.58 477.54,-108.92 477.54,-108.92 482.9,-106.21 493.6,-106.21 498.96,-108.92 498.96,-108.92 543.79,-131.58 543.79,-131.58 549.15,-134.29 549.15,-139.71 543.79,-142.42 543.79,-142.42 498.96,-165.08 498.96,-165.08 493.6,-167.79 482.9,-167.79 477.54,-165.08"/>
+<text xml:space="preserve" text-anchor="middle" x="488.25" y="-140.25" font-family="Helvetica,sans-Serif" font-size="10.00">Validate &amp;</text>
+<text xml:space="preserve" text-anchor="middle" x="488.25" y="-127.5" font-family="Helvetica,sans-Serif" font-size="10.00">Normalize</text>
+</g>
+<!-- aggregator&#45;&gt;validate -->
+<g id="edge3" class="edge">
+<title>aggregator&#45;&gt;validate</title>
+<path fill="none" stroke="black" d="M385.38,-137C392.95,-137 401.25,-137 409.76,-137"/>
+<polygon fill="black" stroke="black" points="409.49,-140.5 419.49,-137 409.49,-133.5 409.49,-140.5"/>
+</g>
+<!-- redis_state -->
+<g id="node5" class="node">
+<title>redis_state</title>
+<path fill="#ffcc80" stroke="black" d="M731.88,-155.84C731.88,-158.15 713.83,-160.03 691.62,-160.03 669.42,-160.03 651.38,-158.15 651.38,-155.84 651.38,-155.84 651.38,-118.16 651.38,-118.16 651.38,-115.85 669.42,-113.97 691.62,-113.97 713.83,-113.97 731.88,-115.85 731.88,-118.16 731.88,-118.16 731.88,-155.84 731.88,-155.84"/>
+<path fill="none" stroke="black" d="M731.88,-155.84C731.88,-153.53 713.83,-151.66 691.62,-151.66 669.42,-151.66 651.38,-153.53 651.38,-155.84"/>
+<text xml:space="preserve" text-anchor="middle" x="691.62" y="-140.25" font-family="Helvetica,sans-Serif" font-size="10.00">Redis</text>
+<text xml:space="preserve" text-anchor="middle" x="691.62" y="-127.5" font-family="Helvetica,sans-Serif" font-size="10.00">Current State</text>
+</g>
+<!-- validate&#45;&gt;redis_state -->
+<g id="edge4" class="edge">
+<title>validate&#45;&gt;redis_state</title>
+<path fill="none" stroke="black" d="M555.47,-137C582.9,-137 614.22,-137 639.8,-137"/>
+<polygon fill="black" stroke="black" points="639.6,-140.5 649.6,-137 639.6,-133.5 639.6,-140.5"/>
+<text xml:space="preserve" text-anchor="middle" x="588.62" y="-139.7" font-family="Helvetica,sans-Serif" font-size="9.00">Upsert</text>
+</g>
+<!-- redis_pubsub -->
+<g id="node6" class="node">
+<title>redis_pubsub</title>
+<path fill="#ffb74d" stroke="black" d="M729.05,-78.12C729.05,-78.12 721.56,-87.24 721.56,-87.24 717.82,-91.79 708.18,-96.35 702.28,-96.35 702.28,-96.35 680.97,-96.35 680.97,-96.35 675.07,-96.35 665.43,-91.79 661.69,-87.24 661.69,-87.24 654.2,-78.12 654.2,-78.12 650.46,-73.56 650.46,-64.44 654.2,-59.88 654.2,-59.88 661.69,-50.76 661.69,-50.76 665.43,-46.21 675.07,-41.65 680.97,-41.65 680.97,-41.65 702.28,-41.65 702.28,-41.65 708.18,-41.65 717.82,-46.21 721.56,-50.76 721.56,-50.76 729.05,-59.88 729.05,-59.88 732.79,-64.44 732.79,-73.56 729.05,-78.12"/>
+<text xml:space="preserve" text-anchor="middle" x="691.62" y="-72.25" font-family="Helvetica,sans-Serif" font-size="10.00">Redis</text>
+<text xml:space="preserve" text-anchor="middle" x="691.62" y="-59.5" font-family="Helvetica,sans-Serif" font-size="10.00">Pub/Sub</text>
+</g>
+<!-- validate&#45;&gt;redis_pubsub -->
+<g id="edge5" class="edge">
+<title>validate&#45;&gt;redis_pubsub</title>
+<path fill="none" stroke="black" d="M529.04,-123.57C562.44,-112.28 610.18,-96.17 645.1,-84.37"/>
+<polygon fill="black" stroke="black" points="646.17,-87.71 654.53,-81.19 643.93,-81.07 646.17,-87.71"/>
+<text xml:space="preserve" text-anchor="middle" x="588.62" y="-109.77" font-family="Helvetica,sans-Serif" font-size="9.00">Publish</text>
+</g>
+<!-- raw -->
+<g id="node7" class="node">
+<title>raw</title>
+<path fill="#f8bbd9" stroke="black" d="M729.62,-250.84C729.62,-253.15 712.59,-255.03 691.62,-255.03 670.66,-255.03 653.62,-253.15 653.62,-250.84 653.62,-250.84 653.62,-213.16 653.62,-213.16 653.62,-210.85 670.66,-208.97 691.62,-208.97 712.59,-208.97 729.62,-210.85 729.62,-213.16 729.62,-213.16 729.62,-250.84 729.62,-250.84"/>
+<path fill="none" stroke="black" d="M729.62,-250.84C729.62,-248.53 712.59,-246.66 691.62,-246.66 670.66,-246.66 653.62,-248.53 653.62,-250.84"/>
+<text xml:space="preserve" text-anchor="middle" x="691.62" y="-235.25" font-family="Helvetica,sans-Serif" font-size="10.00">metrics_raw</text>
+<text xml:space="preserve" text-anchor="middle" x="691.62" y="-222.5" font-family="Helvetica,sans-Serif" font-size="10.00">(5s, 24h)</text>
+</g>
+<!-- validate&#45;&gt;raw -->
+<g id="edge6" class="edge">
+<title>validate&#45;&gt;raw</title>
+<path fill="none" stroke="black" d="M523.01,-153.3C548.24,-165.44 583.6,-182.37 614.75,-197 623.81,-201.26 633.5,-205.76 642.83,-210.07"/>
+<polygon fill="black" stroke="black" points="641.22,-213.19 651.77,-214.2 644.16,-206.83 641.22,-213.19"/>
+<text xml:space="preserve" text-anchor="middle" x="588.62" y="-194.9" font-family="Helvetica,sans-Serif" font-size="9.00">Insert</text>
+</g>
+<!-- alerts -->
+<g id="node10" class="node">
+<title>alerts</title>
+<path fill="#c5cae9" stroke="black" d="M902.38,-106C902.38,-106 872.38,-106 872.38,-106 866.38,-106 860.38,-100 860.38,-94 860.38,-94 860.38,-82 860.38,-82 860.38,-76 866.38,-70 872.38,-70 872.38,-70 902.38,-70 902.38,-70 908.38,-70 914.38,-76 914.38,-82 914.38,-82 914.38,-94 914.38,-94 914.38,-100 908.38,-106 902.38,-106"/>
+<text xml:space="preserve" text-anchor="middle" x="887.38" y="-91.25" font-family="Helvetica,sans-Serif" font-size="10.00">Alert</text>
+<text xml:space="preserve" text-anchor="middle" x="887.38" y="-78.5" font-family="Helvetica,sans-Serif" font-size="10.00">Service</text>
+</g>
+<!-- redis_pubsub&#45;&gt;alerts -->
+<g id="edge7" class="edge">
+<title>redis_pubsub&#45;&gt;alerts</title>
+<path fill="none" stroke="black" d="M733.71,-73.03C767.65,-76.36 815.43,-81.04 848.46,-84.28"/>
+<polygon fill="black" stroke="black" points="848.11,-87.76 858.4,-85.26 848.79,-80.8 848.11,-87.76"/>
+<text xml:space="preserve" text-anchor="middle" x="805" y="-85.09" font-family="Helvetica,sans-Serif" font-size="9.00">metrics.*</text>
+</g>
+<!-- gateway -->
+<g id="node11" class="node">
+<title>gateway</title>
+<path fill="#9fa8da" stroke="black" d="M913.75,-52C913.75,-52 861,-52 861,-52 855,-52 849,-46 849,-40 849,-40 849,-28 849,-28 849,-22 855,-16 861,-16 861,-16 913.75,-16 913.75,-16 919.75,-16 925.75,-22 925.75,-28 925.75,-28 925.75,-40 925.75,-40 925.75,-46 919.75,-52 913.75,-52"/>
+<text xml:space="preserve" text-anchor="middle" x="887.38" y="-37.25" font-family="Helvetica,sans-Serif" font-size="10.00">Gateway</text>
+<text xml:space="preserve" text-anchor="middle" x="887.38" y="-24.5" font-family="Helvetica,sans-Serif" font-size="10.00">(WebSocket)</text>
+</g>
+<!-- redis_pubsub&#45;&gt;gateway -->
+<g id="edge8" class="edge">
+<title>redis_pubsub&#45;&gt;gateway</title>
+<path fill="none" stroke="black" d="M731.37,-62C761.89,-56.49 804.64,-48.77 837.51,-42.83"/>
+<polygon fill="black" stroke="black" points="837.98,-46.3 847.2,-41.08 836.74,-39.41 837.98,-46.3"/>
+<text xml:space="preserve" text-anchor="middle" x="805" y="-55.25" font-family="Helvetica,sans-Serif" font-size="9.00">metrics.*</text>
+</g>
+<!-- agg_1m -->
+<g id="node8" class="node">
+<title>agg_1m</title>
+<path fill="#f48fb1" stroke="black" d="M924.25,-250.84C924.25,-253.15 907.72,-255.03 887.38,-255.03 867.03,-255.03 850.5,-253.15 850.5,-250.84 850.5,-250.84 850.5,-213.16 850.5,-213.16 850.5,-210.85 867.03,-208.97 887.38,-208.97 907.72,-208.97 924.25,-210.85 924.25,-213.16 924.25,-213.16 924.25,-250.84 924.25,-250.84"/>
+<path fill="none" stroke="black" d="M924.25,-250.84C924.25,-248.53 907.72,-246.66 887.38,-246.66 867.03,-246.66 850.5,-248.53 850.5,-250.84"/>
+<text xml:space="preserve" text-anchor="middle" x="887.38" y="-235.25" font-family="Helvetica,sans-Serif" font-size="10.00">metrics_1m</text>
+<text xml:space="preserve" text-anchor="middle" x="887.38" y="-222.5" font-family="Helvetica,sans-Serif" font-size="10.00">(1m, 7d)</text>
+</g>
+<!-- raw&#45;&gt;agg_1m -->
+<g id="edge9" class="edge">
+<title>raw&#45;&gt;agg_1m</title>
+<path fill="none" stroke="black" stroke-dasharray="5,2" d="M729.98,-232C760.97,-232 805.22,-232 838.74,-232"/>
+<polygon fill="black" stroke="black" points="838.6,-235.5 848.6,-232 838.6,-228.5 838.6,-235.5"/>
+<text xml:space="preserve" text-anchor="middle" x="805" y="-245.95" font-family="Helvetica,sans-Serif" font-size="9.00">Continuous</text>
+<text xml:space="preserve" text-anchor="middle" x="805" y="-234.7" font-family="Helvetica,sans-Serif" font-size="9.00">Aggregate</text>
+</g>
+<!-- lambda -->
+<g id="node12" class="node">
+<title>lambda</title>
+<path fill="#7986cb" stroke="black" stroke-dasharray="5,2" d="M910.38,-160C910.38,-160 864.38,-160 864.38,-160 858.38,-160 852.38,-154 852.38,-148 852.38,-148 852.38,-136 852.38,-136 852.38,-130 858.38,-124 864.38,-124 864.38,-124 910.38,-124 910.38,-124 916.38,-124 922.38,-130 922.38,-136 922.38,-136 922.38,-148 922.38,-148 922.38,-154 916.38,-160 910.38,-160"/>
+<text xml:space="preserve" text-anchor="middle" x="887.38" y="-145.25" font-family="Helvetica,sans-Serif" font-size="10.00">Lambda</text>
+<text xml:space="preserve" text-anchor="middle" x="887.38" y="-132.5" font-family="Helvetica,sans-Serif" font-size="10.00">Aggregator</text>
+</g>
+<!-- raw&#45;&gt;lambda -->
+<g id="edge11" class="edge">
+<title>raw&#45;&gt;lambda</title>
+<path fill="none" stroke="black" stroke-dasharray="1,5" d="M729.81,-215.18C742.43,-209.45 756.59,-202.98 769.5,-197 793.37,-185.95 819.91,-173.48 841.65,-163.21"/>
+<polygon fill="black" stroke="black" points="843,-166.44 850.54,-159.01 840,-160.12 843,-166.44"/>
+<text xml:space="preserve" text-anchor="middle" x="805" y="-205.05" font-family="Helvetica,sans-Serif" font-size="9.00">SQS</text>
+<text xml:space="preserve" text-anchor="middle" x="805" y="-193.8" font-family="Helvetica,sans-Serif" font-size="9.00">Trigger</text>
+</g>
+<!-- agg_1h -->
+<g id="node9" class="node">
+<title>agg_1h</title>
+<path fill="#ec407a" stroke="black" d="M1062.5,-250.84C1062.5,-253.15 1046.81,-255.03 1027.5,-255.03 1008.19,-255.03 992.5,-253.15 992.5,-250.84 992.5,-250.84 992.5,-213.16 992.5,-213.16 992.5,-210.85 1008.19,-208.97 1027.5,-208.97 1046.81,-208.97 1062.5,-210.85 1062.5,-213.16 1062.5,-213.16 1062.5,-250.84 1062.5,-250.84"/>
+<path fill="none" stroke="black" d="M1062.5,-250.84C1062.5,-248.53 1046.81,-246.66 1027.5,-246.66 1008.19,-246.66 992.5,-248.53 992.5,-250.84"/>
+<text xml:space="preserve" text-anchor="middle" x="1027.5" y="-235.25" font-family="Helvetica,sans-Serif" font-size="10.00">metrics_1h</text>
+<text xml:space="preserve" text-anchor="middle" x="1027.5" y="-222.5" font-family="Helvetica,sans-Serif" font-size="10.00">(1h, 90d)</text>
+</g>
+<!-- agg_1m&#45;&gt;agg_1h -->
+<g id="edge10" class="edge">
+<title>agg_1m&#45;&gt;agg_1h</title>
+<path fill="none" stroke="black" stroke-dasharray="5,2" d="M924.67,-232C941.93,-232 962.74,-232 981.04,-232"/>
+<polygon fill="black" stroke="black" points="980.84,-235.5 990.84,-232 980.84,-228.5 980.84,-235.5"/>
+<text xml:space="preserve" text-anchor="middle" x="959.88" y="-245.95" font-family="Helvetica,sans-Serif" font-size="9.00">Hourly</text>
+<text xml:space="preserve" text-anchor="middle" x="959.88" y="-234.7" font-family="Helvetica,sans-Serif" font-size="9.00">Job</text>
+</g>
+<!-- lambda&#45;&gt;agg_1m -->
+<g id="edge12" class="edge">
+<title>lambda&#45;&gt;agg_1m</title>
+<path fill="none" stroke="black" stroke-dasharray="1,5" d="M887.38,-160.21C887.38,-170.91 887.38,-184.78 887.38,-197.47"/>
+<polygon fill="black" stroke="black" points="883.88,-197.16 887.38,-207.16 890.88,-197.16 883.88,-197.16"/>
+<text xml:space="preserve" text-anchor="middle" x="873.12" y="-187.18" font-family="Helvetica,sans-Serif" font-size="9.00">Batch</text>
+<text xml:space="preserve" text-anchor="middle" x="873.12" y="-175.93" font-family="Helvetica,sans-Serif" font-size="9.00">Write</text>
+</g>
+</g>
+</svg>
diff --git a/docs/architecture/03-deployment.dot b/docs/architecture/03-deployment.dot
new file mode 100644
index 0000000..fe3b29d
--- /dev/null
+++ b/docs/architecture/03-deployment.dot
@@ -0,0 +1,95 @@
+digraph Deployment {  // Deployment topology: local dev, AWS, CI/CD and the monitored machines
+    rankdir=TB;  // ranks flow top-to-bottom
+    compound=true;  // required for the lhead= cluster clipping used on the collector edges below
+    fontname="Helvetica";
+    node [fontname="Helvetica", fontsize=10];
+    edge [fontname="Helvetica", fontsize=9];
+
+    labelloc="t";  // graph title goes at the top
+    label="Deployment Architecture";
+    fontsize=14;
+
+    node [shape=box, style="rounded,filled"];  // default look; a per-node style= replaces this list entirely
+
+    // Local development: a Kind cluster driven by Tilt, or plain Docker Compose as the alternative
+    subgraph cluster_local {
+        label="Local Development";
+        style=filled;
+        fillcolor="#E3F2FD";
+
+        subgraph cluster_kind {
+            label="Kind Cluster";
+            style=filled;
+            fillcolor="#BBDEFB";
+
+            tilt [label="Tilt\n(Live Reload)", shape=component, fillcolor="#90CAF9"];
+            k8s_local [label="K8s Pods\n(via Kustomize)", fillcolor="#64B5F6"];
+        }
+
+        compose [label="Docker Compose\n(Alternative)", fillcolor="#90CAF9", style="rounded,filled,dashed"];  // "filled" must be listed or fillcolor is not drawn
+    }
+
+    // AWS staging/demo: one EC2 host plus Lambda, SQS and S3
+    subgraph cluster_aws {
+        label="AWS (sysmonstm.mcrn.ar)";
+        style=filled;
+        fillcolor="#E8F5E9";
+
+        subgraph cluster_ec2 {
+            label="EC2 t2.small";
+            style=filled;
+            fillcolor="#C8E6C9";
+
+            compose_ec2 [label="Docker Compose\n(All Services)", fillcolor="#A5D6A7"];
+            nginx [label="Nginx\n(SSL Termination)", fillcolor="#81C784"];
+        }
+
+        subgraph cluster_lambda {
+            label="Lambda (Data Processing)";
+            style=filled;
+            fillcolor="#DCEDC8";
+
+            lambda_agg [label="Aggregator\nLambda", fillcolor="#AED581"];
+            lambda_compact [label="Compactor\nLambda", fillcolor="#9CCC65"];
+        }
+
+        sqs [label="SQS\n(Buffer)", shape=hexagon, fillcolor="#FFE082"];
+        s3 [label="S3\n(Backup)", shape=cylinder, fillcolor="#FFE082"];
+    }
+
+    // CI/CD: Woodpecker pushes images to the container registry
+    subgraph cluster_cicd {
+        label="CI/CD";
+        style=filled;
+        fillcolor="#F3E5F5";
+
+        woodpecker [label="Woodpecker CI", fillcolor="#CE93D8"];
+        registry [label="Container\nRegistry", shape=cylinder, fillcolor="#BA68C8"];
+    }
+
+    // Collectors (external): cluster drawn with a dashed gray outline instead of a fill
+    subgraph cluster_collectors {
+        label="Monitored Machines";
+        style=dashed;
+        color=gray;
+
+        coll1 [label="Collector\n(Machine 1)", fillcolor="#FFCCBC"];
+        coll2 [label="Collector\n(Machine 2)", fillcolor="#FFCCBC"];
+        coll3 [label="Collector\n(Machine N)", fillcolor="#FFCCBC"];
+    }
+
+    // Connections
+    tilt -> k8s_local [style=invis];  // invisible edge: only influences ranking, nothing is drawn
+    woodpecker -> registry [label="Push"];
+    registry -> compose_ec2 [label="Pull"];
+    registry -> k8s_local [label="Pull", style=dashed];
+
+    nginx -> compose_ec2 [label="Proxy"];
+    compose_ec2 -> sqs [label="Events"];
+    sqs -> lambda_agg [label="Trigger"];
+    lambda_compact -> s3 [label="Archive"];
+
+    coll1 -> compose_ec2 [label="gRPC", lhead=cluster_ec2];  // lhead clips the arrow at the EC2 cluster border
+    coll2 -> compose_ec2 [label="gRPC", lhead=cluster_ec2];
+    coll3 -> compose_ec2 [label="gRPC", lhead=cluster_ec2];
+}
diff --git a/docs/architecture/03-deployment.svg b/docs/architecture/03-deployment.svg
new file mode 100644
index 0000000..cc1cf45
--- /dev/null
+++ b/docs/architecture/03-deployment.svg
@@ -0,0 +1,221 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 14.1.1 (0)
+ -->
+<!-- Title: Deployment Pages: 1 -->
+<svg width="872pt" height="662pt"
+ viewBox="0.00 0.00 872.00 662.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 658.3)">
+<title>Deployment</title>
+<polygon fill="white" stroke="none" points="-4,4 -4,-658.3 868,-658.3 868,4 -4,4"/>
+<text xml:space="preserve" text-anchor="middle" x="432" y="-637" font-family="Helvetica,sans-Serif" font-size="14.00">Deployment Architecture</text>
+<g id="clust1" class="cluster">
+<title>cluster_local</title>
+<polygon fill="#e3f2fd" stroke="black" points="8,-307.77 8,-514.55 238,-514.55 238,-307.77 8,-307.77"/>
+<text xml:space="preserve" text-anchor="middle" x="123" y="-497.25" font-family="Helvetica,sans-Serif" font-size="14.00">Local Development</text>
+</g>
+<g id="clust2" class="cluster">
+<title>cluster_kind</title>
+<polygon fill="#bbdefb" stroke="black" points="16,-315.77 16,-481.3 124,-481.3 124,-315.77 16,-315.77"/>
+<text xml:space="preserve" text-anchor="middle" x="70" y="-464" font-family="Helvetica,sans-Serif" font-size="14.00">Kind Cluster</text>
+</g>
+<g id="clust3" class="cluster">
+<title>cluster_aws</title>
+<polygon fill="#e8f5e9" stroke="black" points="642,-8 642,-514.55 856,-514.55 856,-8 642,-8"/>
+<text xml:space="preserve" text-anchor="middle" x="749" y="-497.25" font-family="Helvetica,sans-Serif" font-size="14.00">AWS (sysmonstm.mcrn.ar)</text>
+</g>
+<g id="clust4" class="cluster">
+<title>cluster_ec2</title>
+<polygon fill="#c8e6c9" stroke="black" points="650,-315.77 650,-481.3 768,-481.3 768,-315.77 650,-315.77"/>
+<text xml:space="preserve" text-anchor="middle" x="709" y="-464" font-family="Helvetica,sans-Serif" font-size="14.00">EC2 t2.small</text>
+</g>
+<g id="clust5" class="cluster">
+<title>cluster_lambda</title>
+<polygon fill="#dcedc8" stroke="black" points="650,-101.31 650,-178.56 848,-178.56 848,-101.31 650,-101.31"/>
+<text xml:space="preserve" text-anchor="middle" x="749" y="-161.26" font-family="Helvetica,sans-Serif" font-size="14.00">Lambda (Data Processing)</text>
+</g>
+<g id="clust6" class="cluster">
+<title>cluster_cicd</title>
+<polygon fill="#f3e5f5" stroke="black" points="246,-399.02 246,-621.05 350,-621.05 350,-399.02 246,-399.02"/>
+<text xml:space="preserve" text-anchor="middle" x="298" y="-603.75" font-family="Helvetica,sans-Serif" font-size="14.00">CI/CD</text>
+</g>
+<g id="clust7" class="cluster">
+<title>cluster_collectors</title>
+<polygon fill="none" stroke="gray" stroke-dasharray="5,2" points="358,-404.05 358,-481.3 634,-481.3 634,-404.05 358,-404.05"/>
+<text xml:space="preserve" text-anchor="middle" x="496" y="-464" font-family="Helvetica,sans-Serif" font-size="14.00">Monitored Machines</text>
+</g>
+<!-- tilt -->
+<g id="node1" class="node">
+<title>tilt</title>
+<polygon fill="#90caf9" stroke="black" points="110.25,-448.05 29.75,-448.05 29.75,-444.05 25.75,-444.05 25.75,-440.05 29.75,-440.05 29.75,-420.05 25.75,-420.05 25.75,-416.05 29.75,-416.05 29.75,-412.05 110.25,-412.05 110.25,-448.05"/>
+<polyline fill="none" stroke="black" points="29.75,-444.05 33.75,-444.05 33.75,-440.05 29.75,-440.05"/>
+<polyline fill="none" stroke="black" points="29.75,-420.05 33.75,-420.05 33.75,-416.05 29.75,-416.05"/>
+<text xml:space="preserve" text-anchor="middle" x="70" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Tilt</text>
+<text xml:space="preserve" text-anchor="middle" x="70" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(Live Reload)</text>
+</g>
+<!-- k8s_local -->
+<g id="node2" class="node">
+<title>k8s_local</title>
+<path fill="#64b5f6" stroke="black" d="M104.25,-359.77C104.25,-359.77 35.75,-359.77 35.75,-359.77 29.75,-359.77 23.75,-353.77 23.75,-347.77 23.75,-347.77 23.75,-335.77 23.75,-335.77 23.75,-329.77 29.75,-323.77 35.75,-323.77 35.75,-323.77 104.25,-323.77 104.25,-323.77 110.25,-323.77 116.25,-329.77 116.25,-335.77 116.25,-335.77 116.25,-347.77 116.25,-347.77 116.25,-353.77 110.25,-359.77 104.25,-359.77"/>
+<text xml:space="preserve" text-anchor="middle" x="70" y="-345.02" font-family="Helvetica,sans-Serif" font-size="10.00">K8s Pods</text>
+<text xml:space="preserve" text-anchor="middle" x="70" y="-332.27" font-family="Helvetica,sans-Serif" font-size="10.00">(via Kustomize)</text>
+</g>
+<!-- tilt&#45;&gt;k8s_local -->
+<!-- compose -->
+<g id="node3" class="node">
+<title>compose</title>
+<path fill="none" stroke="black" stroke-dasharray="5,2" d="M218.25,-448.05C218.25,-448.05 143.75,-448.05 143.75,-448.05 137.75,-448.05 131.75,-442.05 131.75,-436.05 131.75,-436.05 131.75,-424.05 131.75,-424.05 131.75,-418.05 137.75,-412.05 143.75,-412.05 143.75,-412.05 218.25,-412.05 218.25,-412.05 224.25,-412.05 230.25,-418.05 230.25,-424.05 230.25,-424.05 230.25,-436.05 230.25,-436.05 230.25,-442.05 224.25,-448.05 218.25,-448.05"/>
+<text xml:space="preserve" text-anchor="middle" x="181" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Docker Compose</text>
+<text xml:space="preserve" text-anchor="middle" x="181" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(Alternative)</text>
+</g>
+<!-- compose_ec2 -->
+<g id="node4" class="node">
+<title>compose_ec2</title>
+<path fill="#a5d6a7" stroke="black" d="M744.25,-359.77C744.25,-359.77 669.75,-359.77 669.75,-359.77 663.75,-359.77 657.75,-353.77 657.75,-347.77 657.75,-347.77 657.75,-335.77 657.75,-335.77 657.75,-329.77 663.75,-323.77 669.75,-323.77 669.75,-323.77 744.25,-323.77 744.25,-323.77 750.25,-323.77 756.25,-329.77 756.25,-335.77 756.25,-335.77 756.25,-347.77 756.25,-347.77 756.25,-353.77 750.25,-359.77 744.25,-359.77"/>
+<text xml:space="preserve" text-anchor="middle" x="707" y="-345.02" font-family="Helvetica,sans-Serif" font-size="10.00">Docker Compose</text>
+<text xml:space="preserve" text-anchor="middle" x="707" y="-332.27" font-family="Helvetica,sans-Serif" font-size="10.00">(All Services)</text>
+</g>
+<!-- sqs -->
+<g id="node8" class="node">
+<title>sqs</title>
+<path fill="#ffe082" stroke="black" d="M742.89,-252.28C742.89,-252.28 735.71,-261.4 735.71,-261.4 732.12,-265.96 722.73,-270.52 716.93,-270.52 716.93,-270.52 697.07,-270.52 697.07,-270.52 691.27,-270.52 681.88,-265.96 678.29,-261.4 678.29,-261.4 671.11,-252.28 671.11,-252.28 667.52,-247.72 667.52,-238.61 671.11,-234.05 671.11,-234.05 678.29,-224.93 678.29,-224.93 681.88,-220.37 691.27,-215.81 697.07,-215.81 697.07,-215.81 716.93,-215.81 716.93,-215.81 722.73,-215.81 732.12,-220.37 735.71,-224.93 735.71,-224.93 742.89,-234.05 742.89,-234.05 746.48,-238.61 746.48,-247.72 742.89,-252.28"/>
+<text xml:space="preserve" text-anchor="middle" x="707" y="-246.42" font-family="Helvetica,sans-Serif" font-size="10.00">SQS</text>
+<text xml:space="preserve" text-anchor="middle" x="707" y="-233.67" font-family="Helvetica,sans-Serif" font-size="10.00">(Buffer)</text>
+</g>
+<!-- compose_ec2&#45;&gt;sqs -->
+<g id="edge6" class="edge">
+<title>compose_ec2&#45;&gt;sqs</title>
+<path fill="none" stroke="black" d="M707,-323.5C707,-311.94 707,-296.26 707,-281.89"/>
+<polygon fill="black" stroke="black" points="710.5,-282.27 707,-272.27 703.5,-282.27 710.5,-282.27"/>
+<text xml:space="preserve" text-anchor="middle" x="722.38" y="-291.22" font-family="Helvetica,sans-Serif" font-size="9.00">Events</text>
+</g>
+<!-- nginx -->
+<g id="node5" class="node">
+<title>nginx</title>
+<path fill="#81c784" stroke="black" d="M747.75,-448.05C747.75,-448.05 670.25,-448.05 670.25,-448.05 664.25,-448.05 658.25,-442.05 658.25,-436.05 658.25,-436.05 658.25,-424.05 658.25,-424.05 658.25,-418.05 664.25,-412.05 670.25,-412.05 670.25,-412.05 747.75,-412.05 747.75,-412.05 753.75,-412.05 759.75,-418.05 759.75,-424.05 759.75,-424.05 759.75,-436.05 759.75,-436.05 759.75,-442.05 753.75,-448.05 747.75,-448.05"/>
+<text xml:space="preserve" text-anchor="middle" x="709" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Nginx</text>
+<text xml:space="preserve" text-anchor="middle" x="709" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(SSL Termination)</text>
+</g>
+<!-- nginx&#45;&gt;compose_ec2 -->
+<g id="edge5" class="edge">
+<title>nginx&#45;&gt;compose_ec2</title>
+<path fill="none" stroke="black" d="M708.6,-411.59C708.33,-400.13 707.98,-384.86 707.67,-371.63"/>
+<polygon fill="black" stroke="black" points="711.17,-371.63 707.44,-361.72 704.17,-371.79 711.17,-371.63"/>
+<text xml:space="preserve" text-anchor="middle" x="720.43" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">Proxy</text>
+</g>
+<!-- lambda_agg -->
+<g id="node6" class="node">
+<title>lambda_agg</title>
+<path fill="#aed581" stroke="black" d="M730,-145.31C730,-145.31 684,-145.31 684,-145.31 678,-145.31 672,-139.31 672,-133.31 672,-133.31 672,-121.31 672,-121.31 672,-115.31 678,-109.31 684,-109.31 684,-109.31 730,-109.31 730,-109.31 736,-109.31 742,-115.31 742,-121.31 742,-121.31 742,-133.31 742,-133.31 742,-139.31 736,-145.31 730,-145.31"/>
+<text xml:space="preserve" text-anchor="middle" x="707" y="-130.56" font-family="Helvetica,sans-Serif" font-size="10.00">Aggregator</text>
+<text xml:space="preserve" text-anchor="middle" x="707" y="-117.81" font-family="Helvetica,sans-Serif" font-size="10.00">Lambda</text>
+</g>
+<!-- lambda_compact -->
+<g id="node7" class="node">
+<title>lambda_compact</title>
+<path fill="#9ccc65" stroke="black" d="M822.62,-145.31C822.62,-145.31 777.38,-145.31 777.38,-145.31 771.38,-145.31 765.38,-139.31 765.38,-133.31 765.38,-133.31 765.38,-121.31 765.38,-121.31 765.38,-115.31 771.38,-109.31 777.38,-109.31 777.38,-109.31 822.62,-109.31 822.62,-109.31 828.62,-109.31 834.62,-115.31 834.62,-121.31 834.62,-121.31 834.62,-133.31 834.62,-133.31 834.62,-139.31 828.62,-145.31 822.62,-145.31"/>
+<text xml:space="preserve" text-anchor="middle" x="800" y="-130.56" font-family="Helvetica,sans-Serif" font-size="10.00">Compactor</text>
+<text xml:space="preserve" text-anchor="middle" x="800" y="-117.81" font-family="Helvetica,sans-Serif" font-size="10.00">Lambda</text>
+</g>
+<!-- s3 -->
+<g id="node9" class="node">
+<title>s3</title>
+<path fill="#ffe082" stroke="black" d="M829.38,-57.88C829.38,-60.19 816.21,-62.06 800,-62.06 783.79,-62.06 770.62,-60.19 770.62,-57.88 770.62,-57.88 770.62,-20.19 770.62,-20.19 770.62,-17.88 783.79,-16 800,-16 816.21,-16 829.38,-17.88 829.38,-20.19 829.38,-20.19 829.38,-57.88 829.38,-57.88"/>
+<path fill="none" stroke="black" d="M829.38,-57.88C829.38,-55.56 816.21,-53.69 800,-53.69 783.79,-53.69 770.62,-55.56 770.62,-57.88"/>
+<text xml:space="preserve" text-anchor="middle" x="800" y="-42.28" font-family="Helvetica,sans-Serif" font-size="10.00">S3</text>
+<text xml:space="preserve" text-anchor="middle" x="800" y="-29.53" font-family="Helvetica,sans-Serif" font-size="10.00">(Backup)</text>
+</g>
+<!-- lambda_compact&#45;&gt;s3 -->
+<g id="edge8" class="edge">
+<title>lambda_compact&#45;&gt;s3</title>
+<path fill="none" stroke="black" d="M800,-108.85C800,-98.81 800,-85.84 800,-73.88"/>
+<polygon fill="black" stroke="black" points="803.5,-73.9 800,-63.9 796.5,-73.9 803.5,-73.9"/>
+<text xml:space="preserve" text-anchor="middle" x="816.88" y="-82.76" font-family="Helvetica,sans-Serif" font-size="9.00">Archive</text>
+</g>
+<!-- sqs&#45;&gt;lambda_agg -->
+<g id="edge7" class="edge">
+<title>sqs&#45;&gt;lambda_agg</title>
+<path fill="none" stroke="black" d="M707,-215.47C707,-197.96 707,-175.06 707,-157.13"/>
+<polygon fill="black" stroke="black" points="710.5,-157.15 707,-147.15 703.5,-157.15 710.5,-157.15"/>
+<text xml:space="preserve" text-anchor="middle" x="722.75" y="-189.26" font-family="Helvetica,sans-Serif" font-size="9.00">Trigger</text>
+</g>
+<!-- woodpecker -->
+<g id="node10" class="node">
+<title>woodpecker</title>
+<path fill="#ce93d8" stroke="black" d="M330,-587.8C330,-587.8 266,-587.8 266,-587.8 260,-587.8 254,-581.8 254,-575.8 254,-575.8 254,-563.8 254,-563.8 254,-557.8 260,-551.8 266,-551.8 266,-551.8 330,-551.8 330,-551.8 336,-551.8 342,-557.8 342,-563.8 342,-563.8 342,-575.8 342,-575.8 342,-581.8 336,-587.8 330,-587.8"/>
+<text xml:space="preserve" text-anchor="middle" x="298" y="-566.67" font-family="Helvetica,sans-Serif" font-size="10.00">Woodpecker CI</text>
+</g>
+<!-- registry -->
+<g id="node11" class="node">
+<title>registry</title>
+<path fill="#ba68c8" stroke="black" d="M329.62,-448.89C329.62,-451.2 315.45,-453.08 298,-453.08 280.55,-453.08 266.38,-451.2 266.38,-448.89 266.38,-448.89 266.38,-411.21 266.38,-411.21 266.38,-408.89 280.55,-407.02 298,-407.02 315.45,-407.02 329.62,-408.89 329.62,-411.21 329.62,-411.21 329.62,-448.89 329.62,-448.89"/>
+<path fill="none" stroke="black" d="M329.62,-448.89C329.62,-446.58 315.45,-444.71 298,-444.71 280.55,-444.71 266.38,-446.58 266.38,-448.89"/>
+<text xml:space="preserve" text-anchor="middle" x="298" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Container</text>
+<text xml:space="preserve" text-anchor="middle" x="298" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">Registry</text>
+</g>
+<!-- woodpecker&#45;&gt;registry -->
+<g id="edge2" class="edge">
+<title>woodpecker&#45;&gt;registry</title>
+<path fill="none" stroke="black" d="M298,-551.35C298,-529.66 298,-492.15 298,-464.77"/>
+<polygon fill="black" stroke="black" points="301.5,-464.88 298,-454.88 294.5,-464.88 301.5,-464.88"/>
+<text xml:space="preserve" text-anchor="middle" x="308.88" y="-525.25" font-family="Helvetica,sans-Serif" font-size="9.00">Push</text>
+</g>
+<!-- registry&#45;&gt;k8s_local -->
+<g id="edge4" class="edge">
+<title>registry&#45;&gt;k8s_local</title>
+<path fill="none" stroke="black" stroke-dasharray="5,2" d="M265.9,-410.59C258.2,-406.51 249.91,-402.4 242,-399.02 204.6,-383.02 161.03,-368.81 127.1,-358.68"/>
+<polygon fill="black" stroke="black" points="128.47,-355.44 117.89,-355.97 126.49,-362.15 128.47,-355.44"/>
+<text xml:space="preserve" text-anchor="middle" x="222.42" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">Pull</text>
+</g>
+<!-- registry&#45;&gt;compose_ec2 -->
+<g id="edge3" class="edge">
+<title>registry&#45;&gt;compose_ec2</title>
+<path fill="none" stroke="black" d="M329.84,-409.93C337.55,-405.88 345.91,-401.95 354,-399.02 452.44,-363.35 574.46,-350.26 646.22,-345.49"/>
+<polygon fill="black" stroke="black" points="646.02,-349.01 655.78,-344.88 645.58,-342.02 646.02,-349.01"/>
+<text xml:space="preserve" text-anchor="middle" x="427.09" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">Pull</text>
+</g>
+<!-- coll1 -->
+<g id="node12" class="node">
+<title>coll1</title>
+<path fill="#ffccbc" stroke="black" d="M521.88,-448.05C521.88,-448.05 472.12,-448.05 472.12,-448.05 466.12,-448.05 460.12,-442.05 460.12,-436.05 460.12,-436.05 460.12,-424.05 460.12,-424.05 460.12,-418.05 466.12,-412.05 472.12,-412.05 472.12,-412.05 521.88,-412.05 521.88,-412.05 527.88,-412.05 533.88,-418.05 533.88,-424.05 533.88,-424.05 533.88,-436.05 533.88,-436.05 533.88,-442.05 527.88,-448.05 521.88,-448.05"/>
+<text xml:space="preserve" text-anchor="middle" x="497" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Collector</text>
+<text xml:space="preserve" text-anchor="middle" x="497" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(Machine 1)</text>
+</g>
+<!-- coll1&#45;&gt;compose_ec2 -->
+<g id="edge9" class="edge">
+<title>coll1&#45;&gt;compose_ec2</title>
+<path fill="none" stroke="black" d="M521.16,-411.67C528.02,-407.19 535.63,-402.62 543,-399.02 576.02,-382.89 614.85,-369.35 646.44,-359.6"/>
+<polygon fill="black" stroke="black" points="640.37,-365.52 648.58,-358.82 637.98,-358.94 640.37,-365.52"/>
+<text xml:space="preserve" text-anchor="middle" x="602.75" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">gRPC</text>
+</g>
+<!-- coll2 -->
+<g id="node13" class="node">
+<title>coll2</title>
+<path fill="#ffccbc" stroke="black" d="M613.88,-448.05C613.88,-448.05 564.12,-448.05 564.12,-448.05 558.12,-448.05 552.12,-442.05 552.12,-436.05 552.12,-436.05 552.12,-424.05 552.12,-424.05 552.12,-418.05 558.12,-412.05 564.12,-412.05 564.12,-412.05 613.88,-412.05 613.88,-412.05 619.88,-412.05 625.88,-418.05 625.88,-424.05 625.88,-424.05 625.88,-436.05 625.88,-436.05 625.88,-442.05 619.88,-448.05 613.88,-448.05"/>
+<text xml:space="preserve" text-anchor="middle" x="589" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Collector</text>
+<text xml:space="preserve" text-anchor="middle" x="589" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(Machine 2)</text>
+</g>
+<!-- coll2&#45;&gt;compose_ec2 -->
+<g id="edge10" class="edge">
+<title>coll2&#45;&gt;compose_ec2</title>
+<path fill="none" stroke="black" d="M612.88,-411.59C621.13,-405.55 630.83,-398.47 640.8,-391.17"/>
+<polygon fill="black" stroke="black" points="642.77,-394.07 648.78,-385.34 638.64,-388.41 642.77,-394.07"/>
+<text xml:space="preserve" text-anchor="middle" x="670.19" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">gRPC</text>
+</g>
+<!-- coll3 -->
+<g id="node14" class="node">
+<title>coll3</title>
+<path fill="#ffccbc" stroke="black" d="M429.62,-448.05C429.62,-448.05 378.38,-448.05 378.38,-448.05 372.38,-448.05 366.38,-442.05 366.38,-436.05 366.38,-436.05 366.38,-424.05 366.38,-424.05 366.38,-418.05 372.38,-412.05 378.38,-412.05 378.38,-412.05 429.62,-412.05 429.62,-412.05 435.62,-412.05 441.62,-418.05 441.62,-424.05 441.62,-424.05 441.62,-436.05 441.62,-436.05 441.62,-442.05 435.62,-448.05 429.62,-448.05"/>
+<text xml:space="preserve" text-anchor="middle" x="404" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Collector</text>
+<text xml:space="preserve" text-anchor="middle" x="404" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(Machine N)</text>
+</g>
+<!-- coll3&#45;&gt;compose_ec2 -->
+<g id="edge11" class="edge">
+<title>coll3&#45;&gt;compose_ec2</title>
+<path fill="none" stroke="black" d="M427.53,-411.82C434.78,-407.12 442.97,-402.41 451,-399.02 514.86,-372.07 593.36,-357.28 646.47,-349.71"/>
+<polygon fill="black" stroke="black" points="639.16,-354.39 648.5,-349.4 638.08,-347.48 639.16,-354.39"/>
+<text xml:space="preserve" text-anchor="middle" x="516.54" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">gRPC</text>
+</g>
+</g>
+</svg>
diff --git a/docs/architecture/04-grpc-services.dot b/docs/architecture/04-grpc-services.dot
new file mode 100644
index 0000000..9b06929
--- /dev/null
+++ b/docs/architecture/04-grpc-services.dot
@@ -0,0 +1,67 @@
+digraph GrpcServices {
+    rankdir=LR;
+    compound=true;
+    fontname="Helvetica";
+    node [fontname="Helvetica", fontsize=10];
+    edge [fontname="Helvetica", fontsize=9];
+
+    labelloc="t";
+    label="gRPC Service Definitions";
+    fontsize=14;
+
+    node [shape=record, style=filled];
+
+    // MetricsService
+    subgraph cluster_metrics {
+        label="MetricsService";
+        style=filled;
+        fillcolor="#E8F5E9";
+
+        metrics_svc [label="{MetricsService|+ StreamMetrics(stream Metric) → StreamAck\l+ GetCurrentState(StateRequest) → MachineState\l+ GetAllStates(Empty) → AllMachinesState\l}", fillcolor="#C8E6C9"];
+
+        metric_msg [label="{Metric|machine_id: string\lhostname: string\ltimestamp_ms: int64\ltype: MetricType\lvalue: double\llabels: map\l}", fillcolor="#A5D6A7"];
+
+        machine_state [label="{MachineState|machine_id: string\lhostname: string\llast_seen_ms: int64\lcurrent_metrics: Metric[]\lhealth: HealthStatus\lmetadata: map\l}", fillcolor="#A5D6A7"];
+    }
+
+    // ControlService
+    subgraph cluster_control {
+        label="ControlService";
+        style=filled;
+        fillcolor="#E3F2FD";
+
+        control_svc [label="{ControlService|+ Control(stream Command) → stream Response\l}", fillcolor="#90CAF9"];
+
+        commands [label="{ControlCommand|command_id: string\l|UpdateIntervalCommand\lRestartCollectionCommand\lShutdownCommand\l}", fillcolor="#64B5F6"];
+    }
+
+    // ConfigService
+    subgraph cluster_config {
+        label="ConfigService";
+        style=filled;
+        fillcolor="#FFF3E0";
+
+        config_svc [label="{ConfigService|+ GetConfig(ConfigRequest) → CollectorConfig\l+ WatchConfig(ConfigRequest) → stream CollectorConfig\l}", fillcolor="#FFE0B2"];
+
+        collector_config [label="{CollectorConfig|collection_interval_seconds: int32\lenabled_metrics: MetricType[]\llabels: map\lthresholds: ThresholdConfig[]\l}", fillcolor="#FFCC80"];
+    }
+
+    // Enums
+    subgraph cluster_enums {
+        label="Enums";
+        style=filled;
+        fillcolor="#F3E5F5";
+
+        metric_type [label="{MetricType|CPU_PERCENT\lMEMORY_PERCENT\lDISK_PERCENT\lNETWORK_*\lLOAD_AVG_*\l...}", fillcolor="#E1BEE7"];
+
+        health_status [label="{HealthStatus|HEALTHY\lWARNING\lCRITICAL\lUNKNOWN\lOFFLINE\l}", fillcolor="#CE93D8"];
+    }
+
+    // Relationships
+    metrics_svc -> metric_msg [style=dashed];
+    metrics_svc -> machine_state [style=dashed];
+    control_svc -> commands [style=dashed];
+    config_svc -> collector_config [style=dashed];
+    metric_msg -> metric_type [style=dotted];
+    machine_state -> health_status [style=dotted];
+}
diff --git a/docs/architecture/04-grpc-services.svg b/docs/architecture/04-grpc-services.svg
new file mode 100644
index 0000000..d4af478
--- /dev/null
+++ b/docs/architecture/04-grpc-services.svg
@@ -0,0 +1,171 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
+ "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
+<!-- Generated by graphviz version 14.1.1 (0)
+ -->
+<!-- Title: GrpcServices Pages: 1 -->
+<svg width="1030pt" height="486pt"
+ viewBox="0.00 0.00 1030.00 486.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 482.25)">
+<title>GrpcServices</title>
+<polygon fill="white" stroke="none" points="-4,4 -4,-482.25 1026.25,-482.25 1026.25,4 -4,4"/>
+<text xml:space="preserve" text-anchor="middle" x="511.12" y="-460.95" font-family="Helvetica,sans-Serif" font-size="14.00">gRPC Service Definitions</text>
+<g id="clust1" class="cluster">
+<title>cluster_metrics</title>
+<polygon fill="#e8f5e9" stroke="black" points="21.5,-8 21.5,-239 726.75,-239 726.75,-8 21.5,-8"/>
+<text xml:space="preserve" text-anchor="middle" x="374.12" y="-221.7" font-family="Helvetica,sans-Serif" font-size="14.00">MetricsService</text>
+</g>
+<g id="clust2" class="cluster">
+<title>cluster_control</title>
+<polygon fill="#e3f2fd" stroke="black" points="23.38,-247 23.38,-336 799.25,-336 799.25,-247 23.38,-247"/>
+<text xml:space="preserve" text-anchor="middle" x="411.31" y="-318.7" font-family="Helvetica,sans-Serif" font-size="14.00">ControlService</text>
+</g>
+<g id="clust3" class="cluster">
+<title>cluster_config</title>
+<polygon fill="#fff3e0" stroke="black" points="8,-344 8,-445 753,-445 753,-344 8,-344"/>
+<text xml:space="preserve" text-anchor="middle" x="380.5" y="-427.7" font-family="Helvetica,sans-Serif" font-size="14.00">ConfigService</text>
+</g>
+<g id="clust4" class="cluster">
+<title>cluster_enums</title>
+<polygon fill="#f3e5f5" stroke="black" points="819.25,-11 819.25,-229 1014.25,-229 1014.25,-11 819.25,-11"/>
+<text xml:space="preserve" text-anchor="middle" x="916.75" y="-211.7" font-family="Helvetica,sans-Serif" font-size="14.00">Enums</text>
+</g>
+<!-- metrics_svc -->
+<g id="node1" class="node">
+<title>metrics_svc</title>
+<polygon fill="#c8e6c9" stroke="black" points="29.5,-87.88 29.5,-134.12 377.25,-134.12 377.25,-87.88 29.5,-87.88"/>
+<text xml:space="preserve" text-anchor="middle" x="73.5" y="-107.88" font-family="Helvetica,sans-Serif" font-size="10.00">MetricsService</text>
+<polyline fill="none" stroke="black" points="117.5,-87.88 117.5,-134.12"/>
+<text xml:space="preserve" text-anchor="start" x="125.5" y="-120.62" font-family="Helvetica,sans-Serif" font-size="10.00">+ StreamMetrics(stream Metric) → StreamAck</text>
+<text xml:space="preserve" text-anchor="start" x="125.5" y="-107.88" font-family="Helvetica,sans-Serif" font-size="10.00">+ GetCurrentState(StateRequest) → MachineState</text>
+<text xml:space="preserve" text-anchor="start" x="125.5" y="-95.12" font-family="Helvetica,sans-Serif" font-size="10.00">+ GetAllStates(Empty) → AllMachinesState</text>
+</g>
+<!-- metric_msg -->
+<g id="node2" class="node">
+<title>metric_msg</title>
+<polygon fill="#a5d6a7" stroke="black" points="525.5,-16.75 525.5,-101.25 692.5,-101.25 692.5,-16.75 525.5,-16.75"/>
+<text xml:space="preserve" text-anchor="middle" x="548.88" y="-55.88" font-family="Helvetica,sans-Serif" font-size="10.00">Metric</text>
+<polyline fill="none" stroke="black" points="572.25,-16.75 572.25,-101.25"/>
+<text xml:space="preserve" text-anchor="start" x="580.25" y="-87.75" font-family="Helvetica,sans-Serif" font-size="10.00">machine_id: string</text>
+<text xml:space="preserve" text-anchor="start" x="580.25" y="-75" font-family="Helvetica,sans-Serif" font-size="10.00">hostname: string</text>
+<text xml:space="preserve" text-anchor="start" x="580.25" y="-62.25" font-family="Helvetica,sans-Serif" font-size="10.00">timestamp_ms: int64</text>
+<text xml:space="preserve" text-anchor="start" x="580.25" y="-49.5" font-family="Helvetica,sans-Serif" font-size="10.00">type: MetricType</text>
+<text xml:space="preserve" text-anchor="start" x="580.25" y="-36.75" font-family="Helvetica,sans-Serif" font-size="10.00">value: double</text>
+<text xml:space="preserve" text-anchor="start" x="580.25" y="-24" font-family="Helvetica,sans-Serif" font-size="10.00">labels: map</text>
+</g>
+<!-- metrics_svc&#45;&gt;metric_msg -->
+<g id="edge1" class="edge">
+<title>metrics_svc&#45;&gt;metric_msg</title>
+<path fill="none" stroke="black" stroke-dasharray="5,2" d="M377.6,-88.68C424.41,-82.65 473.31,-76.35 513.96,-71.12"/>
+<polygon fill="black" stroke="black" points="514.22,-74.61 523.69,-69.86 513.33,-67.67 514.22,-74.61"/>
+</g>
+<!-- machine_state -->
+<g id="node3" class="node">
+<title>machine_state</title>
+<polygon fill="#a5d6a7" stroke="black" points="499.25,-120.75 499.25,-205.25 718.75,-205.25 718.75,-120.75 499.25,-120.75"/>
+<text xml:space="preserve" text-anchor="middle" x="540.62" y="-159.88" font-family="Helvetica,sans-Serif" font-size="10.00">MachineState</text>
+<polyline fill="none" stroke="black" points="582,-120.75 582,-205.25"/>
+<text xml:space="preserve" text-anchor="start" x="590" y="-191.75" font-family="Helvetica,sans-Serif" font-size="10.00">machine_id: string</text>
+<text xml:space="preserve" text-anchor="start" x="590" y="-179" font-family="Helvetica,sans-Serif" font-size="10.00">hostname: string</text>
+<text xml:space="preserve" text-anchor="start" x="590" y="-166.25" font-family="Helvetica,sans-Serif" font-size="10.00">last_seen_ms: int64</text>
+<text xml:space="preserve" text-anchor="start" x="590" y="-153.5" font-family="Helvetica,sans-Serif" font-size="10.00">current_metrics: Metric[]</text>
+<text xml:space="preserve" text-anchor="start" x="590" y="-140.75" font-family="Helvetica,sans-Serif" font-size="10.00">health: HealthStatus</text>
+<text xml:space="preserve" text-anchor="start" x="590" y="-128" font-family="Helvetica,sans-Serif" font-size="10.00">metadata: map</text>
+</g>
+<!-- metrics_svc&#45;&gt;machine_state -->
+<g id="edge2" class="edge">
+<title>metrics_svc&#45;&gt;machine_state</title>
+<path fill="none" stroke="black" stroke-dasharray="5,2" d="M377.6,-133.32C414.74,-138.1 453.2,-143.06 487.8,-147.51"/>
+<polygon fill="black" stroke="black" points="487.03,-150.94 497.4,-148.75 487.93,-144 487.03,-150.94"/>
+</g>
+<!-- metric_type -->
+<g id="node8" class="node">
+<title>metric_type</title>
+<polygon fill="#e1bee7" stroke="black" points="827.25,-19.75 827.25,-104.25 1006.25,-104.25 1006.25,-19.75 827.25,-19.75"/>
+<text xml:space="preserve" text-anchor="middle" x="861.88" y="-58.88" font-family="Helvetica,sans-Serif" font-size="10.00">MetricType</text>
+<polyline fill="none" stroke="black" points="896.5,-19.75 896.5,-104.25"/>
+<text xml:space="preserve" text-anchor="start" x="904.5" y="-90.75" font-family="Helvetica,sans-Serif" font-size="10.00">CPU_PERCENT</text>
+<text xml:space="preserve" text-anchor="start" x="904.5" y="-78" font-family="Helvetica,sans-Serif" font-size="10.00">MEMORY_PERCENT</text>
+<text xml:space="preserve" text-anchor="start" x="904.5" y="-65.25" font-family="Helvetica,sans-Serif" font-size="10.00">DISK_PERCENT</text>
+<text xml:space="preserve" text-anchor="start" x="904.5" y="-52.5" font-family="Helvetica,sans-Serif" font-size="10.00">NETWORK_*</text>
+<text xml:space="preserve" text-anchor="start" x="904.5" y="-39.75" font-family="Helvetica,sans-Serif" font-size="10.00">LOAD_AVG_*</text>
+<text xml:space="preserve" text-anchor="middle" x="951.38" y="-27" font-family="Helvetica,sans-Serif" font-size="10.00">...</text>
+</g>
+<!-- metric_msg&#45;&gt;metric_type -->
+<g id="edge5" class="edge">
+<title>metric_msg&#45;&gt;metric_type</title>
+<path fill="none" stroke="black" stroke-dasharray="1,5" d="M692.74,-59.81C730.57,-60.18 775.71,-60.63 815.45,-61.02"/>
+<polygon fill="black" stroke="black" points="815.23,-64.51 825.27,-61.11 815.3,-57.51 815.23,-64.51"/>
+</g>
+<!-- health_status -->
+<g id="node9" class="node">
+<title>health_status</title>
+<polygon fill="#ce93d8" stroke="black" points="842.25,-123.12 842.25,-194.88 991.25,-194.88 991.25,-123.12 842.25,-123.12"/>
+<text xml:space="preserve" text-anchor="middle" x="881.75" y="-155.88" font-family="Helvetica,sans-Serif" font-size="10.00">HealthStatus</text>
+<polyline fill="none" stroke="black" points="921.25,-123.12 921.25,-194.88"/>
+<text xml:space="preserve" text-anchor="start" x="929.25" y="-181.38" font-family="Helvetica,sans-Serif" font-size="10.00">HEALTHY</text>
+<text xml:space="preserve" text-anchor="start" x="929.25" y="-168.62" font-family="Helvetica,sans-Serif" font-size="10.00">WARNING</text>
+<text xml:space="preserve" text-anchor="start" x="929.25" y="-155.88" font-family="Helvetica,sans-Serif" font-size="10.00">CRITICAL</text>
+<text xml:space="preserve" text-anchor="start" x="929.25" y="-143.12" font-family="Helvetica,sans-Serif" font-size="10.00">UNKNOWN</text>
+<text xml:space="preserve" text-anchor="start" x="929.25" y="-130.38" font-family="Helvetica,sans-Serif" font-size="10.00">OFFLINE</text>
+</g>
+<!-- machine_state&#45;&gt;health_status -->
+<g id="edge6" class="edge">
+<title>machine_state&#45;&gt;health_status</title>
+<path fill="none" stroke="black" stroke-dasharray="1,5" d="M719.09,-161.57C755.76,-161.09 796.1,-160.57 830.65,-160.11"/>
+<polygon fill="black" stroke="black" points="830.67,-163.61 840.62,-159.98 830.58,-156.61 830.67,-163.61"/>
+</g>
+<!-- control_svc -->
+<g id="node4" class="node">
+<title>control_svc</title>
+<polygon fill="#90caf9" stroke="black" points="31.38,-261 31.38,-297 375.38,-297 375.38,-261 31.38,-261"/>
+<text xml:space="preserve" text-anchor="middle" x="75" y="-276" font-family="Helvetica,sans-Serif" font-size="10.00">ControlService</text>
+<polyline fill="none" stroke="black" points="118.62,-261.25 118.62,-297"/>
+<text xml:space="preserve" text-anchor="start" x="126.62" y="-276" font-family="Helvetica,sans-Serif" font-size="10.00">+ Control(stream Command) → stream Response</text>
+</g>
+<!-- commands -->
+<g id="node5" class="node">
+<title>commands</title>
+<polygon fill="#64b5f6" stroke="black" points="426.75,-255.88 426.75,-302.12 791.25,-302.12 791.25,-255.88 426.75,-255.88"/>
+<text xml:space="preserve" text-anchor="middle" x="477.5" y="-275.88" font-family="Helvetica,sans-Serif" font-size="10.00">ControlCommand</text>
+<polyline fill="none" stroke="black" points="528.25,-255.88 528.25,-302.12"/>
+<text xml:space="preserve" text-anchor="start" x="536.25" y="-275.88" font-family="Helvetica,sans-Serif" font-size="10.00">command_id: string</text>
+<polyline fill="none" stroke="black" points="641,-255.88 641,-302.12"/>
+<text xml:space="preserve" text-anchor="start" x="649" y="-288.62" font-family="Helvetica,sans-Serif" font-size="10.00">UpdateIntervalCommand</text>
+<text xml:space="preserve" text-anchor="start" x="649" y="-275.88" font-family="Helvetica,sans-Serif" font-size="10.00">RestartCollectionCommand</text>
+<text xml:space="preserve" text-anchor="start" x="649" y="-263.12" font-family="Helvetica,sans-Serif" font-size="10.00">ShutdownCommand</text>
+</g>
+<!-- control_svc&#45;&gt;commands -->
+<g id="edge3" class="edge">
+<title>control_svc&#45;&gt;commands</title>
+<path fill="none" stroke="black" stroke-dasharray="5,2" d="M375.84,-279C388.79,-279 401.92,-279 414.99,-279"/>
+<polygon fill="black" stroke="black" points="414.95,-282.5 424.95,-279 414.95,-275.5 414.95,-282.5"/>
+</g>
+<!-- config_svc -->
+<g id="node6" class="node">
+<title>config_svc</title>
+<polygon fill="#ffe0b2" stroke="black" points="16,-364 16,-400 390.75,-400 390.75,-364 16,-364"/>
+<text xml:space="preserve" text-anchor="middle" x="57.38" y="-379.12" font-family="Helvetica,sans-Serif" font-size="10.00">ConfigService</text>
+<polyline fill="none" stroke="black" points="98.75,-364.5 98.75,-400"/>
+<text xml:space="preserve" text-anchor="start" x="106.75" y="-385.5" font-family="Helvetica,sans-Serif" font-size="10.00">+ GetConfig(ConfigRequest) → CollectorConfig</text>
+<text xml:space="preserve" text-anchor="start" x="106.75" y="-372.75" font-family="Helvetica,sans-Serif" font-size="10.00">+ WatchConfig(ConfigRequest) → stream CollectorConfig</text>
+</g>
+<!-- collector_config -->
+<g id="node7" class="node">
+<title>collector_config</title>
+<polygon fill="#ffcc80" stroke="black" points="473,-352.5 473,-411.5 745,-411.5 745,-352.5 473,-352.5"/>
+<text xml:space="preserve" text-anchor="middle" x="518.12" y="-378.88" font-family="Helvetica,sans-Serif" font-size="10.00">CollectorConfig</text>
+<polyline fill="none" stroke="black" points="563.25,-352.5 563.25,-411.5"/>
+<text xml:space="preserve" text-anchor="start" x="571.25" y="-398" font-family="Helvetica,sans-Serif" font-size="10.00">collection_interval_seconds: int32</text>
+<text xml:space="preserve" text-anchor="start" x="571.25" y="-385.25" font-family="Helvetica,sans-Serif" font-size="10.00">enabled_metrics: MetricType[]</text>
+<text xml:space="preserve" text-anchor="start" x="571.25" y="-372.5" font-family="Helvetica,sans-Serif" font-size="10.00">labels: map</text>
+<text xml:space="preserve" text-anchor="start" x="571.25" y="-359.75" font-family="Helvetica,sans-Serif" font-size="10.00">thresholds: ThresholdConfig[]</text>
+</g>
+<!-- config_svc&#45;&gt;collector_config -->
+<g id="edge4" class="edge">
+<title>config_svc&#45;&gt;collector_config</title>
+<path fill="none" stroke="black" stroke-dasharray="5,2" d="M391.12,-382C414.61,-382 438.36,-382 461.11,-382"/>
+<polygon fill="black" stroke="black" points="461.03,-385.5 471.03,-382 461.03,-378.5 461.03,-385.5"/>
+</g>
+</g>
+</svg>
diff --git a/docs/architecture/graph.html b/docs/architecture/graph.html
new file mode 100644
index 0000000..8edb4cf
--- /dev/null
+++ b/docs/architecture/graph.html
@@ -0,0 +1,120 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Graph Viewer - System Monitor</title>
+    <link rel="stylesheet" href="styles.css">
+</head>
+<body class="graph-viewer">
+    <header class="graph-header">
+        <a href="index.html" class="back-link">← Index</a>
+        <div class="nav-controls">
+            <button onclick="navigate(-1)" id="btn-prev" title="Previous (←)">◀</button>
+            <span id="nav-position">1 / 4</span>
+            <button onclick="navigate(1)" id="btn-next" title="Next (→)">▶</button>
+        </div>
+        <h1 id="graph-title">Loading...</h1>
+        <div class="graph-controls">
+            <button onclick="setMode('fit')">Fit</button>
+            <button onclick="setMode('fit-width')">Width</button>
+            <button onclick="setMode('fit-height')">Height</button>
+            <button onclick="setMode('actual-size')">100%</button>
+            <button onclick="downloadSvg()">↓ SVG</button>
+        </div>
+    </header>
+
+    <div class="graph-container" id="graph-container">
+        <img id="graph-img" src="" alt="Graph">
+    </div>
+
+    <script>
+        const graphOrder = [ // Graph keys in the order used by prev/next navigation.
+            '01-system-overview',
+            '02-data-flow',
+            '03-deployment',
+            '04-grpc-services'
+        ];
+        // Heading text and SVG file (relative to this page) for each graph key listed in graphOrder.
+        const graphs = {
+            '01-system-overview': {
+                title: 'System Overview',
+                file: '01-system-overview.svg'
+            },
+            '02-data-flow': {
+                title: 'Data Flow Pipeline',
+                file: '02-data-flow.svg'
+            },
+            '03-deployment': {
+                title: 'Deployment Architecture',
+                file: '03-deployment.svg'
+            },
+            '04-grpc-services': {
+                title: 'gRPC Service Definitions',
+                file: '04-grpc-services.svg'
+            }
+        };
+        // Initial selection comes from the ?g= query parameter; a missing parameter selects the first graph.
+        const params = new URLSearchParams(window.location.search);
+        let graphKey = params.get('g') || '01-system-overview'; // key of the graph being shown; reassigned by loadGraph
+        let currentIndex = graphOrder.indexOf(graphKey); // position of graphKey in graphOrder; reassigned by navigate
+        if (currentIndex === -1) currentIndex = 0; // unrecognised ?g= value: fall back to the first graph
+
+        function loadGraph(key) {
+            const graph = graphs[key];
+            document.getElementById('graph-title').textContent = graph.title;
+            document.getElementById('graph-img').src = graph.file;
+            document.title = graph.title + ' - System Monitor';
+            history.replaceState(null, '', '?g=' + key);
+            graphKey = key;
+            updateNavHints();
+        }
+
+        function updateNavHints() {
+            const idx = graphOrder.indexOf(graphKey);
+            const prevBtn = document.getElementById('btn-prev');
+            const nextBtn = document.getElementById('btn-next');
+            prevBtn.disabled = idx === 0;
+            nextBtn.disabled = idx === graphOrder.length - 1;
+            document.getElementById('nav-position').textContent = (idx + 1) + ' / ' + graphOrder.length;
+        }
+
+        function navigate(direction) {
+            const idx = graphOrder.indexOf(graphKey);
+            const newIdx = idx + direction;
+            if (newIdx >= 0 && newIdx < graphOrder.length) {
+                currentIndex = newIdx;
+                loadGraph(graphOrder[newIdx]);
+            }
+        }
+
+        function setMode(mode) {
+            const container = document.getElementById('graph-container');
+            container.className = 'graph-container ' + mode;
+        }
+
+        function downloadSvg() {
+            const graph = graphs[graphKey];
+            const link = document.createElement('a');
+            link.href = graph.file;
+            link.download = graph.file;
+            link.click();
+        }
+
+        // Keyboard navigation
+        document.addEventListener('keydown', (e) => {
+            if (e.key === 'ArrowLeft') {
+                navigate(-1);
+            } else if (e.key === 'ArrowRight') {
+                navigate(1);
+            } else if (e.key === 'Escape') {
+                window.location.href = 'index.html';
+            }
+        });
+
+        // Initialize: show the graph at currentIndex (resolved from the URL above) and start in 'fit' sizing mode.
+        loadGraph(graphOrder[currentIndex]);
+        setMode('fit');
+    </script>
+</body>
+</html>
diff --git a/docs/architecture/index.html b/docs/architecture/index.html
new file mode 100644
index 0000000..bcc92dd
--- /dev/null
+++ b/docs/architecture/index.html
@@ -0,0 +1,207 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>System Monitor - Architecture Documentation</title>
+    <link rel="stylesheet" href="styles.css">
+</head>
+<body>
+    <header>
+        <h1>System Monitoring Platform</h1>
+        <p class="subtitle">Architecture &amp; Design Documentation</p>
+    </header>
+
+    <main>
+        <section class="graph-section" id="overview">
+            <div class="graph-header-row">
+                <h2>System Overview</h2>
+                <a href="graph.html?g=01-system-overview" class="view-btn">View Full</a>
+            </div>
+            <a href="graph.html?g=01-system-overview" class="graph-preview">
+                <img src="01-system-overview.svg" alt="System Overview">
+            </a>
+            <div class="graph-details">
+                <p>High-level architecture showing all services, data stores, and communication patterns.</p>
+                <h4>Key Components</h4>
+                <ul>
+                    <li><strong>Collector</strong>: Runs on each monitored machine, streams metrics via gRPC</li>
+                    <li><strong>Aggregator</strong>: Central gRPC server, receives streams, normalizes data</li>
+                    <li><strong>Gateway</strong>: FastAPI service, WebSocket for browser, REST for queries</li>
+                    <li><strong>Alerts</strong>: Subscribes to events, evaluates thresholds, triggers actions</li>
+                </ul>
+            </div>
+        </section>
+
+        <section class="graph-section" id="data-flow">
+            <div class="graph-header-row">
+                <h2>Data Flow Pipeline</h2>
+                <a href="graph.html?g=02-data-flow" class="view-btn">View Full</a>
+            </div>
+            <a href="graph.html?g=02-data-flow" class="graph-preview">
+                <img src="02-data-flow.svg" alt="Data Flow">
+            </a>
+            <div class="graph-details">
+                <p>How metrics flow from collection through storage with different retention tiers.</p>
+                <h4>Storage Tiers</h4>
+                <table class="details-table">
+                    <thead>
+                        <tr><th>Tier</th><th>Resolution</th><th>Retention</th><th>Use Case</th></tr>
+                    </thead>
+                    <tbody>
+                        <tr>
+                            <td>Hot (Redis)</td>
+                            <td>5s</td>
+                            <td>5 min</td>
+                            <td>Current state, live dashboard</td>
+                        </tr>
+                        <tr>
+                            <td>Raw (TimescaleDB)</td>
+                            <td>5s</td>
+                            <td>24h</td>
+                            <td>Recent detailed analysis</td>
+                        </tr>
+                        <tr>
+                            <td>1-min Aggregates</td>
+                            <td>1m</td>
+                            <td>7d</td>
+                            <td>Week view, trends</td>
+                        </tr>
+                        <tr>
+                            <td>1-hour Aggregates</td>
+                            <td>1h</td>
+                            <td>90d</td>
+                            <td>Long-term analysis</td>
+                        </tr>
+                    </tbody>
+                </table>
+            </div>
+        </section>
+
+        <section class="graph-section" id="deployment">
+            <div class="graph-header-row">
+                <h2>Deployment Architecture</h2>
+                <a href="graph.html?g=03-deployment" class="view-btn">View Full</a>
+            </div>
+            <a href="graph.html?g=03-deployment" class="graph-preview">
+                <img src="03-deployment.svg" alt="Deployment">
+            </a>
+            <div class="graph-details">
+                <p>Deployment options from local development to AWS production.</p>
+                <h4>Environments</h4>
+                <ul>
+                    <li><strong>Local Dev</strong>: Kind + Tilt for K8s, or Docker Compose</li>
+                    <li><strong>Demo (EC2)</strong>: Docker Compose on t2.small at sysmonstm.mcrn.ar</li>
+                    <li><strong>Lambda Pipeline</strong>: SQS-triggered aggregation for data processing experience</li>
+                </ul>
+            </div>
+        </section>
+
+        <section class="graph-section" id="grpc">
+            <div class="graph-header-row">
+                <h2>gRPC Service Definitions</h2>
+                <a href="graph.html?g=04-grpc-services" class="view-btn">View Full</a>
+            </div>
+            <a href="graph.html?g=04-grpc-services" class="graph-preview">
+                <img src="04-grpc-services.svg" alt="gRPC Services">
+            </a>
+            <div class="graph-details">
+                <p>Protocol Buffer service and message definitions.</p>
+                <h4>Services</h4>
+                <ul>
+                    <li><strong>MetricsService</strong>: Client-side streaming for metrics ingestion</li>
+                    <li><strong>ControlService</strong>: Bidirectional streaming for collector control</li>
+                    <li><strong>ConfigService</strong>: Server-side streaming for config updates</li>
+                </ul>
+            </div>
+        </section>
+
+        <section class="findings-section">
+            <h2>Interview Talking Points</h2>
+            <div class="findings-grid">
+                <article class="finding-card">
+                    <h3>Domain Mapping</h3>
+                    <ul>
+                        <li>Machine = Payment Processor</li>
+                        <li>Metrics Stream = Transaction Stream</li>
+                        <li>Thresholds = Fraud Detection</li>
+                        <li>Aggregator = Payment Hub</li>
+                    </ul>
+                </article>
+                <article class="finding-card">
+                    <h3>gRPC Patterns</h3>
+                    <ul>
+                        <li>Client streaming (metrics)</li>
+                        <li>Server streaming (config)</li>
+                        <li>Bidirectional (control)</li>
+                        <li>Health checking</li>
+                    </ul>
+                </article>
+                <article class="finding-card">
+                    <h3>Event-Driven</h3>
+                    <ul>
+                        <li>Redis Pub/Sub (current)</li>
+                        <li>Abstraction for Kafka switch</li>
+                        <li>Decoupled alert processing</li>
+                        <li>Real-time WebSocket push</li>
+                    </ul>
+                </article>
+                <article class="finding-card">
+                    <h3>Resilience</h3>
+                    <ul>
+                        <li>Collectors are independent</li>
+                        <li>Graceful degradation</li>
+                        <li>Retry with backoff</li>
+                        <li>Health checks everywhere</li>
+                    </ul>
+                </article>
+            </div>
+        </section>
+
+        <section class="tech-section">
+            <h2>Technology Stack</h2>
+            <div class="tech-grid">
+                <div class="tech-column">
+                    <h3>Core</h3>
+                    <ul>
+                        <li>Python 3.11+</li>
+                        <li>FastAPI</li>
+                        <li>gRPC / protobuf</li>
+                        <li>asyncio</li>
+                    </ul>
+                </div>
+                <div class="tech-column">
+                    <h3>Data</h3>
+                    <ul>
+                        <li>TimescaleDB</li>
+                        <li>Redis</li>
+                        <li>Redis Pub/Sub</li>
+                    </ul>
+                </div>
+                <div class="tech-column">
+                    <h3>Infrastructure</h3>
+                    <ul>
+                        <li>Docker</li>
+                        <li>Kubernetes</li>
+                        <li>Kind + Tilt</li>
+                        <li>Terraform</li>
+                    </ul>
+                </div>
+                <div class="tech-column">
+                    <h3>CI/CD</h3>
+                    <ul>
+                        <li>Woodpecker CI</li>
+                        <li>Kustomize</li>
+                        <li>Container Registry</li>
+                    </ul>
+                </div>
+            </div>
+        </section>
+    </main>
+
+    <footer>
+        <p>System Monitoring Platform - Architecture Documentation</p>
+        <p class="date">Generated: <time datetime="2025-12-29">December 2025</time></p>
+    </footer>
+</body>
+</html>
diff --git a/docs/architecture/styles.css b/docs/architecture/styles.css
new file mode 100644
index 0000000..4f251b9
--- /dev/null
+++ b/docs/architecture/styles.css
@@ -0,0 +1,343 @@
+:root {
+    --bg-primary: #1a1a2e;
+    --bg-secondary: #16213e;
+    --bg-card: #0f3460;
+    --text-primary: #eee;
+    --text-secondary: #a0a0a0;
+    --accent: #e94560;
+    --accent-secondary: #533483;
+    --border: #2a2a4a;
+}
+
+* {
+    box-sizing: border-box;
+    margin: 0;
+    padding: 0;
+}
+
+body {
+    font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
+    background: var(--bg-primary);
+    color: var(--text-primary);
+    line-height: 1.6;
+}
+
+header {
+    background: linear-gradient(135deg, var(--bg-secondary), var(--accent-secondary));
+    padding: 2rem;
+    text-align: center;
+    border-bottom: 2px solid var(--accent);
+}
+
+header h1 {
+    font-size: 2rem;
+    margin-bottom: 0.5rem;
+}
+
+header .subtitle {
+    color: var(--text-secondary);
+    font-size: 1rem;
+}
+
+main {
+    max-width: 1400px;
+    margin: 0 auto;
+    padding: 2rem;
+}
+
+/* Graph sections */
+.graph-section {
+    background: var(--bg-secondary);
+    border-radius: 8px;
+    padding: 1.5rem;
+    margin-bottom: 2rem;
+    border: 1px solid var(--border);
+}
+
+.graph-header-row {
+    display: flex;
+    justify-content: space-between;
+    align-items: center;
+    margin-bottom: 1rem;
+}
+
+.graph-header-row h2 {
+    font-size: 1.25rem;
+    color: var(--accent);
+}
+
+.view-btn {
+    background: var(--accent);
+    color: white;
+    padding: 0.5rem 1rem;
+    border-radius: 4px;
+    text-decoration: none;
+    font-size: 0.875rem;
+    transition: opacity 0.2s;
+}
+
+.view-btn:hover {
+    opacity: 0.8;
+}
+
+.graph-preview {
+    display: block;
+    background: white;
+    border-radius: 4px;
+    padding: 1rem;
+    margin-bottom: 1rem;
+    overflow: auto;
+    max-height: 400px;
+}
+
+.graph-preview img {
+    max-width: 100%;
+    height: auto;
+}
+
+.graph-details {
+    color: var(--text-secondary);
+    font-size: 0.9rem;
+}
+
+.graph-details h4 {
+    color: var(--text-primary);
+    margin: 1rem 0 0.5rem;
+}
+
+.graph-details ul {
+    margin-left: 1.5rem;
+}
+
+.graph-details li {
+    margin-bottom: 0.25rem;
+}
+
+/* Tech section */
+.tech-section {
+    background: var(--bg-secondary);
+    border-radius: 8px;
+    padding: 1.5rem;
+    margin-bottom: 2rem;
+    border: 1px solid var(--border);
+}
+
+.tech-section h2 {
+    color: var(--accent);
+    margin-bottom: 1rem;
+}
+
+.tech-grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
+    gap: 1.5rem;
+}
+
+.tech-column h3 {
+    color: var(--text-primary);
+    font-size: 1rem;
+    margin-bottom: 0.75rem;
+    padding-bottom: 0.5rem;
+    border-bottom: 1px solid var(--border);
+}
+
+.tech-column ul {
+    list-style: none;
+}
+
+.tech-column li {
+    padding: 0.25rem 0;
+    color: var(--text-secondary);
+}
+
+/* Findings */
+.findings-section {
+    margin-bottom: 2rem;
+}
+
+.findings-section h2 {
+    color: var(--accent);
+    margin-bottom: 1rem;
+}
+
+.findings-grid {
+    display: grid;
+    grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
+    gap: 1rem;
+}
+
+.finding-card {
+    background: var(--bg-secondary);
+    border-radius: 8px;
+    padding: 1.25rem;
+    border: 1px solid var(--border);
+}
+
+.finding-card h3 {
+    color: var(--accent);
+    font-size: 1rem;
+    margin-bottom: 0.75rem;
+}
+
+.finding-card ul {
+    margin-left: 1rem;
+    color: var(--text-secondary);
+}
+
+.finding-card code {
+    background: var(--bg-primary);
+    padding: 0.125rem 0.375rem;
+    border-radius: 3px;
+    font-size: 0.85em;
+}
+
+/* Footer */
+footer {
+    text-align: center;
+    padding: 2rem;
+    color: var(--text-secondary);
+    border-top: 1px solid var(--border);
+}
+
+footer .date {
+    font-size: 0.85rem;
+}
+
+/* Graph viewer page */
+body.graph-viewer {
+    display: flex;
+    flex-direction: column;
+    height: 100vh;
+}
+
+.graph-header {
+    display: flex;
+    align-items: center;
+    gap: 1rem;
+    padding: 0.75rem 1rem;
+    background: var(--bg-secondary);
+    border-bottom: 1px solid var(--border);
+    flex-wrap: wrap;
+}
+
+.back-link {
+    color: var(--accent);
+    text-decoration: none;
+}
+
+.nav-controls {
+    display: flex;
+    align-items: center;
+    gap: 0.5rem;
+}
+
+.nav-controls button {
+    background: var(--bg-card);
+    color: var(--text-primary);
+    border: 1px solid var(--border);
+    padding: 0.25rem 0.75rem;
+    border-radius: 4px;
+    cursor: pointer;
+}
+
+.nav-controls button:disabled {
+    opacity: 0.3;
+    cursor: not-allowed;
+}
+
+#nav-position {
+    color: var(--text-secondary);
+    font-size: 0.85rem;
+}
+
+.graph-header h1 {
+    flex: 1;
+    font-size: 1rem;
+    text-align: center;
+}
+
+.graph-controls {
+    display: flex;
+    gap: 0.5rem;
+}
+
+.graph-controls button {
+    background: var(--bg-card);
+    color: var(--text-primary);
+    border: 1px solid var(--border);
+    padding: 0.375rem 0.75rem;
+    border-radius: 4px;
+    cursor: pointer;
+    font-size: 0.85rem;
+}
+
+.graph-controls button:hover {
+    background: var(--accent);
+}
+
+.graph-container {
+    flex: 1;
+    overflow: auto;
+    background: white;
+    display: flex;
+    justify-content: safe center; /* "safe": align to start when content overflows, so a wide diagram's left edge stays scrollable */
+    align-items: flex-start;
+    padding: 1rem;
+}
+
+.graph-container.fit img {
+    max-width: 100%;
+    max-height: calc(100vh - 60px);
+    object-fit: contain;
+}
+
+.graph-container.fit-width img {
+    width: 100%;
+    height: auto;
+}
+
+.graph-container.fit-height img {
+    height: calc(100vh - 60px);
+    width: auto;
+}
+
+.graph-container.actual-size img {
+    /* No constraints */
+}
+
+/* Tables */
+.details-table {
+    width: 100%;
+    border-collapse: collapse;
+    margin: 1rem 0;
+    font-size: 0.85rem;
+}
+
+.details-table th,
+.details-table td {
+    padding: 0.5rem;
+    text-align: left;
+    border-bottom: 1px solid var(--border);
+}
+
+.details-table th {
+    color: var(--text-primary);
+    background: var(--bg-primary);
+}
+
+.details-table td {
+    color: var(--text-secondary);
+}
+
+.details-table code {
+    background: var(--bg-primary);
+    padding: 0.125rem 0.375rem;
+    border-radius: 3px;
+}
+
+.note {
+    font-style: italic;
+    font-size: 0.85rem;
+    color: var(--text-secondary);
+    margin-top: 0.5rem;
+}
diff --git a/infra/aws/lambdas/aggregator/placeholder.txt b/infra/aws/lambdas/aggregator/placeholder.txt
new file mode 100644
index 0000000..48cdce8
--- /dev/null
+++ b/infra/aws/lambdas/aggregator/placeholder.txt
@@ -0,0 +1 @@
+placeholder
diff --git a/infra/aws/lambdas/aggregator/placeholder.zip b/infra/aws/lambdas/aggregator/placeholder.zip
new file mode 100644
index 0000000000000000000000000000000000000000..a26ef866da7506d19df44123c54d9956c8ffc25e
GIT binary patch
literal 192
zcmWIWW@h1H00Fj~xzP%ZI(K-0Y!K#WkYOmuNlZ@7$j?bhEz&EgC<zVWWMJlA8<Yvc
zr4`%^j4Ush85qC>l2Wb!Z$>6LW?aTffNW!61mZ1?AQqBNh$XBLOVBI`@MdKLDQ5)2
KP#_J$3=9Cm5GKL^

literal 0
HcmV?d00001

diff --git a/infra/aws/lambdas/compactor/placeholder.txt b/infra/aws/lambdas/compactor/placeholder.txt
new file mode 100644
index 0000000..48cdce8
--- /dev/null
+++ b/infra/aws/lambdas/compactor/placeholder.txt
@@ -0,0 +1 @@
+placeholder
diff --git a/infra/aws/lambdas/compactor/placeholder.zip b/infra/aws/lambdas/compactor/placeholder.zip
new file mode 100644
index 0000000000000000000000000000000000000000..a26ef866da7506d19df44123c54d9956c8ffc25e
GIT binary patch
literal 192
zcmWIWW@h1H00Fj~xzP%ZI(K-0Y!K#WkYOmuNlZ@7$j?bhEz&EgC<zVWWMJlA8<Yvc
zr4`%^j4Ush85qC>l2Wb!Z$>6LW?aTffNW!61mZ1?AQqBNh$XBLOVBI`@MdKLDQ5)2
KP#_J$3=9Cm5GKL^

literal 0
HcmV?d00001

diff --git a/infra/aws/terraform/ec2.tf b/infra/aws/terraform/ec2.tf
new file mode 100644
index 0000000..e9d2b88
--- /dev/null
+++ b/infra/aws/terraform/ec2.tf
@@ -0,0 +1,148 @@
+# EC2 Instance for Docker Compose deployment
+
+resource "aws_security_group" "sysmonstm" {
+  name_prefix = "${var.project_name}-"
+  description = "Security group for System Monitor Platform"
+
+  # HTTP/HTTPS, reachable from any address
+  ingress {
+    from_port   = 80
+    to_port     = 80
+    protocol    = "tcp"
+    cidr_blocks = ["0.0.0.0/0"]
+    description = "HTTP"
+  }
+
+  ingress {
+    from_port   = 443
+    to_port     = 443
+    protocol    = "tcp"
+    cidr_blocks = ["0.0.0.0/0"]
+    description = "HTTPS"
+  }
+
+  # gRPC for collectors (port 50051, also reachable from any address)
+  ingress {
+    from_port   = 50051
+    to_port     = 50051
+    protocol    = "tcp"
+    cidr_blocks = ["0.0.0.0/0"]
+    description = "gRPC Aggregator"
+  }
+
+  # SSH: this rule is created only when allowed_ssh_cidrs is non-empty
+  dynamic "ingress" {
+    for_each = length(var.allowed_ssh_cidrs) > 0 ? [1] : []
+    content {
+      from_port   = 22
+      to_port     = 22
+      protocol    = "tcp"
+      cidr_blocks = var.allowed_ssh_cidrs
+      description = "SSH"
+    }
+  }
+
+  egress {
+    from_port   = 0
+    to_port     = 0
+    protocol    = "-1" # "-1" = every protocol, hence ports 0-0
+    cidr_blocks = ["0.0.0.0/0"]
+    description = "Allow all outbound"
+  }
+
+  tags = {
+    Name = "${var.project_name}-sg"
+  }
+}
+
+resource "aws_iam_role" "ec2" {
+  name_prefix = "${var.project_name}-ec2-"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = "sts:AssumeRole"
+        Effect = "Allow"
+        Principal = {
+          Service = "ec2.amazonaws.com"
+        }
+      }
+    ]
+  })
+}
+
+resource "aws_iam_role_policy_attachment" "ec2_ssm" {
+  role       = aws_iam_role.ec2.name
+  policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
+}
+
+resource "aws_iam_instance_profile" "ec2" {
+  name_prefix = "${var.project_name}-"
+  role        = aws_iam_role.ec2.name
+}
+
+resource "aws_instance" "sysmonstm" {
+  ami                    = data.aws_ami.amazon_linux_2023.id
+  instance_type          = var.ec2_instance_type
+  key_name               = var.ec2_key_name != "" ? var.ec2_key_name : null # empty string = launch without a key pair
+  vpc_security_group_ids = [aws_security_group.sysmonstm.id]
+  iam_instance_profile   = aws_iam_instance_profile.ec2.name
+
+  root_block_device {
+    volume_size = 20
+    volume_type = "gp3"
+    encrypted   = true
+  }
+
+  user_data = <<-EOF
+    #!/bin/bash
+    set -e
+
+    # Install Docker
+    dnf update -y
+    dnf install -y docker git
+    systemctl enable docker
+    systemctl start docker
+
+    # Install Docker Compose (-f: abort on an HTTP error instead of saving the error page as the binary)
+    curl -fsSL "https://github.com/docker/compose/releases/latest/download/docker-compose-linux-$(uname -m)" \
+      -o /usr/local/bin/docker-compose
+    chmod +x /usr/local/bin/docker-compose
+
+    # Add ec2-user to docker group
+    usermod -aG docker ec2-user
+
+    # Clone and start the application
+    cd /home/ec2-user
+    git clone https://github.com/yourusername/sysmonstm.git || true
+    cd sysmonstm
+
+    # Create .env file
+    cat > .env <<EOL
+    LOG_LEVEL=INFO
+    MACHINE_ID=aws-demo
+    EOL
+
+    # Start services
+    docker-compose up -d
+  EOF
+
+  tags = {
+    Name = "${var.project_name}-server"
+  }
+
+  lifecycle {
+    ignore_changes = [ami] # a newer "most_recent" AMI must not force the instance to be replaced
+  }
+}
+
+# Elastic IP for stable address
+resource "aws_eip" "sysmonstm" {
+  instance = aws_instance.sysmonstm.id
+  domain   = "vpc"
+
+  tags = {
+    Name = "${var.project_name}-eip"
+  }
+}
diff --git a/infra/aws/terraform/lambda.tf b/infra/aws/terraform/lambda.tf
new file mode 100644
index 0000000..258f13f
--- /dev/null
+++ b/infra/aws/terraform/lambda.tf
@@ -0,0 +1,203 @@
+# Lambda Functions for Data Processing Pipeline
+# These are optional and enabled via enable_lambda_pipeline variable
+
+# SQS Queue for buffering metrics
+resource "aws_sqs_queue" "metrics" {
+  count = var.enable_lambda_pipeline ? 1 : 0
+
+  name                       = "${var.project_name}-metrics"
+  visibility_timeout_seconds = var.lambda_timeout * 6 # AWS guidance for Lambda consumers: at least 6x the function timeout
+  message_retention_seconds  = 86400  # 24 hours
+
+  redrive_policy = jsonencode({
+    deadLetterTargetArn = aws_sqs_queue.metrics_dlq[0].arn
+    maxReceiveCount     = 3 # after 3 failed receives the message moves to the DLQ
+  })
+}
+
+resource "aws_sqs_queue" "metrics_dlq" {
+  count = var.enable_lambda_pipeline ? 1 : 0
+
+  name                      = "${var.project_name}-metrics-dlq"
+  message_retention_seconds = 1209600  # 14 days
+}
+
+# S3 Bucket for metric backups
+resource "aws_s3_bucket" "metrics" {
+  count = var.enable_s3_backup ? 1 : 0
+
+  bucket_prefix = "${var.project_name}-metrics-"
+}
+
+resource "aws_s3_bucket_lifecycle_configuration" "metrics" {
+  count  = var.enable_s3_backup ? 1 : 0
+  bucket = aws_s3_bucket.metrics[0].id
+
+  rule {
+    id     = "archive-old-metrics"
+    status = "Enabled"
+
+    filter {} # empty filter = rule applies to every object; the provider expects one of filter/prefix per rule
+
+    transition {
+      days          = 30
+      storage_class = "STANDARD_IA"
+    }
+    transition {
+      days          = 90
+      storage_class = "GLACIER"
+    }
+
+    expiration {
+      days = 365
+    }
+  }
+}
+
+# IAM Role for Lambda
+resource "aws_iam_role" "lambda" {
+  count       = var.enable_lambda_pipeline ? 1 : 0
+  name_prefix = "${var.project_name}-lambda-"
+
+  assume_role_policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Action = "sts:AssumeRole"
+        Effect = "Allow"
+        Principal = {
+          Service = "lambda.amazonaws.com"
+        }
+      }
+    ]
+  })
+}
+
+resource "aws_iam_role_policy" "lambda" {
+  count = var.enable_lambda_pipeline ? 1 : 0
+  name  = "lambda-policy"
+  role  = aws_iam_role.lambda[0].id
+
+  policy = jsonencode({
+    Version = "2012-10-17"
+    Statement = [
+      {
+        Effect = "Allow"
+        Action = [
+          "logs:CreateLogGroup",
+          "logs:CreateLogStream",
+          "logs:PutLogEvents"
+        ]
+        Resource = "arn:aws:logs:*:*:*"
+      },
+      {
+        Effect = "Allow"
+        Action = [
+          "sqs:ReceiveMessage",
+          "sqs:DeleteMessage",
+          "sqs:GetQueueAttributes"
+        ]
+        Resource = aws_sqs_queue.metrics[0].arn
+      },
+      {
+        Effect = "Allow"
+        Action = ["s3:PutObject", "s3:GetObject"]
+        # With the backup bucket disabled there is no bucket ARN to reference.
+        # Fall back to this project's bucket-name prefix instead of "*", which
+        # would let the functions read and write objects in every bucket.
+        Resource = var.enable_s3_backup ? "${aws_s3_bucket.metrics[0].arn}/*" : "arn:aws:s3:::${var.project_name}-metrics-*/*"
+      }
+    ]
+  })
+}
+
+# Lambda function for metric aggregation
+resource "aws_lambda_function" "aggregator" {
+  count = var.enable_lambda_pipeline ? 1 : 0
+
+  function_name = "${var.project_name}-aggregator"
+  role          = aws_iam_role.lambda[0].arn
+  handler       = "main.handler"
+  runtime       = "python3.11"
+  timeout       = var.lambda_timeout
+  memory_size   = var.lambda_memory_size
+
+  # Placeholder - will be deployed via CI/CD
+  filename         = "${path.module}/../lambdas/aggregator/placeholder.zip"
+  source_code_hash = filebase64sha256("${path.module}/../lambdas/aggregator/placeholder.zip")
+
+  environment {
+    variables = {
+      TIMESCALE_HOST = aws_instance.sysmonstm.private_ip # NOTE(review): no vpc_config on this function - confirm it can reach a private IP
+      LOG_LEVEL      = "INFO"
+    }
+  }
+
+  lifecycle {
+    ignore_changes = [filename, source_code_hash] # Terraform creates the function once, then ignores later code changes
+  }
+}
+
+resource "aws_lambda_event_source_mapping" "sqs_trigger" {
+  count = var.enable_lambda_pipeline ? 1 : 0
+
+  event_source_arn                   = aws_sqs_queue.metrics[0].arn
+  function_name                      = aws_lambda_function.aggregator[0].arn
+  batch_size                         = 100
+  maximum_batching_window_in_seconds = 5 # Lambda requires a window of at least 1s when an SQS batch_size exceeds 10
+  scaling_config {
+    maximum_concurrency = 5
+  }
+}
+
+# CloudWatch Event for scheduled compaction
+resource "aws_cloudwatch_event_rule" "compactor" {
+  count = var.enable_lambda_pipeline ? 1 : 0
+
+  name                = "${var.project_name}-compactor-schedule"
+  description         = "Trigger metric compaction every hour"
+  schedule_expression = "rate(1 hour)"
+}
+
+resource "aws_lambda_function" "compactor" {
+  count = var.enable_lambda_pipeline ? 1 : 0
+
+  function_name = "${var.project_name}-compactor"
+  role          = aws_iam_role.lambda[0].arn
+  handler       = "main.handler"
+  runtime       = "python3.11"
+  timeout       = 300 # fixed values here; the aggregator uses var.lambda_timeout / var.lambda_memory_size
+  memory_size   = 512
+
+  filename         = "${path.module}/../lambdas/compactor/placeholder.zip"
+  source_code_hash = filebase64sha256("${path.module}/../lambdas/compactor/placeholder.zip")
+
+  environment {
+    variables = {
+      TIMESCALE_HOST = aws_instance.sysmonstm.private_ip # NOTE(review): no vpc_config on this function - confirm it can reach a private IP
+      S3_BUCKET      = var.enable_s3_backup ? aws_s3_bucket.metrics[0].bucket : "" # empty string when S3 backup is disabled
+      LOG_LEVEL      = "INFO"
+    }
+  }
+
+  lifecycle {
+    ignore_changes = [filename, source_code_hash] # Terraform creates the function once, then ignores later code changes
+  }
+}
+
+resource "aws_cloudwatch_event_target" "compactor" {
+  count = var.enable_lambda_pipeline ? 1 : 0
+
+  rule      = aws_cloudwatch_event_rule.compactor[0].name
+  target_id = "compactor-lambda"
+  arn       = aws_lambda_function.compactor[0].arn
+}
+
+resource "aws_lambda_permission" "compactor_cloudwatch" {
+  count = var.enable_lambda_pipeline ? 1 : 0
+
+  statement_id  = "AllowCloudWatchInvoke"
+  action        = "lambda:InvokeFunction"
+  function_name = aws_lambda_function.compactor[0].function_name
+  principal     = "events.amazonaws.com"
+  source_arn    = aws_cloudwatch_event_rule.compactor[0].arn
+}
diff --git a/infra/aws/terraform/main.tf b/infra/aws/terraform/main.tf
new file mode 100644
index 0000000..4856bd7
--- /dev/null
+++ b/infra/aws/terraform/main.tf
@@ -0,0 +1,58 @@
+# System Monitor Platform - AWS Infrastructure
+#
+# This Terraform configuration sets up:
+# - EC2 instance for running Docker Compose (demo/staging)
+# - Lambda functions for data processing pipeline
+# - SQS queue for buffering metrics
+# - S3 bucket for metric backups
+# - Security groups and IAM roles
+
+terraform {
+  required_version = ">= 1.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+
+  # Uncomment for remote state
+  # backend "s3" {
+  #   bucket = "your-terraform-state-bucket"
+  #   key    = "sysmonstm/terraform.tfstate"
+  #   region = "us-east-1"
+  # }
+}
+
+provider "aws" {
+  region = var.aws_region
+
+  default_tags {
+    tags = {
+      Project     = "sysmonstm"
+      Environment = var.environment
+      ManagedBy   = "terraform"
+    }
+  }
+}
+
+# Data sources
+data "aws_availability_zones" "available" { # NOTE(review): not referenced by any .tf file in this change - confirm it is needed
+  state = "available"
+}
+
+data "aws_ami" "amazon_linux_2023" {
+  most_recent = true
+  owners      = ["amazon"]
+
+  filter {
+    name   = "name"
+    values = ["al2023-ami-2023.*-x86_64"] # "2023." prefix keeps variant families (al2023-ami-minimal-*, al2023-ami-ecs-*) from matching
+  }
+
+  filter {
+    name   = "virtualization-type"
+    values = ["hvm"]
+  }
+}
diff --git a/infra/aws/terraform/outputs.tf b/infra/aws/terraform/outputs.tf
new file mode 100644
index 0000000..a49afb8
--- /dev/null
+++ b/infra/aws/terraform/outputs.tf
@@ -0,0 +1,36 @@
+# Outputs
+
+output "ec2_public_ip" {
+  description = "Public IP of the EC2 instance"
+  value       = aws_eip.sysmonstm.public_ip
+}
+
+output "ec2_instance_id" {
+  description = "EC2 instance ID"
+  value       = aws_instance.sysmonstm.id
+}
+
+output "dashboard_url" {
+  description = "URL for the monitoring dashboard"
+  value       = "http://${aws_eip.sysmonstm.public_ip}:8000" # NOTE(review): the security group in ec2.tf opens 80/443/50051 (+22), not 8000 - confirm this URL is reachable
+}
+
+output "grpc_endpoint" {
+  description = "gRPC endpoint for collectors"
+  value       = "${aws_eip.sysmonstm.public_ip}:50051"
+}
+
+output "sqs_queue_url" {
+  description = "SQS queue URL for metrics"
+  value       = var.enable_lambda_pipeline ? aws_sqs_queue.metrics[0].url : null
+}
+
+output "s3_bucket" {
+  description = "S3 bucket for metric backups"
+  value       = var.enable_s3_backup ? aws_s3_bucket.metrics[0].bucket : null
+}
+
+output "ssh_command" {
+  description = "SSH command to connect to the instance"
+  value       = var.ec2_key_name != "" ? "ssh -i ${var.ec2_key_name}.pem ec2-user@${aws_eip.sysmonstm.public_ip}" : "Use SSM Session Manager"
+}
diff --git a/infra/aws/terraform/terraform.tfvars.example b/infra/aws/terraform/terraform.tfvars.example
new file mode 100644
index 0000000..4f280f7
--- /dev/null
+++ b/infra/aws/terraform/terraform.tfvars.example
@@ -0,0 +1,16 @@
+# Example Terraform variables
+# Copy to terraform.tfvars and fill in your values
+
+aws_region    = "us-east-1"
+environment   = "staging"
+project_name  = "sysmonstm"
+domain_name   = "sysmonstm.mcrn.ar"
+
+# EC2
+ec2_instance_type = "t2.small"
+ec2_key_name      = "your-key-pair-name"
+allowed_ssh_cidrs = ["YOUR.IP.ADDRESS/32"]
+
+# Feature flags
+enable_lambda_pipeline = false
+enable_s3_backup       = false
diff --git a/infra/aws/terraform/variables.tf b/infra/aws/terraform/variables.tf
new file mode 100644
index 0000000..310a234
--- /dev/null
+++ b/infra/aws/terraform/variables.tf
@@ -0,0 +1,70 @@
+# Variables for System Monitor Platform
+
+variable "aws_region" {
+  description = "AWS region to deploy to"
+  type        = string
+  default     = "us-east-1"
+}
+
+variable "environment" {
+  description = "Environment name (dev, staging, prod)"
+  type        = string
+  default     = "staging"
+}
+
+variable "project_name" {
+  description = "Project name for resource naming"
+  type        = string
+  default     = "sysmonstm"
+}
+
+variable "domain_name" { # NOTE(review): not referenced by any resource or output in this change - confirm intended use
+  description = "Domain name for the service"
+  type        = string
+  default     = "sysmonstm.mcrn.ar"
+}
+
+# EC2 Configuration
+variable "ec2_instance_type" {
+  description = "EC2 instance type"
+  type        = string
+  default     = "t2.small"
+}
+
+variable "ec2_key_name" {
+  description = "SSH key pair name"
+  type        = string
+  default     = ""
+}
+
+variable "allowed_ssh_cidrs" {
+  description = "CIDR blocks allowed to SSH"
+  type        = list(string)
+  default     = []  # Set to your IP for security
+}
+
+# Lambda Configuration
+variable "lambda_memory_size" {
+  description = "Lambda function memory in MB"
+  type        = number
+  default     = 256
+}
+
+variable "lambda_timeout" {
+  description = "Lambda function timeout in seconds"
+  type        = number
+  default     = 60
+}
+
+# Feature flags
+variable "enable_lambda_pipeline" {
+  description = "Enable Lambda data processing pipeline"
+  type        = bool
+  default     = false
+}
+
+variable "enable_s3_backup" {
+  description = "Enable S3 backup for metrics"
+  type        = bool
+  default     = false
+}
diff --git a/k8s/base/aggregator/configmap.yaml b/k8s/base/aggregator/configmap.yaml
new file mode 100644
index 0000000..1cea721
--- /dev/null
+++ b/k8s/base/aggregator/configmap.yaml
@@ -0,0 +1,15 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: aggregator-config
+data:
+  REDIS_URL: "redis://redis:6379"
+  TIMESCALE_HOST: "timescaledb"
+  TIMESCALE_PORT: "5432"
+  TIMESCALE_USER: "monitor"
+  TIMESCALE_DB: "monitor"
+  GRPC_PORT: "50051"
+  SERVICE_NAME: "aggregator"
+  EVENTS_BACKEND: "redis_pubsub"
+  LOG_LEVEL: "INFO"
+  LOG_FORMAT: "json"
diff --git a/k8s/base/aggregator/deployment.yaml b/k8s/base/aggregator/deployment.yaml
new file mode 100644
index 0000000..566e716
--- /dev/null
+++ b/k8s/base/aggregator/deployment.yaml
@@ -0,0 +1,46 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: aggregator
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: aggregator
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: aggregator
+    spec:
+      containers:
+        - name: aggregator
+          image: sysmonstm/aggregator:latest  # overlays/local rewrites this to sysmonstm-aggregator:dev
+          ports:
+            - containerPort: 50051
+              name: grpc  # referenced by name from aggregator/service.yaml (targetPort: grpc)
+          envFrom:
+            - configMapRef:
+                name: aggregator-config
+          env:
+            - name: TIMESCALE_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: timescaledb-secret  # not part of base; an overlay must provide it (see overlays/local/secrets.yaml)
+                  key: password
+          resources:  # overlays/local/patches/reduce-resources.yaml lowers these for local clusters
+            requests:
+              memory: "128Mi"
+              cpu: "100m"
+            limits:
+              memory: "256Mi"
+              cpu: "500m"
+          livenessProbe:
+            exec:
+              command: ["/bin/grpc_health_probe", "-addr=:50051"]  # binary is installed by services/aggregator/Dockerfile
+            initialDelaySeconds: 10
+            periodSeconds: 10
+          readinessProbe:
+            exec:
+              command: ["/bin/grpc_health_probe", "-addr=:50051"]  # same check as liveness, polled more often
+            initialDelaySeconds: 5
+            periodSeconds: 5
diff --git a/k8s/base/aggregator/kustomization.yaml b/k8s/base/aggregator/kustomization.yaml
new file mode 100644
index 0000000..7e3b006
--- /dev/null
+++ b/k8s/base/aggregator/kustomization.yaml
@@ -0,0 +1,11 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:  # manifests that make up the aggregator component
+  - deployment.yaml
+  - service.yaml
+  - configmap.yaml
+
+commonLabels:  # applied to every resource listed above
+  app.kubernetes.io/component: backend
+  app.kubernetes.io/name: aggregator
diff --git a/k8s/base/aggregator/service.yaml b/k8s/base/aggregator/service.yaml
new file mode 100644
index 0000000..2c4ceeb
--- /dev/null
+++ b/k8s/base/aggregator/service.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: aggregator  # gateway-config addresses this Service as aggregator:50051
+spec:
+  ports:
+    - name: grpc
+      port: 50051
+      targetPort: grpc  # the container port named "grpc" in the aggregator Deployment
+  selector:
+    app.kubernetes.io/name: aggregator
diff --git a/k8s/base/alerts/configmap.yaml b/k8s/base/alerts/configmap.yaml
new file mode 100644
index 0000000..f28df0a
--- /dev/null
+++ b/k8s/base/alerts/configmap.yaml
@@ -0,0 +1,14 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: alerts-config  # loaded wholesale through envFrom in alerts/deployment.yaml
+data:  # TIMESCALE_PASSWORD is intentionally absent: the Deployment injects it from timescaledb-secret
+  REDIS_URL: "redis://redis:6379"  # "redis" is the Service defined in k8s/base/redis
+  TIMESCALE_HOST: "timescaledb"  # headless Service defined in k8s/base/timescaledb
+  TIMESCALE_PORT: "5432"
+  TIMESCALE_USER: "monitor"  # matches POSTGRES_USER in the timescaledb StatefulSet
+  TIMESCALE_DB: "monitor"  # matches POSTGRES_DB in the timescaledb StatefulSet
+  SERVICE_NAME: "alerts"
+  EVENTS_BACKEND: "redis_pubsub"
+  LOG_LEVEL: "INFO"
+  LOG_FORMAT: "json"
diff --git a/k8s/base/alerts/deployment.yaml b/k8s/base/alerts/deployment.yaml
new file mode 100644
index 0000000..96fcfd7
--- /dev/null
+++ b/k8s/base/alerts/deployment.yaml
@@ -0,0 +1,33 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: alerts
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: alerts
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: alerts
+    spec:
+      containers:  # NOTE(review): no ports and no liveness/readiness probes here, unlike aggregator and gateway - confirm intended
+        - name: alerts
+          image: sysmonstm/alerts:latest  # overlays/local rewrites this to sysmonstm-alerts:dev
+          envFrom:
+            - configMapRef:
+                name: alerts-config
+          env:
+            - name: TIMESCALE_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: timescaledb-secret  # not part of base; an overlay must provide it (see overlays/local/secrets.yaml)
+                  key: password
+          resources:
+            requests:
+              memory: "64Mi"
+              cpu: "50m"
+            limits:
+              memory: "128Mi"
+              cpu: "200m"
diff --git a/k8s/base/alerts/kustomization.yaml b/k8s/base/alerts/kustomization.yaml
new file mode 100644
index 0000000..7204d5e
--- /dev/null
+++ b/k8s/base/alerts/kustomization.yaml
@@ -0,0 +1,10 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:  # no service.yaml: the alerts Deployment exposes no port
+  - deployment.yaml
+  - configmap.yaml
+
+commonLabels:  # applied to every resource listed above
+  app.kubernetes.io/component: backend
+  app.kubernetes.io/name: alerts
diff --git a/k8s/base/gateway/configmap.yaml b/k8s/base/gateway/configmap.yaml
new file mode 100644
index 0000000..04d0f99
--- /dev/null
+++ b/k8s/base/gateway/configmap.yaml
@@ -0,0 +1,16 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: gateway-config  # loaded wholesale through envFrom in gateway/deployment.yaml
+data:  # TIMESCALE_PASSWORD is intentionally absent: the Deployment injects it from timescaledb-secret
+  REDIS_URL: "redis://redis:6379"  # "redis" is the Service defined in k8s/base/redis
+  TIMESCALE_HOST: "timescaledb"  # headless Service defined in k8s/base/timescaledb
+  TIMESCALE_PORT: "5432"
+  TIMESCALE_USER: "monitor"  # matches POSTGRES_USER in the timescaledb StatefulSet
+  TIMESCALE_DB: "monitor"  # matches POSTGRES_DB in the timescaledb StatefulSet
+  AGGREGATOR_URL: "aggregator:50051"  # Service name and port from k8s/base/aggregator/service.yaml
+  HTTP_PORT: "8000"  # keep in sync with containerPort and the Service port
+  SERVICE_NAME: "gateway"
+  EVENTS_BACKEND: "redis_pubsub"
+  LOG_LEVEL: "INFO"
+  LOG_FORMAT: "json"
diff --git a/k8s/base/gateway/deployment.yaml b/k8s/base/gateway/deployment.yaml
new file mode 100644
index 0000000..9c11fe1
--- /dev/null
+++ b/k8s/base/gateway/deployment.yaml
@@ -0,0 +1,48 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gateway
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: gateway
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: gateway
+    spec:
+      containers:
+        - name: gateway
+          image: sysmonstm/gateway:latest  # overlays/local rewrites this to sysmonstm-gateway:dev
+          ports:
+            - containerPort: 8000
+              name: http  # referenced by name from gateway/service.yaml and by both probes
+          envFrom:
+            - configMapRef:
+                name: gateway-config
+          env:
+            - name: TIMESCALE_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: timescaledb-secret  # not part of base; an overlay must provide it (see overlays/local/secrets.yaml)
+                  key: password
+          resources:  # overlays/local/patches/reduce-resources.yaml lowers these for local clusters
+            requests:
+              memory: "128Mi"
+              cpu: "100m"
+            limits:
+              memory: "256Mi"
+              cpu: "500m"
+          livenessProbe:
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 10
+            periodSeconds: 10
+          readinessProbe:  # same endpoint as liveness, polled more often
+            httpGet:
+              path: /health
+              port: http
+            initialDelaySeconds: 5
+            periodSeconds: 5
diff --git a/k8s/base/gateway/kustomization.yaml b/k8s/base/gateway/kustomization.yaml
new file mode 100644
index 0000000..c131ee3
--- /dev/null
+++ b/k8s/base/gateway/kustomization.yaml
@@ -0,0 +1,11 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:  # manifests that make up the gateway component
+  - deployment.yaml
+  - service.yaml
+  - configmap.yaml
+
+commonLabels:  # applied to every resource listed above
+  app.kubernetes.io/component: frontend
+  app.kubernetes.io/name: gateway
diff --git a/k8s/base/gateway/service.yaml b/k8s/base/gateway/service.yaml
new file mode 100644
index 0000000..40d052d
--- /dev/null
+++ b/k8s/base/gateway/service.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: gateway
+spec:
+  ports:
+    - name: http
+      port: 8000
+      targetPort: http  # the container port named "http" in the gateway Deployment
+  selector:
+    app.kubernetes.io/name: gateway
diff --git a/k8s/base/kustomization.yaml b/k8s/base/kustomization.yaml
new file mode 100644
index 0000000..2f7a9f9
--- /dev/null
+++ b/k8s/base/kustomization.yaml
@@ -0,0 +1,17 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: sysmonstm  # same name as the Namespace object in namespace.yaml
+
+resources:
+  - namespace.yaml
+  - redis/
+  - timescaledb/
+  - aggregator/
+  - gateway/
+  - alerts/
+  # the collector is not listed here: it is deployed separately on each machine
+
+commonLabels:
+  app.kubernetes.io/managed-by: kustomize
+  app.kubernetes.io/part-of: sysmonstm
diff --git a/k8s/base/namespace.yaml b/k8s/base/namespace.yaml
new file mode 100644
index 0000000..d34ddfe
--- /dev/null
+++ b/k8s/base/namespace.yaml
@@ -0,0 +1,6 @@
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: sysmonstm  # must match the `namespace:` field in k8s/base/kustomization.yaml
+  labels:
+    app.kubernetes.io/name: sysmonstm
diff --git a/k8s/base/redis/deployment.yaml b/k8s/base/redis/deployment.yaml
new file mode 100644
index 0000000..f9b4db0
--- /dev/null
+++ b/k8s/base/redis/deployment.yaml
@@ -0,0 +1,37 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: redis
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: redis
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: redis
+    spec:
+      containers:  # no volume is mounted, so Redis data lives only in the container filesystem
+        - name: redis
+          image: redis:7-alpine
+          ports:
+            - containerPort: 6379
+              name: redis  # referenced by name from redis/service.yaml (targetPort: redis)
+          resources:
+            requests:
+              memory: "64Mi"
+              cpu: "50m"
+            limits:
+              memory: "128Mi"
+              cpu: "200m"
+          livenessProbe:
+            exec:
+              command: ["redis-cli", "ping"]
+            initialDelaySeconds: 5
+            periodSeconds: 10
+          readinessProbe:  # same command as liveness, polled more often
+            exec:
+              command: ["redis-cli", "ping"]
+            initialDelaySeconds: 5
+            periodSeconds: 5
diff --git a/k8s/base/redis/kustomization.yaml b/k8s/base/redis/kustomization.yaml
new file mode 100644
index 0000000..de9023a
--- /dev/null
+++ b/k8s/base/redis/kustomization.yaml
@@ -0,0 +1,10 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:  # manifests that make up the redis component
+  - deployment.yaml
+  - service.yaml
+
+commonLabels:  # applied to every resource listed above
+  app.kubernetes.io/component: cache
+  app.kubernetes.io/name: redis
diff --git a/k8s/base/redis/service.yaml b/k8s/base/redis/service.yaml
new file mode 100644
index 0000000..ff256b5
--- /dev/null
+++ b/k8s/base/redis/service.yaml
@@ -0,0 +1,11 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: redis  # host part of REDIS_URL (redis://redis:6379) in the service ConfigMaps
+spec:
+  ports:
+    - name: redis
+      port: 6379
+      targetPort: redis  # the container port named "redis" in the redis Deployment
+  selector:
+    app.kubernetes.io/name: redis
diff --git a/k8s/base/timescaledb/configmap.yaml b/k8s/base/timescaledb/configmap.yaml
new file mode 100644
index 0000000..d3dd5bd
--- /dev/null
+++ b/k8s/base/timescaledb/configmap.yaml
@@ -0,0 +1,164 @@
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: timescaledb-init  # mounted at /docker-entrypoint-initdb.d by the timescaledb StatefulSet
+data:  # init.sql mirrors scripts/init-db.sql so every environment gets the same schema; change both together
+  init.sql: |
+    -- TimescaleDB initialization script
+    -- Creates hypertables for time-series metrics storage
+
+    -- Enable TimescaleDB extension
+    CREATE EXTENSION IF NOT EXISTS timescaledb;
+
+    -- Raw metrics table (high resolution, short retention)
+    CREATE TABLE IF NOT EXISTS metrics_raw (
+        time        TIMESTAMPTZ NOT NULL,
+        machine_id  TEXT NOT NULL,
+        hostname    TEXT NOT NULL,
+        metric_type TEXT NOT NULL,
+        value       DOUBLE PRECISION NOT NULL,
+        labels      JSONB DEFAULT '{}'::jsonb
+    );
+
+    -- Convert to hypertable with 1-hour chunks
+    SELECT create_hypertable('metrics_raw', 'time',
+        chunk_time_interval => INTERVAL '1 hour',
+        if_not_exists => TRUE
+    );
+
+    -- Create indexes for common queries
+    CREATE INDEX IF NOT EXISTS idx_metrics_raw_machine
+        ON metrics_raw (machine_id, time DESC);
+    CREATE INDEX IF NOT EXISTS idx_metrics_raw_type
+        ON metrics_raw (metric_type, time DESC);
+
+    -- Aggregated metrics table (1-minute resolution, longer retention)
+    CREATE TABLE IF NOT EXISTS metrics_1m (
+        time        TIMESTAMPTZ NOT NULL,
+        machine_id  TEXT NOT NULL,
+        hostname    TEXT NOT NULL,
+        metric_type TEXT NOT NULL,
+        avg_value   DOUBLE PRECISION NOT NULL,
+        min_value   DOUBLE PRECISION NOT NULL,
+        max_value   DOUBLE PRECISION NOT NULL,
+        sample_count INTEGER NOT NULL
+    );
+
+    SELECT create_hypertable('metrics_1m', 'time',
+        chunk_time_interval => INTERVAL '1 day',
+        if_not_exists => TRUE
+    );
+
+    CREATE INDEX IF NOT EXISTS idx_metrics_1m_machine
+        ON metrics_1m (machine_id, time DESC);
+
+    -- Aggregated metrics table (1-hour resolution, long retention)
+    CREATE TABLE IF NOT EXISTS metrics_1h (
+        time        TIMESTAMPTZ NOT NULL,
+        machine_id  TEXT NOT NULL,
+        hostname    TEXT NOT NULL,
+        metric_type TEXT NOT NULL,
+        avg_value   DOUBLE PRECISION NOT NULL,
+        min_value   DOUBLE PRECISION NOT NULL,
+        max_value   DOUBLE PRECISION NOT NULL,
+        sample_count INTEGER NOT NULL
+    );
+
+    SELECT create_hypertable('metrics_1h', 'time',
+        chunk_time_interval => INTERVAL '1 week',
+        if_not_exists => TRUE
+    );
+
+    CREATE INDEX IF NOT EXISTS idx_metrics_1h_machine
+        ON metrics_1h (machine_id, time DESC);
+
+    -- Machines registry
+    CREATE TABLE IF NOT EXISTS machines (
+        machine_id  TEXT PRIMARY KEY,
+        hostname    TEXT NOT NULL,
+        first_seen  TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+        last_seen   TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+        metadata    JSONB DEFAULT '{}'::jsonb,
+        health      TEXT NOT NULL DEFAULT 'UNKNOWN'
+    );
+
+    -- Alert rules configuration
+    CREATE TABLE IF NOT EXISTS alert_rules (
+        id          SERIAL PRIMARY KEY,
+        name        TEXT NOT NULL UNIQUE,
+        metric_type TEXT NOT NULL,
+        operator    TEXT NOT NULL CHECK (operator IN ('gt', 'lt', 'gte', 'lte', 'eq')),
+        threshold   DOUBLE PRECISION NOT NULL,
+        severity    TEXT NOT NULL CHECK (severity IN ('warning', 'critical')),
+        enabled     BOOLEAN NOT NULL DEFAULT TRUE,
+        created_at  TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+        updated_at  TIMESTAMPTZ NOT NULL DEFAULT NOW()
+    );
+
+    -- Alert history
+    CREATE TABLE IF NOT EXISTS alerts (
+        id          SERIAL,
+        time        TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+        machine_id  TEXT NOT NULL,
+        rule_id     INTEGER REFERENCES alert_rules(id),
+        rule_name   TEXT NOT NULL,
+        metric_type TEXT NOT NULL,
+        value       DOUBLE PRECISION NOT NULL,
+        threshold   DOUBLE PRECISION NOT NULL,
+        severity    TEXT NOT NULL,
+        resolved_at TIMESTAMPTZ,
+        PRIMARY KEY (id, time)
+    );
+
+    SELECT create_hypertable('alerts', 'time',
+        chunk_time_interval => INTERVAL '1 day',
+        if_not_exists => TRUE
+    );
+
+    -- Retention policies
+    -- Raw data: 24 hours
+    SELECT add_retention_policy('metrics_raw', INTERVAL '24 hours', if_not_exists => TRUE);
+
+    -- 1-minute aggregates: 7 days
+    SELECT add_retention_policy('metrics_1m', INTERVAL '7 days', if_not_exists => TRUE);
+
+    -- 1-hour aggregates: 90 days
+    SELECT add_retention_policy('metrics_1h', INTERVAL '90 days', if_not_exists => TRUE);
+
+    -- Alerts: 30 days
+    SELECT add_retention_policy('alerts', INTERVAL '30 days', if_not_exists => TRUE);
+
+    -- Continuous aggregates for automatic downsampling
+    CREATE MATERIALIZED VIEW IF NOT EXISTS metrics_1m_agg
+    WITH (timescaledb.continuous) AS
+    SELECT
+        time_bucket('1 minute', time) AS time,
+        machine_id,
+        hostname,
+        metric_type,
+        AVG(value) AS avg_value,
+        MIN(value) AS min_value,
+        MAX(value) AS max_value,
+        COUNT(*) AS sample_count
+    FROM metrics_raw
+    GROUP BY time_bucket('1 minute', time), machine_id, hostname, metric_type
+    WITH NO DATA;
+
+    -- Refresh policy for continuous aggregate
+    SELECT add_continuous_aggregate_policy('metrics_1m_agg',
+        start_offset => INTERVAL '1 hour',
+        end_offset => INTERVAL '1 minute',
+        schedule_interval => INTERVAL '1 minute',
+        if_not_exists => TRUE
+    );
+
+    -- Insert default alert rules
+    INSERT INTO alert_rules (name, metric_type, operator, threshold, severity)
+    VALUES
+        ('High CPU Usage', 'CPU_PERCENT', 'gt', 80.0, 'warning'),
+        ('Critical CPU Usage', 'CPU_PERCENT', 'gt', 95.0, 'critical'),
+        ('High Memory Usage', 'MEMORY_PERCENT', 'gt', 85.0, 'warning'),
+        ('Critical Memory Usage', 'MEMORY_PERCENT', 'gt', 95.0, 'critical'),
+        ('High Disk Usage', 'DISK_PERCENT', 'gt', 80.0, 'warning'),
+        ('Critical Disk Usage', 'DISK_PERCENT', 'gt', 90.0, 'critical')
+    ON CONFLICT (name) DO NOTHING;
diff --git a/k8s/base/timescaledb/kustomization.yaml b/k8s/base/timescaledb/kustomization.yaml
new file mode 100644
index 0000000..ae9a73d
--- /dev/null
+++ b/k8s/base/timescaledb/kustomization.yaml
@@ -0,0 +1,11 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+resources:  # the password Secret is not listed here; an overlay has to supply it
+  - statefulset.yaml
+  - service.yaml
+  - configmap.yaml
+
+commonLabels:  # applied to every resource listed above
+  app.kubernetes.io/component: database
+  app.kubernetes.io/name: timescaledb
diff --git a/k8s/base/timescaledb/service.yaml b/k8s/base/timescaledb/service.yaml
new file mode 100644
index 0000000..4b313bb
--- /dev/null
+++ b/k8s/base/timescaledb/service.yaml
@@ -0,0 +1,12 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: timescaledb  # value of TIMESCALE_HOST in the service ConfigMaps
+spec:
+  clusterIP: None  # Headless: this is the StatefulSet's governing Service (serviceName: timescaledb)
+  ports:
+    - name: postgres
+      port: 5432
+      targetPort: postgres
+  selector:
+    app.kubernetes.io/name: timescaledb
diff --git a/k8s/base/timescaledb/statefulset.yaml b/k8s/base/timescaledb/statefulset.yaml
new file mode 100644
index 0000000..411d5d3
--- /dev/null
+++ b/k8s/base/timescaledb/statefulset.yaml
@@ -0,0 +1,67 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: timescaledb
+spec:
+  serviceName: timescaledb  # the headless Service in timescaledb/service.yaml
+  replicas: 1
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: timescaledb
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: timescaledb
+    spec:
+      containers:
+        - name: timescaledb
+          image: timescale/timescaledb:latest-pg15
+          ports:
+            - containerPort: 5432
+              name: postgres
+          env:
+            - name: POSTGRES_USER
+              value: monitor  # must match TIMESCALE_USER in the service ConfigMaps
+            - name: POSTGRES_PASSWORD
+              valueFrom:
+                secretKeyRef:
+                  name: timescaledb-secret  # not part of base; an overlay must provide it
+                  key: password
+            - name: POSTGRES_DB
+              value: monitor  # must match TIMESCALE_DB in the service ConfigMaps
+            - name: PGDATA  # use a subdirectory: a volume root that is a mountpoint (e.g. holds lost+found) makes initdb fail
+              value: /var/lib/postgresql/data/pgdata
+          resources:  # overlays/local/patches/reduce-resources.yaml lowers these for local clusters
+            requests:
+              memory: "256Mi"
+              cpu: "100m"
+            limits:
+              memory: "512Mi"
+              cpu: "500m"
+          volumeMounts:
+            - name: data
+              mountPath: /var/lib/postgresql/data
+            - name: init-scripts
+              mountPath: /docker-entrypoint-initdb.d  # scripts here run only when the data directory is first initialised
+          livenessProbe:
+            exec:
+              command: ["pg_isready", "-U", "monitor", "-d", "monitor"]
+            initialDelaySeconds: 30
+            periodSeconds: 10
+          readinessProbe:
+            exec:
+              command: ["pg_isready", "-U", "monitor", "-d", "monitor"]
+            initialDelaySeconds: 5
+            periodSeconds: 5
+      volumes:
+        - name: init-scripts
+          configMap:
+            name: timescaledb-init  # carries init.sql (timescaledb/configmap.yaml)
+  volumeClaimTemplates:
+    - metadata:
+        name: data
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        resources:
+          requests:
+            storage: 5Gi
diff --git a/k8s/overlays/local/kustomization.yaml b/k8s/overlays/local/kustomization.yaml
new file mode 100644
index 0000000..4f2fe45
--- /dev/null
+++ b/k8s/overlays/local/kustomization.yaml
@@ -0,0 +1,22 @@
+apiVersion: kustomize.config.k8s.io/v1beta1
+kind: Kustomization
+
+namespace: sysmonstm  # same namespace the base already sets
+
+resources:
+  - ../../base
+  - secrets.yaml  # supplies timescaledb-secret, which the base manifests reference but do not define
+
+patches:  # NOTE(review): the file below holds three YAML documents - confirm your kustomize version accepts multi-document files under `patches`
+  - path: patches/reduce-resources.yaml
+
+images:  # swap the registry-style names used in base for locally built images tagged "dev"
+  - name: sysmonstm/aggregator
+    newName: sysmonstm-aggregator
+    newTag: dev
+  - name: sysmonstm/gateway
+    newName: sysmonstm-gateway
+    newTag: dev
+  - name: sysmonstm/alerts
+    newName: sysmonstm-alerts
+    newTag: dev
diff --git a/k8s/overlays/local/patches/reduce-resources.yaml b/k8s/overlays/local/patches/reduce-resources.yaml
new file mode 100644
index 0000000..fa70112
--- /dev/null
+++ b/k8s/overlays/local/patches/reduce-resources.yaml
@@ -0,0 +1,50 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: aggregator  # must match the Deployment name in base for the patch to apply
+spec:
+  template:
+    spec:
+      containers:
+        - name: aggregator  # containers are matched by name
+          resources:
+            requests:
+              memory: "64Mi"  # base: 128Mi
+              cpu: "50m"  # base: 100m
+            limits:
+              memory: "128Mi"  # base: 256Mi
+              cpu: "200m"  # base: 500m
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gateway
+spec:
+  template:
+    spec:
+      containers:
+        - name: gateway
+          resources:
+            requests:
+              memory: "64Mi"  # base: 128Mi
+              cpu: "50m"  # base: 100m
+            limits:
+              memory: "128Mi"  # base: 256Mi
+              cpu: "200m"  # base: 500m
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: timescaledb
+spec:
+  template:
+    spec:
+      containers:
+        - name: timescaledb
+          resources:
+            requests:
+              memory: "128Mi"  # base: 256Mi
+              cpu: "50m"  # base: 100m
+            limits:
+              memory: "256Mi"  # base: 512Mi
+              cpu: "200m"  # base: 500m
diff --git a/k8s/overlays/local/secrets.yaml b/k8s/overlays/local/secrets.yaml
new file mode 100644
index 0000000..385cb5c
--- /dev/null
+++ b/k8s/overlays/local/secrets.yaml
@@ -0,0 +1,8 @@
+apiVersion: v1
+kind: Secret
+metadata:
+  name: timescaledb-secret  # referenced via secretKeyRef by aggregator, alerts, gateway and the timescaledb StatefulSet
+  namespace: sysmonstm
+type: Opaque
+stringData:  # plain-text form; the API server stores it base64-encoded under `data`
+  password: "monitor"  # Only for local dev! Never reuse this value or this file outside a local cluster.
diff --git a/proto/metrics.proto b/proto/metrics.proto
new file mode 100644
index 0000000..44130b9
--- /dev/null
+++ b/proto/metrics.proto
@@ -0,0 +1,159 @@
+syntax = "proto3";
+
+package monitoring;
+
+option go_package = "github.com/your-org/sysmonstm/proto";  // NOTE(review): "your-org" looks like a template placeholder
+
+// MetricsService handles streaming metrics from collectors to aggregator
+service MetricsService {
+  // Client-side streaming: collector streams metrics to aggregator
+  rpc StreamMetrics(stream Metric) returns (StreamAck) {}  // a single StreamAck answers the whole stream
+
+  // Get current state of a machine
+  rpc GetCurrentState(StateRequest) returns (MachineState) {}
+
+  // Get current state of all machines
+  rpc GetAllStates(Empty) returns (AllMachinesState) {}
+}
+
+// ControlService handles bidirectional control commands
+service ControlService {
+  // Bidirectional streaming for commands and responses
+  rpc Control(stream ControlCommand) returns (stream ControlResponse) {}  // both messages carry command_id
+}
+
+// ConfigService handles dynamic configuration
+service ConfigService {
+  // Get current configuration for a collector
+  rpc GetConfig(ConfigRequest) returns (CollectorConfig) {}
+
+  // Stream configuration updates
+  rpc WatchConfig(ConfigRequest) returns (stream CollectorConfig) {}  // server-side streaming
+}
+
+// Empty message for requests with no parameters
+message Empty {}
+
+// Basic metric message
+message Metric {
+  string machine_id = 1;
+  string hostname = 2;
+  int64 timestamp_ms = 3;  // presumably Unix epoch milliseconds - TODO confirm against the collector
+  MetricType type = 4;
+  double value = 5;
+  map<string, string> labels = 6;
+}
+
+// Batch of metrics for efficient transmission
+message MetricBatch {  // NOTE: no RPC in this file uses MetricBatch; StreamMetrics carries individual Metric messages
+  string machine_id = 1;
+  string hostname = 2;
+  int64 timestamp_ms = 3;
+  repeated MetricPoint metrics = 4;
+}
+
+message MetricPoint {  // one sample inside a MetricBatch; machine, host and timestamp live on the batch
+  MetricType type = 1;
+  double value = 2;
+  map<string, string> labels = 3;
+}
+
+enum MetricType {  // names such as CPU_PERCENT also appear as metric_type strings in the default alert_rules rows
+  METRIC_TYPE_UNSPECIFIED = 0;
+  CPU_PERCENT = 1;
+  CPU_PERCENT_PER_CORE = 2;
+  MEMORY_PERCENT = 3;
+  MEMORY_USED_BYTES = 4;
+  MEMORY_AVAILABLE_BYTES = 5;
+  DISK_PERCENT = 6;
+  DISK_USED_BYTES = 7;
+  DISK_READ_BYTES_SEC = 8;
+  DISK_WRITE_BYTES_SEC = 9;
+  NETWORK_SENT_BYTES_SEC = 10;
+  NETWORK_RECV_BYTES_SEC = 11;
+  NETWORK_CONNECTIONS = 12;
+  PROCESS_COUNT = 13;
+  LOAD_AVG_1M = 14;
+  LOAD_AVG_5M = 15;
+  LOAD_AVG_15M = 16;
+}
+
+// Acknowledgment for streamed metrics
+message StreamAck {
+  bool success = 1;
+  int64 metrics_received = 2;
+  string message = 3;
+}
+
+// Request for machine state
+message StateRequest {
+  string machine_id = 1;
+}
+
+// Current state of a single machine
+message MachineState {
+  string machine_id = 1;
+  string hostname = 2;
+  int64 last_seen_ms = 3;  // presumably Unix epoch milliseconds, like Metric.timestamp_ms - TODO confirm
+  repeated Metric current_metrics = 4;
+  HealthStatus health = 5;
+  map<string, string> metadata = 6;
+}
+
+// State of all machines
+message AllMachinesState {
+  repeated MachineState machines = 1;
+}
+
+enum HealthStatus {
+  HEALTH_STATUS_UNSPECIFIED = 0;  // proto3 default when the field is not set
+  HEALTHY = 1;
+  WARNING = 2;
+  CRITICAL = 3;
+  UNKNOWN = 4;  // distinct from UNSPECIFIED; the machines table defaults its health column to 'UNKNOWN'
+  OFFLINE = 5;
+}
+
+// Control commands for collectors
+message ControlCommand {
+  string command_id = 1;  // also present on ControlResponse
+  oneof command {  // at most one of the following is set per message
+    UpdateIntervalCommand update_interval = 2;
+    RestartCollectionCommand restart = 3;
+    ShutdownCommand shutdown = 4;
+  }
+}
+
+message UpdateIntervalCommand {
+  int32 interval_seconds = 1;
+}
+
+message RestartCollectionCommand {}  // carries no parameters
+
+message ShutdownCommand {
+  bool graceful = 1;
+}
+
+message ControlResponse {
+  string command_id = 1;  // presumably echoes ControlCommand.command_id - TODO confirm in the handler
+  bool success = 2;
+  string message = 3;
+}
+
+// Configuration messages
+message ConfigRequest {
+  string machine_id = 1;
+}
+
+message CollectorConfig {  // returned by GetConfig and streamed by WatchConfig
+  int32 collection_interval_seconds = 1;
+  repeated MetricType enabled_metrics = 2;
+  map<string, string> labels = 3;
+  repeated ThresholdConfig thresholds = 4;
+}
+
+message ThresholdConfig {  // one warning/critical threshold pair for a single metric type
+  MetricType metric_type = 1;
+  double warning_threshold = 2;
+  double critical_threshold = 3;
+}
diff --git a/scripts/generate-diagrams.sh b/scripts/generate-diagrams.sh
new file mode 100755
index 0000000..5d9c0df
--- /dev/null
+++ b/scripts/generate-diagrams.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Render every Graphviz DOT file in docs/architecture to an SVG next to it.
+# Requires: graphviz (apt install graphviz)
+
+set -e
+
+here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+diagrams="$here/../docs/architecture"
+
+cd "$diagrams"
+
+echo "Generating architecture diagrams..."
+
+for src in *.dot; do
+    # An unmatched glob stays literal ("*.dot"), so skip anything that is not a file.
+    [ -f "$src" ] || continue
+    out="${src%.dot}.svg"
+    echo "  $src -> $out"
+    dot -Tsvg "$src" -o "$out"
+done
+
+echo "Done! Open docs/architecture/index.html in a browser."
diff --git a/scripts/init-db.sql b/scripts/init-db.sql
new file mode 100644
index 0000000..5d66634
--- /dev/null
+++ b/scripts/init-db.sql
@@ -0,0 +1,158 @@
+-- TimescaleDB initialization script (k8s/base/timescaledb/configmap.yaml embeds its own copy of this schema; keep them in sync)
+-- Creates hypertables for time-series metrics storage, plus the machines registry, alert tables, policies and default rules
+
+-- Enable TimescaleDB extension
+CREATE EXTENSION IF NOT EXISTS timescaledb;
+
+-- Raw metrics table (high resolution, short retention)
+CREATE TABLE IF NOT EXISTS metrics_raw (
+    time        TIMESTAMPTZ NOT NULL,
+    machine_id  TEXT NOT NULL,
+    hostname    TEXT NOT NULL,
+    metric_type TEXT NOT NULL,
+    value       DOUBLE PRECISION NOT NULL,
+    labels      JSONB DEFAULT '{}'::jsonb
+);
+
+-- Convert to hypertable with 1-hour chunks
+SELECT create_hypertable('metrics_raw', 'time',
+    chunk_time_interval => INTERVAL '1 hour',
+    if_not_exists => TRUE
+);
+
+-- Create indexes for common queries
+CREATE INDEX IF NOT EXISTS idx_metrics_raw_machine
+    ON metrics_raw (machine_id, time DESC);
+CREATE INDEX IF NOT EXISTS idx_metrics_raw_type
+    ON metrics_raw (metric_type, time DESC);
+
+-- Aggregated metrics table (1-minute resolution, longer retention)
+-- NOTE: a plain hypertable, separate from the metrics_1m_agg continuous aggregate below; nothing in this script fills it
+CREATE TABLE IF NOT EXISTS metrics_1m (
+    time        TIMESTAMPTZ NOT NULL,
+    machine_id  TEXT NOT NULL,
+    hostname    TEXT NOT NULL,
+    metric_type TEXT NOT NULL,
+    avg_value   DOUBLE PRECISION NOT NULL,
+    min_value   DOUBLE PRECISION NOT NULL,
+    max_value   DOUBLE PRECISION NOT NULL,
+    sample_count INTEGER NOT NULL
+);
+
+SELECT create_hypertable('metrics_1m', 'time',
+    chunk_time_interval => INTERVAL '1 day',
+    if_not_exists => TRUE
+);
+
+CREATE INDEX IF NOT EXISTS idx_metrics_1m_machine
+    ON metrics_1m (machine_id, time DESC);
+
+-- Aggregated metrics table (1-hour resolution, long retention)
+CREATE TABLE IF NOT EXISTS metrics_1h (
+    time        TIMESTAMPTZ NOT NULL,
+    machine_id  TEXT NOT NULL,
+    hostname    TEXT NOT NULL,
+    metric_type TEXT NOT NULL,
+    avg_value   DOUBLE PRECISION NOT NULL,
+    min_value   DOUBLE PRECISION NOT NULL,
+    max_value   DOUBLE PRECISION NOT NULL,
+    sample_count INTEGER NOT NULL
+);
+
+SELECT create_hypertable('metrics_1h', 'time',
+    chunk_time_interval => INTERVAL '1 week',
+    if_not_exists => TRUE
+);
+
+CREATE INDEX IF NOT EXISTS idx_metrics_1h_machine
+    ON metrics_1h (machine_id, time DESC);
+
+-- Machines registry
+CREATE TABLE IF NOT EXISTS machines (
+    machine_id  TEXT PRIMARY KEY,
+    hostname    TEXT NOT NULL,
+    first_seen  TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    last_seen   TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    metadata    JSONB DEFAULT '{}'::jsonb,
+    health      TEXT NOT NULL DEFAULT 'UNKNOWN'
+);
+
+-- Alert rules configuration
+CREATE TABLE IF NOT EXISTS alert_rules (
+    id          SERIAL PRIMARY KEY,
+    name        TEXT NOT NULL UNIQUE, -- the seed INSERT at the end relies on this for ON CONFLICT (name)
+    metric_type TEXT NOT NULL,
+    operator    TEXT NOT NULL CHECK (operator IN ('gt', 'lt', 'gte', 'lte', 'eq')),
+    threshold   DOUBLE PRECISION NOT NULL,
+    severity    TEXT NOT NULL CHECK (severity IN ('warning', 'critical')),
+    enabled     BOOLEAN NOT NULL DEFAULT TRUE,
+    created_at  TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    updated_at  TIMESTAMPTZ NOT NULL DEFAULT NOW()
+);
+
+-- Alert history
+CREATE TABLE IF NOT EXISTS alerts (
+    id          SERIAL,
+    time        TIMESTAMPTZ NOT NULL DEFAULT NOW(),
+    machine_id  TEXT NOT NULL,
+    rule_id     INTEGER REFERENCES alert_rules(id),
+    rule_name   TEXT NOT NULL,
+    metric_type TEXT NOT NULL,
+    value       DOUBLE PRECISION NOT NULL,
+    threshold   DOUBLE PRECISION NOT NULL,
+    severity    TEXT NOT NULL,
+    resolved_at TIMESTAMPTZ,
+    PRIMARY KEY (id, time) -- composite: TimescaleDB requires the partitioning column (time) in unique constraints
+);
+
+SELECT create_hypertable('alerts', 'time',
+    chunk_time_interval => INTERVAL '1 day',
+    if_not_exists => TRUE
+);
+
+-- Retention policies
+-- Raw data: 24 hours
+SELECT add_retention_policy('metrics_raw', INTERVAL '24 hours', if_not_exists => TRUE);
+
+-- 1-minute aggregates: 7 days
+SELECT add_retention_policy('metrics_1m', INTERVAL '7 days', if_not_exists => TRUE);
+
+-- 1-hour aggregates: 90 days
+SELECT add_retention_policy('metrics_1h', INTERVAL '90 days', if_not_exists => TRUE);
+
+-- Alerts: 30 days
+SELECT add_retention_policy('alerts', INTERVAL '30 days', if_not_exists => TRUE);
+
+-- Continuous aggregates for automatic downsampling
+CREATE MATERIALIZED VIEW IF NOT EXISTS metrics_1m_agg
+WITH (timescaledb.continuous) AS
+SELECT
+    time_bucket('1 minute', time) AS time,
+    machine_id,
+    hostname,
+    metric_type,
+    AVG(value) AS avg_value,
+    MIN(value) AS min_value,
+    MAX(value) AS max_value,
+    COUNT(*) AS sample_count
+FROM metrics_raw
+GROUP BY time_bucket('1 minute', time), machine_id, hostname, metric_type
+WITH NO DATA; -- created empty; the refresh policy below materializes it
+
+-- Refresh policy for continuous aggregate
+SELECT add_continuous_aggregate_policy('metrics_1m_agg',
+    start_offset => INTERVAL '1 hour',
+    end_offset => INTERVAL '1 minute',
+    schedule_interval => INTERVAL '1 minute',
+    if_not_exists => TRUE
+);
+
+-- Insert default alert rules
+INSERT INTO alert_rules (name, metric_type, operator, threshold, severity)
+VALUES
+    ('High CPU Usage', 'CPU_PERCENT', 'gt', 80.0, 'warning'),
+    ('Critical CPU Usage', 'CPU_PERCENT', 'gt', 95.0, 'critical'),
+    ('High Memory Usage', 'MEMORY_PERCENT', 'gt', 85.0, 'warning'),
+    ('Critical Memory Usage', 'MEMORY_PERCENT', 'gt', 95.0, 'critical'),
+    ('High Disk Usage', 'DISK_PERCENT', 'gt', 80.0, 'warning'),
+    ('Critical Disk Usage', 'DISK_PERCENT', 'gt', 90.0, 'critical')
+ON CONFLICT (name) DO NOTHING; -- existing rules with the same name are left untouched on re-run
diff --git a/services/aggregator/Dockerfile b/services/aggregator/Dockerfile
new file mode 100644
index 0000000..c4fb40e
--- /dev/null
+++ b/services/aggregator/Dockerfile
@@ -0,0 +1,47 @@
+# Multi-stage Dockerfile for Aggregator service (build context: repository root)
+
+FROM python:3.11-slim AS base
+
+WORKDIR /app
+
+# Install curl and grpc_health_probe; the asset is picked by image architecture (amd64/arm64) so the k8s exec probes can run it
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && curl -fsSL "https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/v0.4.24/grpc_health_probe-linux-$(dpkg --print-architecture)" \
+       -o /bin/grpc_health_probe \
+    && chmod +x /bin/grpc_health_probe \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY services/aggregator/requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY shared /app/shared
+COPY proto /app/proto
+
+RUN python -m grpc_tools.protoc \
+    -I/app/proto \
+    --python_out=/app/shared \
+    --grpc_python_out=/app/shared \
+    /app/proto/metrics.proto
+
+COPY services/aggregator /app/services/aggregator
+
+ENV PYTHONPATH=/app
+ENV PYTHONUNBUFFERED=1
+
+# =============================================================================
+FROM base AS development
+
+RUN pip install --no-cache-dir watchfiles
+
+CMD ["python", "-m", "watchfiles", "python services/aggregator/main.py", "/app/services/aggregator"]
+
+# =============================================================================
+FROM base AS production
+
+RUN useradd --create-home --shell /bin/bash appuser
+USER appuser
+
+EXPOSE 50051
+
+CMD ["python", "services/aggregator/main.py"]
diff --git a/services/aggregator/requirements.txt b/services/aggregator/requirements.txt
new file mode 100644
index 0000000..eea914a
--- /dev/null
+++ b/services/aggregator/requirements.txt
@@ -0,0 +1,9 @@
+grpcio>=1.60.0
+grpcio-tools>=1.60.0  # provides grpc_tools.protoc, which the Dockerfile runs at image build time
+grpcio-health-checking>=1.60.0  # presumably backs the health service queried by grpc_health_probe - confirm in main.py
+redis>=5.0.0
+asyncpg>=0.29.0
+structlog>=23.2.0
+python-json-logger>=2.0.7
+pydantic>=2.5.0
+pydantic-settings>=2.1.0
diff --git a/services/alerts/Dockerfile b/services/alerts/Dockerfile
new file mode 100644
index 0000000..d1300a9
--- /dev/null
+++ b/services/alerts/Dockerfile
@@ -0,0 +1,35 @@
+# Alerts service image: base -> development | production (build context: repository root)
+
+FROM python:3.11-slim AS base
+
+WORKDIR /app
+
+RUN apt-get update \
+    && apt-get install -y --no-install-recommends curl \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY services/alerts/requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r /app/requirements.txt
+
+COPY shared /app/shared
+COPY proto /app/proto
+
+COPY services/alerts /app/services/alerts
+
+ENV PYTHONPATH=/app \
+    PYTHONUNBUFFERED=1
+
+# ---- development: restart the service whenever its sources change -----------
+FROM base AS development
+
+RUN pip install --no-cache-dir watchfiles
+
+CMD ["python", "-m", "watchfiles", "python services/alerts/main.py", "/app/services/alerts"]
+
+# ---- production: same code, run as an unprivileged user ---------------------
+FROM base AS production
+
+RUN useradd --create-home --shell /bin/bash appuser
+USER appuser
+
+CMD ["python", "services/alerts/main.py"]
diff --git a/services/alerts/requirements.txt b/services/alerts/requirements.txt
new file mode 100644
index 0000000..dc6d7d7
--- /dev/null
+++ b/services/alerts/requirements.txt
@@ -0,0 +1,6 @@
+redis>=5.0.0  # no grpcio packages here: the alerts Dockerfile does not generate or use gRPC stubs
+asyncpg>=0.29.0
+structlog>=23.2.0
+python-json-logger>=2.0.7
+pydantic>=2.5.0
+pydantic-settings>=2.1.0
diff --git a/services/collector/Dockerfile b/services/collector/Dockerfile
new file mode 100644
index 0000000..b514f26
--- /dev/null
+++ b/services/collector/Dockerfile
@@ -0,0 +1,55 @@
+# Multi-stage Dockerfile for Collector service
+# Stages: base -> development, base -> production
+
+# =============================================================================
+# Base stage - common dependencies
+# =============================================================================
+FROM python:3.11-slim AS base
+
+WORKDIR /app
+
+# Install system dependencies
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install Python dependencies
+COPY services/collector/requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy shared code and proto
+COPY shared /app/shared
+COPY proto /app/proto
+
+# Generate gRPC code from proto (output is written into /app/shared)
+RUN python -m grpc_tools.protoc \
+    -I/app/proto \
+    --python_out=/app/shared \
+    --grpc_python_out=/app/shared \
+    /app/proto/metrics.proto
+
+# Copy service code
+COPY services/collector /app/services/collector
+
+ENV PYTHONPATH=/app
+ENV PYTHONUNBUFFERED=1
+
+# =============================================================================
+# Development stage - with hot reload
+# =============================================================================
+FROM base AS development
+
+RUN pip install --no-cache-dir watchfiles
+
+CMD ["python", "-m", "watchfiles", "python services/collector/main.py", "/app/services/collector"]
+
+# =============================================================================
+# Production stage - optimized
+# =============================================================================
+FROM base AS production
+
+# Run as non-root user
+RUN useradd --create-home --shell /bin/bash appuser
+USER appuser
+
+CMD ["python", "services/collector/main.py"]
diff --git a/services/collector/requirements.txt b/services/collector/requirements.txt
new file mode 100644
index 0000000..9a806f4
--- /dev/null
+++ b/services/collector/requirements.txt
@@ -0,0 +1,7 @@
+grpcio>=1.60.0
+grpcio-tools>=1.60.0
+psutil>=5.9.0
+structlog>=23.2.0
+python-json-logger>=2.0.7
+pydantic>=2.5.0
+pydantic-settings>=2.1.0
diff --git a/services/gateway/Dockerfile b/services/gateway/Dockerfile
new file mode 100644
index 0000000..ad54609
--- /dev/null
+++ b/services/gateway/Dockerfile
@@ -0,0 +1,44 @@
+# Multi-stage Dockerfile for Gateway service (FastAPI); build from the repo root
+
+FROM python:3.11-slim AS base
+
+WORKDIR /app
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY services/gateway/requirements.txt /app/requirements.txt
+RUN pip install --no-cache-dir -r requirements.txt
+
+COPY shared /app/shared
+COPY proto /app/proto
+
+RUN python -m grpc_tools.protoc \
+    -I/app/proto \
+    --python_out=/app/shared \
+    --grpc_python_out=/app/shared \
+    /app/proto/metrics.proto
+
+COPY services/gateway /app/services/gateway
+COPY web /app/web
+
+ENV PYTHONPATH=/app
+ENV PYTHONUNBUFFERED=1
+
+# ===== Development stage - uvicorn with --reload =====
+FROM base AS development
+
+RUN pip install --no-cache-dir watchfiles
+
+CMD ["uvicorn", "services.gateway.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]
+
+# ===== Production stage - non-root user, two uvicorn workers =====
+FROM base AS production
+
+RUN useradd --create-home --shell /bin/bash appuser
+USER appuser
+
+EXPOSE 8000
+
+CMD ["uvicorn", "services.gateway.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
diff --git a/services/gateway/requirements.txt b/services/gateway/requirements.txt
new file mode 100644
index 0000000..ead958c
--- /dev/null
+++ b/services/gateway/requirements.txt
@@ -0,0 +1,13 @@
+fastapi>=0.109.0
+uvicorn[standard]>=0.27.0
+grpcio>=1.60.0
+grpcio-tools>=1.60.0
+redis>=5.0.0
+asyncpg>=0.29.0
+websockets>=12.0
+jinja2>=3.1.2
+structlog>=23.2.0
+python-json-logger>=2.0.7
+pydantic>=2.5.0
+pydantic-settings>=2.1.0
+httpx>=0.26.0
diff --git a/shared/events/__init__.py b/shared/events/__init__.py
new file mode 100644
index 0000000..ccf1c20
--- /dev/null
+++ b/shared/events/__init__.py
@@ -0,0 +1,34 @@
+"""
+Event publishing/subscribing abstraction layer.
+
+Backends:
+- Redis Pub/Sub (default, simple; the only backend implemented so far)
+- Redis Streams (future, with consumer groups and persistence)
+- Kafka (future, for high-throughput)
+
+Usage:
+    from shared.events import get_publisher, get_subscriber
+
+    # Publishing
+    async with get_publisher() as pub:
+        await pub.publish("metrics.raw", {"machine_id": "m1", ...})
+
+    # Subscribing (consume() yields Event objects)
+    async with get_subscriber(["metrics.raw", "alerts.*"]) as sub:
+        async for event in sub.consume():
+            process(event.topic, event.payload)
+"""
+
+from .base import EventPublisher, EventSubscriber, Event
+from .redis_pubsub import RedisPubSubPublisher, RedisPubSubSubscriber
+from .factory import get_publisher, get_subscriber
+
+__all__ = [
+    "EventPublisher",
+    "EventSubscriber",
+    "Event",
+    "RedisPubSubPublisher",
+    "RedisPubSubSubscriber",
+    "get_publisher",
+    "get_subscriber",
+]
diff --git a/shared/events/base.py b/shared/events/base.py
new file mode 100644
index 0000000..edbbcb8
--- /dev/null
+++ b/shared/events/base.py
@@ -0,0 +1,117 @@
+"""Abstract base classes for event publishing and subscribing."""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from datetime import datetime
+from typing import Any, AsyncIterator
+import uuid
+
+
+@dataclass
+class Event:
+    """Envelope pairing a topic and payload with an id, timestamp and source."""
+    topic: str
+    payload: dict[str, Any]
+    event_id: str = field(default_factory=lambda: str(uuid.uuid4()))
+    timestamp: datetime = field(default_factory=datetime.utcnow)
+    source: str = ""
+
+    def to_dict(self) -> dict[str, Any]:
+        """Return the envelope as a plain dict; the timestamp uses isoformat()."""
+        envelope: dict[str, Any] = {"event_id": self.event_id, "topic": self.topic}
+        envelope["timestamp"] = self.timestamp.isoformat()
+        envelope["source"] = self.source
+        envelope["payload"] = self.payload
+        return envelope
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> "Event":
+        """Rebuild an Event from a dict; only "topic" is mandatory (else KeyError)."""
+        event_id = data.get("event_id", str(uuid.uuid4()))
+        topic = data["topic"]
+        if "timestamp" in data:
+            stamp = datetime.fromisoformat(data["timestamp"])
+        else:
+            stamp = datetime.utcnow()
+        return cls(topic, data.get("payload", {}), event_id, stamp, data.get("source", ""))
+
+
+class EventPublisher(ABC):
+    """Contract for publisher backends; also works as an async context manager."""
+
+    @abstractmethod
+    async def connect(self) -> None:
+        """Open the connection to the message broker."""
+        ...
+
+    @abstractmethod
+    async def disconnect(self) -> None:
+        """Tear down the connection to the message broker."""
+        ...
+
+    @abstractmethod
+    async def publish(self, topic: str, payload: dict[str, Any], **kwargs) -> str:
+        """Send one event to a topic.
+
+        Args:
+            topic: Topic/channel that receives the event
+            payload: Event data
+            **kwargs: Extra options (e.g., headers, partition key)
+
+        Returns:
+            ID of the published event
+        """
+        ...
+
+    async def publish_event(self, event: Event) -> str:
+        """Publish an existing Event; forwards its topic, payload and event_id only."""
+        published_id = await self.publish(event.topic, event.payload, event_id=event.event_id)
+        return published_id
+
+    async def __aenter__(self) -> "EventPublisher":
+        await self.connect()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        await self.disconnect()
+
+
+class EventSubscriber(ABC):
+    """Contract for subscriber backends; also works as an async context manager."""
+
+    @abstractmethod
+    async def connect(self) -> None:
+        """Establish connection to the message broker."""
+        pass
+
+    @abstractmethod
+    async def disconnect(self) -> None:
+        """Close connection and unsubscribe."""
+        pass
+
+    @abstractmethod
+    async def subscribe(self, topics: list[str]) -> None:
+        """
+        Subscribe to one or more topics.
+
+        Args:
+            topics: List of topics/patterns to subscribe to
+        """
+        pass
+
+    @abstractmethod
+    def consume(self) -> AsyncIterator[Event]:
+        """
+        Return an async iterator of events; implement as ``async def`` with ``yield``.
+
+        Declared without ``async``: an ``async def`` that contains no ``yield`` is
+        typed as a coroutine returning an iterator, which breaks ``async for``.
+        """
+        pass
+
+    async def __aenter__(self) -> "EventSubscriber":
+        await self.connect()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
+        await self.disconnect()
diff --git a/shared/events/factory.py b/shared/events/factory.py
new file mode 100644
index 0000000..b7d94c3
--- /dev/null
+++ b/shared/events/factory.py
@@ -0,0 +1,101 @@
+"""Factory functions for creating event publishers and subscribers."""
+
+import os
+from enum import Enum
+
+from .base import EventPublisher, EventSubscriber
+from .redis_pubsub import RedisPubSubPublisher, RedisPubSubSubscriber
+
+
+class EventBackend(str, Enum):
+    """Supported event backends (str-valued, so members equal their string value)."""
+
+    REDIS_PUBSUB = "redis_pubsub"
+    REDIS_STREAMS = "redis_streams"  # Future: factories raise NotImplementedError
+    KAFKA = "kafka"  # Future: factories raise NotImplementedError
+
+
+def get_publisher(
+    backend: EventBackend | str | None = None,
+    source: str = "",
+    **kwargs,
+) -> EventPublisher:
+    """
+    Build the event publisher for the selected backend.
+
+    Args:
+        backend: Backend to use; when None, EVENTS_BACKEND is read (default redis_pubsub)
+        source: Identifier of the publishing service, passed on to the publisher
+        **kwargs: Backend-specific options; only "redis_url" is read here
+
+    Returns:
+        An EventPublisher instance (not yet connected)
+
+    Environment variables:
+        EVENTS_BACKEND: Default backend (redis_pubsub, redis_streams, kafka)
+        REDIS_URL: Redis connection URL
+        KAFKA_BOOTSTRAP_SERVERS: Kafka bootstrap servers (future)
+    """
+    chosen = backend
+    if chosen is None:
+        # No explicit choice: fall back to the environment, then to Redis Pub/Sub.
+        chosen = os.getenv("EVENTS_BACKEND", EventBackend.REDIS_PUBSUB)
+    if isinstance(chosen, str):
+        # Normalise plain strings to the enum (ValueError for unknown names).
+        chosen = EventBackend(chosen)
+
+    if chosen == EventBackend.REDIS_PUBSUB:
+        # An explicit redis_url kwarg takes precedence over REDIS_URL.
+        redis_url = kwargs.get("redis_url")
+        if not redis_url:
+            redis_url = os.getenv("REDIS_URL", "redis://localhost:6379")
+        return RedisPubSubPublisher(redis_url=redis_url, source=source)
+
+    if chosen == EventBackend.REDIS_STREAMS:
+        raise NotImplementedError("Redis Streams backend not yet implemented")
+    if chosen == EventBackend.KAFKA:
+        raise NotImplementedError("Kafka backend not yet implemented")
+    raise ValueError(f"Unknown event backend: {chosen}")
+
+
+def get_subscriber(
+    topics: list[str] | None = None,
+    backend: EventBackend | str | None = None,
+    **kwargs,
+) -> EventSubscriber:
+    """
+    Build the event subscriber for the selected backend.
+
+    Args:
+        topics: Topics handed to the subscriber for subscription
+        backend: Backend to use; when None, EVENTS_BACKEND is read (default redis_pubsub)
+        **kwargs: Backend-specific options; only "redis_url" is read here
+
+    Returns:
+        An EventSubscriber instance (not yet connected)
+
+    Environment variables:
+        EVENTS_BACKEND: Default backend (redis_pubsub, redis_streams, kafka)
+        REDIS_URL: Redis connection URL
+        KAFKA_BOOTSTRAP_SERVERS: Kafka bootstrap servers (future)
+    """
+    chosen = backend
+    if chosen is None:
+        # No explicit choice: fall back to the environment, then to Redis Pub/Sub.
+        chosen = os.getenv("EVENTS_BACKEND", EventBackend.REDIS_PUBSUB)
+    if isinstance(chosen, str):
+        # Normalise plain strings to the enum (ValueError for unknown names).
+        chosen = EventBackend(chosen)
+
+    if chosen == EventBackend.REDIS_PUBSUB:
+        # An explicit redis_url kwarg takes precedence over REDIS_URL.
+        redis_url = kwargs.get("redis_url")
+        if not redis_url:
+            redis_url = os.getenv("REDIS_URL", "redis://localhost:6379")
+        return RedisPubSubSubscriber(redis_url=redis_url, topics=topics)
+
+    if chosen == EventBackend.REDIS_STREAMS:
+        raise NotImplementedError("Redis Streams backend not yet implemented")
+    if chosen == EventBackend.KAFKA:
+        raise NotImplementedError("Kafka backend not yet implemented")
+    raise ValueError(f"Unknown event backend: {chosen}")
diff --git a/shared/events/redis_pubsub.py b/shared/events/redis_pubsub.py
new file mode 100644
index 0000000..0bffe81
--- /dev/null
+++ b/shared/events/redis_pubsub.py
@@ -0,0 +1,142 @@
+"""Redis Pub/Sub implementation of event publishing/subscribing."""
+
+import asyncio
+import json
+import logging
+from typing import Any, AsyncIterator
+
+import redis.asyncio as redis
+
+from .base import Event, EventPublisher, EventSubscriber
+
+logger = logging.getLogger(__name__)
+
+
+class RedisPubSubPublisher(EventPublisher):
+    """Publishes JSON-encoded Event envelopes on Redis Pub/Sub channels."""
+
+    def __init__(self, redis_url: str = "redis://localhost:6379", source: str = ""):
+        """Store settings only; no network I/O happens until connect()."""
+        self.redis_url = redis_url
+        self.source = source
+        self._client: redis.Redis | None = None
+
+    async def connect(self) -> None:
+        """Create the client and verify that the server answers PING."""
+        self._client = redis.from_url(self.redis_url, decode_responses=True)
+        await self._client.ping()
+        # Log only what follows any "user:password@" so credentials stay out of logs.
+        logger.info("Connected to Redis at %s", self.redis_url.rsplit("@", 1)[-1])
+
+    async def disconnect(self) -> None:
+        """Close the client if one is open; a no-op otherwise."""
+        if self._client:
+            await self._client.close()
+            self._client = None
+            logger.info("Disconnected from Redis")
+
+    async def publish(self, topic: str, payload: dict[str, Any], **kwargs) -> str:
+        """Publish payload on topic and return the event ID.
+
+        A truthy ``event_id`` kwarg is reused; otherwise Event generates a UUID.
+        Raises RuntimeError if the publisher is not connected.
+        """
+        if not self._client:
+            raise RuntimeError("Publisher not connected")
+
+        event = Event(topic=topic, payload=payload, source=self.source)
+        if kwargs.get("event_id"):
+            event.event_id = kwargs["event_id"]
+
+        await self._client.publish(topic, json.dumps(event.to_dict()))
+        # Lazy %-formatting: the message is only built when DEBUG is enabled.
+        logger.debug("Published event %s to %s", event.event_id, topic)
+        return event.event_id
+
+
+class RedisPubSubSubscriber(EventSubscriber):
+    """Redis Pub/Sub subscriber yielding Events decoded from JSON messages."""
+
+    def __init__(
+        self,
+        redis_url: str = "redis://localhost:6379",
+        topics: list[str] | None = None,
+    ):
+        self.redis_url = redis_url
+        # Copy so that subscribe() never mutates the caller's list.
+        self._topics = list(topics or [])
+        self._client: redis.Redis | None = None
+        self._pubsub: redis.client.PubSub | None = None
+        self._running = False
+
+    async def connect(self) -> None:
+        """Connect, verify with PING, and subscribe to any topics given at init."""
+        self._client = redis.from_url(self.redis_url, decode_responses=True)
+        await self._client.ping()
+        self._pubsub = self._client.pubsub()
+        # Log only what follows any "user:password@" so credentials stay out of logs.
+        logger.info("Connected to Redis at %s", self.redis_url.rsplit("@", 1)[-1])
+        if self._topics:
+            await self.subscribe(self._topics)
+
+    async def disconnect(self) -> None:
+        """Stop consume(), then release the pub/sub handle and the client."""
+        self._running = False
+        if self._pubsub:
+            await self._pubsub.unsubscribe()
+            await self._pubsub.close()
+            self._pubsub = None
+        if self._client:
+            await self._client.close()
+            self._client = None
+        logger.info("Disconnected from Redis")
+
+    async def subscribe(self, topics: list[str]) -> None:
+        """Subscribe to topics; names containing "*" are subscribed as patterns."""
+        if not self._pubsub:
+            raise RuntimeError("Subscriber not connected")
+
+        patterns = [t for t in topics if "*" in t]
+        channels = [t for t in topics if "*" not in t]
+        if channels:
+            await self._pubsub.subscribe(*channels)
+            logger.info(f"Subscribed to channels: {channels}")
+        if patterns:
+            await self._pubsub.psubscribe(*patterns)
+            logger.info(f"Subscribed to patterns: {patterns}")
+
+        # connect() passes self._topics itself, so a blind extend() would double
+        # the list on every (re)connect; record only topics not tracked yet.
+        self._topics.extend([t for t in topics if t not in self._topics])
+
+    async def consume(self) -> AsyncIterator[Event]:
+        """Yield Events until disconnect(); cancellation ends iteration quietly."""
+        if not self._pubsub:
+            raise RuntimeError("Subscriber not connected")
+
+        self._running = True
+        while self._running:
+            try:
+                message = await self._pubsub.get_message(
+                    ignore_subscribe_messages=True,
+                    timeout=1.0,
+                )
+                if message is None:
+                    await asyncio.sleep(0.01)
+                    continue
+                if message["type"] not in ("message", "pmessage"):
+                    continue
+                try:
+                    event = Event.from_dict(json.loads(message["data"]))
+                except (ValueError, KeyError, TypeError, AttributeError) as e:
+                    # Malformed message: skip it without the 1s error backoff.
+                    logger.warning(f"Failed to parse event: {e}")
+                    continue
+                yield event
+
+            except asyncio.CancelledError:
+                self._running = False
+                break
+            except Exception as e:
+                logger.error(f"Error consuming events: {e}")
+                await asyncio.sleep(1.0)