new three layer deployment

This commit is contained in:
buenosairesam
2026-01-22 12:55:50 -03:00
parent 174bc15368
commit dc3518f138
15 changed files with 766 additions and 643 deletions

82
ctrl/README.md Normal file
View File

@@ -0,0 +1,82 @@
# Deployment Configurations
This directory contains deployment configurations for sysmonstm.
## Architecture
```
┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Collector │────▶│ Hub │────▶│ Edge │────▶│ Browser │
│ (mcrn) │ │ (local) │ │ (AWS) │ │ │
└─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘
┌─────────────┐ │
│ Collector │────────────┘
│ (nfrt) │
└─────────────┘
```
## Directory Structure
```
ctrl/
├── collector/ # Lightweight agent for each monitored machine
├── hub/ # Local aggregator (receives from collectors, forwards to edge)
├── edge/ # Cloud dashboard (public-facing, receives from hub)
└── dev/ # Full gRPC stack for development
```
## Production Deployment (3-tier)
### 1. Edge (AWS)
Public-facing dashboard that receives metrics from the hub.
```bash
cd ctrl/edge
docker compose up -d
```
### 2. Hub (Local Server)
Runs on your local network; it receives metrics from collectors and forwards them to the edge.
```bash
cd ctrl/hub
EDGE_URL=wss://sysmonstm.mcrn.ar/ws EDGE_API_KEY=xxx docker compose up -d
```
### 3. Collectors (Each Machine)
Run on each machine you want to monitor.
```bash
docker run -d --name sysmonstm-collector --network host \
-e HUB_URL=ws://hub-machine:8080/ws \
-e MACHINE_ID=$(hostname) \
-e API_KEY=xxx \
registry.mcrn.ar/sysmonstm/collector:latest
```
## Development (Full Stack)
For local development with the complete gRPC-based architecture:
```bash
# From repo root
docker compose up
```
This runs: aggregator, gateway, collector, alerts, redis, timescaledb
## Environment Variables
### Collector
- `HUB_URL` - WebSocket URL of hub (default: ws://localhost:8080/ws)
- `MACHINE_ID` - Identifier for this machine (default: hostname)
- `API_KEY` - Authentication key
- `INTERVAL` - Seconds between collections (default: 5)
### Hub
- `API_KEY` - Key required from collectors
- `EDGE_URL` - WebSocket URL of edge (optional, for forwarding)
- `EDGE_API_KEY` - Key for authenticating to edge
### Edge
- `API_KEY` - Key required from hub

16
ctrl/collector/Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
# Standalone collector image: ships only collector.py and its two runtime deps.
FROM python:3.11-slim
WORKDIR /app
# psutil samples system metrics; websockets is the transport to the hub.
RUN pip install --no-cache-dir psutil websockets
COPY collector.py .
# Default environment variables (overridable at `docker run` time;
# empty MACHINE_ID/API_KEY mean "use hostname" / "no auth" in collector.py)
ENV HUB_URL=ws://localhost:8080/ws
ENV MACHINE_ID=""
ENV API_KEY=""
ENV INTERVAL=5
ENV LOG_LEVEL=INFO
CMD ["python", "collector.py"]

136
ctrl/collector/collector.py Normal file
View File

@@ -0,0 +1,136 @@
#!/usr/bin/env python3
"""Lightweight WebSocket metrics collector for sysmonstm standalone deployment."""
import asyncio
import json
import logging
import os
import socket
import time

import psutil

# Configuration from environment (all optional; sensible defaults below)
HUB_URL = os.environ.get("HUB_URL", "ws://localhost:8080/ws")  # hub WebSocket endpoint
MACHINE_ID = os.environ.get("MACHINE_ID", socket.gethostname())  # id reported with each sample
API_KEY = os.environ.get("API_KEY", "")  # appended to the URL as ?key=... when set
INTERVAL = int(os.environ.get("INTERVAL", "5"))  # seconds between samples
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")

# Logging setup
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),  # bad values fall back to INFO
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger("collector")
def collect_metrics() -> dict:
    """Sample system metrics via psutil and return them as one flat dict.

    Every probe is best-effort: a metric group that is unavailable on this
    platform (or denied by permissions) is simply omitted rather than
    failing the whole sample. The payload is always tagged with the
    machine identity and a UNIX timestamp.
    """
    gib = 1024 ** 3  # bytes-per-GiB divisor, shared by memory and disk probes
    sample: dict = {
        "type": "metrics",
        "machine_id": MACHINE_ID,
        "hostname": socket.gethostname(),
        "timestamp": time.time(),
    }
    # CPU utilisation since the previous call (primed once at startup).
    try:
        sample["cpu"] = psutil.cpu_percent(interval=None)
    except Exception:
        pass
    # Virtual memory usage.
    try:
        vm = psutil.virtual_memory()
        sample["memory"] = vm.percent
        sample["memory_used_gb"] = round(vm.used / gib, 2)
        sample["memory_total_gb"] = round(vm.total / gib, 2)
    except Exception:
        pass
    # Root filesystem usage.
    try:
        du = psutil.disk_usage("/")
        sample["disk"] = du.percent
        sample["disk_used_gb"] = round(du.used / gib, 2)
        sample["disk_total_gb"] = round(du.total / gib, 2)
    except Exception:
        pass
    # Load averages are Unix-only (AttributeError/OSError elsewhere).
    try:
        for label, value in zip(("load_1m", "load_5m", "load_15m"), psutil.getloadavg()):
            sample[label] = round(value, 2)
    except (AttributeError, OSError):
        pass
    # Count of open inet sockets; may require elevated privileges.
    try:
        sample["connections"] = len(psutil.net_connections(kind="inet"))
    except (psutil.AccessDenied, PermissionError):
        pass
    # Total number of running processes.
    try:
        sample["processes"] = len(psutil.pids())
    except Exception:
        pass
    return sample
async def run_collector():
    """Stream metrics to the hub forever, reconnecting 5s after any failure.

    Runs two nested loops: the outer one (re)establishes the WebSocket,
    the inner one sends one sample every INTERVAL seconds until the
    connection drops. Cancellation exits cleanly.
    """
    import websockets

    # Attach the API key as a query parameter when one is configured,
    # respecting any query string already present in HUB_URL.
    endpoint = HUB_URL
    if API_KEY:
        endpoint = f"{endpoint}{'&' if '?' in endpoint else '?'}key={API_KEY}"

    # Prime cpu_percent: its first call always reports 0.
    psutil.cpu_percent(interval=None)

    while True:
        try:
            log.info(f"Connecting to {HUB_URL}...")  # log URL without the key
            async with websockets.connect(endpoint) as ws:
                log.info(
                    f"Connected. Sending metrics every {INTERVAL}s as '{MACHINE_ID}'"
                )
                while True:
                    sample = collect_metrics()
                    await ws.send(json.dumps(sample))
                    log.debug(
                        f"Sent: cpu={sample.get('cpu', '?')}% mem={sample.get('memory', '?')}% disk={sample.get('disk', '?')}%"
                    )
                    await asyncio.sleep(INTERVAL)
        except asyncio.CancelledError:
            log.info("Collector stopped")
            break
        except Exception as e:
            log.warning(f"Connection error: {e}. Reconnecting in 5s...")
            await asyncio.sleep(5)
def main():
    """Entry point: log the effective configuration, then run the async loop."""
    log.info("sysmonstm collector starting")
    for detail in (
        f" Hub: {HUB_URL}",
        f" Machine: {MACHINE_ID}",
        f" Interval: {INTERVAL}s",
    ):
        log.info(detail)
    try:
        asyncio.run(run_collector())
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop a foreground collector.
        log.info("Stopped")


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,48 @@
# Development overrides - hot reload, mounted volumes, debug settings
# Usage: docker compose up (automatically includes this file)
# NOTE(review): the top-level `version` key is obsolete in Compose v2 — confirm
# whether it can be dropped.
version: "3.8"

services:
  aggregator:
    build:
      # Stop the multi-stage build at the development stage.
      target: development
    volumes:
      # Read-only bind mounts so in-container code tracks the working tree.
      - ./services/aggregator:/app/services/aggregator:ro
      - ./shared:/app/shared:ro
      - ./proto:/app/proto:ro
    environment:
      LOG_LEVEL: DEBUG
      RELOAD: "true"

  gateway:
    build:
      target: development
    volumes:
      - ./services/gateway:/app/services/gateway:ro
      - ./shared:/app/shared:ro
      - ./proto:/app/proto:ro
      - ./web:/app/web:ro
    environment:
      LOG_LEVEL: DEBUG
      RELOAD: "true"

  alerts:
    build:
      target: development
    volumes:
      - ./services/alerts:/app/services/alerts:ro
      - ./shared:/app/shared:ro
    environment:
      LOG_LEVEL: DEBUG

  collector:
    build:
      target: development
    volumes:
      - ./services/collector:/app/services/collector:ro
      - ./shared:/app/shared:ro
      - ./proto:/app/proto:ro
    environment:
      LOG_LEVEL: DEBUG
      # Faster sampling for quicker feedback during development.
      COLLECTION_INTERVAL: 2

154
ctrl/dev/docker-compose.yml Normal file
View File

@@ -0,0 +1,154 @@
version: "3.8"
# This file works both locally and on EC2 for demo purposes.
# For local dev with hot-reload, use: docker compose -f docker-compose.yml -f docker-compose.override.yml up

# Shared environment for all application services (merged via YAML anchors).
x-common-env: &common-env
  REDIS_URL: redis://redis:6379
  TIMESCALE_URL: postgresql://monitor:monitor@timescaledb:5432/monitor
  EVENTS_BACKEND: redis_pubsub
  LOG_LEVEL: ${LOG_LEVEL:-INFO}
  LOG_FORMAT: json

# Common healthcheck timing; each service supplies only its own `test`.
x-healthcheck-defaults: &healthcheck-defaults
  interval: 10s
  timeout: 5s
  retries: 3
  start_period: 10s

services:
  # =============================================================================
  # Infrastructure
  # =============================================================================
  redis:
    image: redis:7-alpine
    ports:
      - "${REDIS_PORT:-6379}:6379"
    volumes:
      - redis-data:/data
    healthcheck:
      <<: *healthcheck-defaults
      test: ["CMD", "redis-cli", "ping"]
    deploy:
      resources:
        limits:
          memory: 128M

  timescaledb:
    image: timescale/timescaledb:latest-pg15
    environment:
      POSTGRES_USER: monitor
      POSTGRES_PASSWORD: monitor
      POSTGRES_DB: monitor
    ports:
      - "${TIMESCALE_PORT:-5432}:5432"
    volumes:
      - timescale-data:/var/lib/postgresql/data
      # Schema bootstrap; runs only on first start of an empty data volume.
      - ./scripts/init-db.sql:/docker-entrypoint-initdb.d/init.sql:ro
    healthcheck:
      <<: *healthcheck-defaults
      test: ["CMD-SHELL", "pg_isready -U monitor -d monitor"]
    deploy:
      resources:
        limits:
          memory: 512M

  # =============================================================================
  # Application Services
  # =============================================================================
  aggregator:
    build:
      context: .
      dockerfile: services/aggregator/Dockerfile
    environment:
      <<: *common-env
      GRPC_PORT: 50051
      SERVICE_NAME: aggregator
    ports:
      - "${AGGREGATOR_GRPC_PORT:-50051}:50051"
    depends_on:
      # Wait for healthy infrastructure, not just container start.
      redis:
        condition: service_healthy
      timescaledb:
        condition: service_healthy
    healthcheck:
      <<: *healthcheck-defaults
      test: ["CMD", "/bin/grpc_health_probe", "-addr=:50051"]
    deploy:
      resources:
        limits:
          memory: 256M

  gateway:
    build:
      context: .
      dockerfile: services/gateway/Dockerfile
    environment:
      <<: *common-env
      HTTP_PORT: 8000
      AGGREGATOR_URL: aggregator:50051
      SERVICE_NAME: gateway
    ports:
      - "${GATEWAY_PORT:-8000}:8000"
    depends_on:
      - aggregator
      - redis
    healthcheck:
      <<: *healthcheck-defaults
      test: ["CMD", "curl", "-f", "http://localhost:8000/health"]
    deploy:
      resources:
        limits:
          memory: 256M

  alerts:
    build:
      context: .
      dockerfile: services/alerts/Dockerfile
    environment:
      <<: *common-env
      SERVICE_NAME: alerts
    depends_on:
      redis:
        condition: service_healthy
      timescaledb:
        condition: service_healthy
    healthcheck:
      <<: *healthcheck-defaults
      # Placeholder check: verifies the Python runtime only, not service health.
      test: ["CMD", "python", "-c", "import sys; sys.exit(0)"]
    deploy:
      resources:
        limits:
          memory: 128M

  # Collector runs separately on each machine being monitored
  # For local testing, we run one instance
  collector:
    build:
      context: .
      dockerfile: services/collector/Dockerfile
    environment:
      <<: *common-env
      AGGREGATOR_URL: aggregator:50051
      MACHINE_ID: ${MACHINE_ID:-local-dev}
      COLLECTION_INTERVAL: ${COLLECTION_INTERVAL:-5}
      SERVICE_NAME: collector
    depends_on:
      - aggregator
    deploy:
      resources:
        limits:
          memory: 64M
    # For actual system metrics, you might need:
    # privileged: true
    # pid: host

volumes:
  redis-data:
  timescale-data:

networks:
  default:
    name: sysmonstm

14
ctrl/edge/Dockerfile Normal file
View File

@@ -0,0 +1,14 @@
# sysmonstm edge image: public-facing dashboard served by uvicorn.
FROM python:3.11-slim
WORKDIR /app
# FastAPI app plus websockets for the hub/browser connections.
RUN pip install --no-cache-dir fastapi uvicorn[standard] websockets
COPY edge.py .
# Empty API_KEY disables authentication (open access).
ENV API_KEY=""
ENV LOG_LEVEL=INFO
EXPOSE 8080
CMD ["uvicorn", "edge:app", "--host", "0.0.0.0", "--port", "8080"]

View File

@@ -1,8 +1,11 @@
services:
sysmonstm:
edge:
build: .
container_name: sysmonstm
container_name: sysmonstm-edge
restart: unless-stopped
environment:
- API_KEY=${API_KEY:-}
- LOG_LEVEL=${LOG_LEVEL:-INFO}
ports:
- "8080:8080"
networks:

View File

@@ -1,11 +1,26 @@
"""Minimal sysmonstm gateway - standalone mode without dependencies."""
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
import json
import asyncio
import json
import logging
import os
from datetime import datetime
from fastapi import FastAPI, Query, WebSocket, WebSocketDisconnect
from fastapi.responses import HTMLResponse
# Configuration
API_KEY = os.environ.get("API_KEY", "")
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")
# Logging setup
logging.basicConfig(
level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger("gateway")
app = FastAPI(title="sysmonstm")
# Store connected websockets
@@ -107,19 +122,20 @@ HTML = """
let machines = {};
function connect() {
const ws = new WebSocket(`wss://${location.host}/ws`);
const protocol = location.protocol === 'https:' ? 'wss:' : 'ws:';
const ws = new WebSocket(`${protocol}//${location.host}/ws`);
ws.onopen = () => {
statusDot.classList.add('ok');
statusText.textContent = 'connected';
};
ws.onclose = () => {
statusDot.classList.remove('ok');
statusText.textContent = 'disconnected';
setTimeout(connect, 2000);
};
ws.onmessage = (e) => {
const data = JSON.parse(e.data);
if (data.type === 'metrics') {
@@ -128,71 +144,113 @@ HTML = """
}
};
}
function render() {
const ids = Object.keys(machines);
const ids = Object.keys(machines).sort();
if (ids.length === 0) {
machinesEl.innerHTML = '<div class="empty"><h2>No collectors connected</h2><p>Start a collector to see metrics</p></div>';
return;
}
machinesEl.innerHTML = ids.map(id => {
const m = machines[id];
const ts = m.timestamp ? new Date(m.timestamp * 1000).toLocaleTimeString() : '-';
return `
<div class="machine">
<h3>${id}</h3>
<div class="metric"><span>CPU</span><span>${m.cpu?.toFixed(1) || '-'}%</span></div>
<div class="metric"><span>Memory</span><span>${m.memory?.toFixed(1) || '-'}%</span></div>
<div class="metric"><span>Disk</span><span>${m.disk?.toFixed(1) || '-'}%</span></div>
<div class="metric"><span>Updated</span><span>${new Date(m.timestamp).toLocaleTimeString()}</span></div>
<div class="metric"><span>Load (1m)</span><span>${m.load_1m?.toFixed(2) || '-'}</span></div>
<div class="metric"><span>Processes</span><span>${m.processes || '-'}</span></div>
<div class="metric"><span>Updated</span><span>${ts}</span></div>
</div>
`;
}).join('');
}
connect();
</script>
</body>
</html>
"""
@app.get("/", response_class=HTMLResponse)
async def index():
return HTML
@app.get("/health")
async def health():
return {"status": "ok", "machines": len(machines)}
@app.get("/api/machines")
async def get_machines():
return machines
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
async def websocket_endpoint(websocket: WebSocket, key: str = Query(default="")):
# API key validation for collectors (browsers don't need key)
# Check if this looks like a collector (will send metrics) or browser (will receive)
# We validate key only when metrics are received, allowing browsers to connect freely
await websocket.accept()
connections.append(websocket)
client = websocket.client.host if websocket.client else "unknown"
log.info(f"WebSocket connected: {client}")
try:
# Send current state
# Send current state to new connection
for machine_id, data in machines.items():
await websocket.send_json({"type": "metrics", "machine_id": machine_id, **data})
# Keep alive
await websocket.send_json(
{"type": "metrics", "machine_id": machine_id, **data}
)
# Main loop
while True:
try:
msg = await asyncio.wait_for(websocket.receive_text(), timeout=30)
data = json.loads(msg)
if data.get("type") == "metrics":
# Validate API key for metric submissions
if API_KEY and key != API_KEY:
log.warning(f"Invalid API key from {client}")
await websocket.close(code=4001, reason="Invalid API key")
return
machine_id = data.get("machine_id", "unknown")
machines[machine_id] = {**data, "timestamp": datetime.utcnow().isoformat()}
# Broadcast to all
machines[machine_id] = data
log.debug(f"Metrics from {machine_id}: cpu={data.get('cpu')}%")
# Broadcast to all connected clients
for conn in connections:
try:
await conn.send_json({"type": "metrics", "machine_id": machine_id, **machines[machine_id]})
except:
await conn.send_json(
{"type": "metrics", "machine_id": machine_id, **data}
)
except Exception:
pass
except asyncio.TimeoutError:
# Send ping to keep connection alive
await websocket.send_json({"type": "ping"})
except WebSocketDisconnect:
pass
log.info(f"WebSocket disconnected: {client}")
except Exception as e:
log.error(f"WebSocket error: {e}")
finally:
connections.remove(websocket)
if websocket in connections:
connections.remove(websocket)
if __name__ == "__main__":
import uvicorn
log.info("Starting sysmonstm gateway")
log.info(f" API key: {'configured' if API_KEY else 'not set (open)'}")
uvicorn.run(app, host="0.0.0.0", port=8080)

16
ctrl/hub/Dockerfile Normal file
View File

@@ -0,0 +1,16 @@
# sysmonstm hub image: local aggregator between collectors and the edge.
FROM python:3.11-slim
WORKDIR /app
# FastAPI app plus websockets for the edge uplink.
RUN pip install --no-cache-dir fastapi uvicorn[standard] websockets
COPY hub.py .
# Empty API_KEY disables collector auth; empty EDGE_URL means local-only mode.
ENV API_KEY=""
ENV EDGE_URL=""
ENV EDGE_API_KEY=""
ENV LOG_LEVEL=INFO
EXPOSE 8080
CMD ["uvicorn", "hub:app", "--host", "0.0.0.0", "--port", "8080"]

View File

@@ -0,0 +1,12 @@
services:
hub:
build: .
container_name: sysmonstm-hub
restart: unless-stopped
environment:
- API_KEY=${API_KEY:-}
- EDGE_URL=${EDGE_URL:-}
- EDGE_API_KEY=${EDGE_API_KEY:-}
- LOG_LEVEL=${LOG_LEVEL:-INFO}
ports:
- "8080:8080"

151
ctrl/hub/hub.py Normal file
View File

@@ -0,0 +1,151 @@
#!/usr/bin/env python3
"""
sysmonstm hub - Local aggregator that receives from collectors and forwards to edge.
Runs on the local network, receives metrics from collectors via WebSocket,
and forwards them to the cloud edge.
"""
import asyncio
import json
import logging
import os

from fastapi import FastAPI, Query, WebSocket, WebSocketDisconnect

# Configuration (all via environment; empty string means "not configured")
API_KEY = os.environ.get("API_KEY", "")  # key collectors must present on /ws
EDGE_URL = os.environ.get("EDGE_URL", "")  # e.g., wss://sysmonstm.mcrn.ar/ws
EDGE_API_KEY = os.environ.get("EDGE_API_KEY", "")  # key sent upstream to the edge
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")

# Logging setup
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),  # bad values fall back to INFO
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger("hub")

app = FastAPI(title="sysmonstm-hub")

# State (module-level, single-process)
collector_connections: list[WebSocket] = []  # currently-connected collector sockets
machines: dict = {}  # machine_id -> latest metrics payload received
edge_ws = None  # live websocket to the edge, or None when disconnected
async def connect_to_edge():
    """Maintain a persistent WebSocket connection to the edge.

    Sets the module-global ``edge_ws`` while connected so that
    forward_to_edge() can push metrics upstream, and clears it whenever
    the connection is lost for any reason (error, close, or task
    cancellation). Reconnects every 5 seconds on failure. Returns
    immediately when no EDGE_URL is configured (local-only mode).
    """
    global edge_ws
    if not EDGE_URL:
        log.info("No EDGE_URL configured, running in local-only mode")
        return
    import websockets

    # Append the API key as a query parameter, preserving any existing query.
    url = EDGE_URL
    if EDGE_API_KEY:
        separator = "&" if "?" in url else "?"
        url = f"{url}{separator}key={EDGE_API_KEY}"
    while True:
        try:
            log.info(f"Connecting to edge: {EDGE_URL}")  # log URL without the key
            async with websockets.connect(url) as ws:
                edge_ws = ws
                log.info("Connected to edge")
                try:
                    while True:
                        try:
                            # Drain (and ignore) anything the edge sends; the
                            # timeout doubles as a keepalive interval.
                            await asyncio.wait_for(ws.recv(), timeout=30)
                        except asyncio.TimeoutError:
                            await ws.ping()
                finally:
                    # Fix: clear the stale handle on *every* exit path
                    # (previously it survived task cancellation, leaving
                    # forward_to_edge() sending to a closed socket).
                    edge_ws = None
        except asyncio.CancelledError:
            break
        except Exception as e:
            edge_ws = None
            log.warning(f"Edge connection error: {e}. Reconnecting in 5s...")
            await asyncio.sleep(5)
async def forward_to_edge(data: dict):
    """Best-effort push of one metrics payload to the edge, if connected.

    Silently no-ops when there is no edge connection; send failures are
    logged but never raised (the uplink task handles reconnection).
    """
    global edge_ws
    if edge_ws is None:
        return
    try:
        await edge_ws.send(json.dumps(data))
        log.debug(f"Forwarded to edge: {data.get('machine_id')}")
    except Exception as e:
        log.warning(f"Failed to forward to edge: {e}")
@app.on_event("startup")
async def startup():
    # Launch the edge uplink as a background task for the app's lifetime.
    # NOTE(review): @app.on_event is deprecated in newer FastAPI versions —
    # consider migrating to a lifespan handler; confirm the pinned version.
    asyncio.create_task(connect_to_edge())
@app.get("/health")
async def health():
    """Liveness endpoint reporting basic fan-in/fan-out counters."""
    report = {"status": "ok"}
    report["machines"] = len(machines)
    report["collectors"] = len(collector_connections)
    # True only while the uplink task holds a live edge connection.
    report["edge_connected"] = edge_ws is not None
    return report
@app.get("/api/machines")
async def get_machines():
    # Expose the latest metrics payload received from each machine.
    return machines
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, key: str = Query(default="")):
    """Accept a collector connection, record its metrics, forward to edge.

    Rejects the handshake with close code 4001 when an API key is
    configured and the ?key= query parameter does not match. Idle
    connections receive a {"type": "ping"} every 30 seconds.
    """
    # Reject before accepting when a key is required but doesn't match.
    if API_KEY and key != API_KEY:
        log.warning(f"Invalid API key from {websocket.client}")
        await websocket.close(code=4001, reason="Invalid API key")
        return
    await websocket.accept()
    collector_connections.append(websocket)
    if websocket.client:
        client = websocket.client.host
    else:
        client = "unknown"
    log.info(f"Collector connected: {client}")
    try:
        while True:
            try:
                raw = await asyncio.wait_for(websocket.receive_text(), timeout=30)
            except asyncio.TimeoutError:
                # Keepalive: nudge the collector so the socket stays open.
                await websocket.send_json({"type": "ping"})
                continue
            payload = json.loads(raw)
            if payload.get("type") != "metrics":
                continue
            machine_id = payload.get("machine_id", "unknown")
            machines[machine_id] = payload
            log.debug(f"Metrics from {machine_id}: cpu={payload.get('cpu')}%")
            # Relay upstream (no-op when the edge is not connected).
            await forward_to_edge(payload)
    except WebSocketDisconnect:
        log.info(f"Collector disconnected: {client}")
    except Exception as e:
        log.error(f"WebSocket error: {e}")
    finally:
        if websocket in collector_connections:
            collector_connections.remove(websocket)
if __name__ == "__main__":
    # Direct execution for local testing; the container runs uvicorn via CMD.
    import uvicorn
    log.info("Starting sysmonstm hub")
    log.info(f" API key: {'configured' if API_KEY else 'not set (open)'}")
    log.info(f" Edge URL: {EDGE_URL or 'not configured (local only)'}")
    uvicorn.run(app, host="0.0.0.0", port=8080)

View File

@@ -1,6 +0,0 @@
FROM python:3.11-slim
WORKDIR /app
RUN pip install --no-cache-dir fastapi uvicorn[standard] websockets
COPY main.py .
EXPOSE 8080
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]