simple is better
This commit is contained in:
104
ctrl/README.md
104
ctrl/README.md
@@ -1,82 +1,72 @@
|
||||
# Deployment Configurations
|
||||
|
||||
This directory contains deployment configurations for sysmonstm.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
┌─────────────┐ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
||||
│ Collector │────▶│ Hub │────▶│ Edge │────▶│ Browser │
|
||||
│ (mcrn) │ │ (local) │ │ (AWS) │ │ │
|
||||
└─────────────┘ └─────────────┘ └─────────────┘ └─────────────┘
|
||||
┌─────────────┐ │
|
||||
│ Collector │────────────┘
|
||||
│ (nfrt) │
|
||||
┌─────────────┐ ┌─────────────────────────────────────┐ ┌─────────────┐
|
||||
│ Collector │────▶│ Aggregator + Gateway + Redis + TS │────▶│ Edge │────▶ Browser
|
||||
│ (mcrn) │gRPC │ (LOCAL) │ WS │ (AWS) │ WS
|
||||
└─────────────┘ └─────────────────────────────────────┘ └─────────────┘
|
||||
┌─────────────┐ │
|
||||
│ Collector │────────────────────┘
|
||||
│ (nfrt) │gRPC
|
||||
└─────────────┘
|
||||
```
|
||||
|
||||
- **Collectors** use gRPC to stream metrics to the local aggregator
|
||||
- **Gateway** forwards to edge via WebSocket (if `EDGE_URL` configured)
|
||||
- **Edge** (AWS) relays to browsers via WebSocket
|
||||
|
||||
## Directory Structure
|
||||
|
||||
```
|
||||
ctrl/
|
||||
├── collector/ # Lightweight agent for each monitored machine
|
||||
├── hub/ # Local aggregator (receives from collectors, forwards to edge)
|
||||
├── edge/ # Cloud dashboard (public-facing, receives from hub)
|
||||
└── dev/ # Full gRPC stack for development
|
||||
├── dev/ # Full stack for local development (docker-compose)
|
||||
└── edge/ # Cloud dashboard for AWS deployment
|
||||
```
|
||||
|
||||
## Production Deployment (3-tier)
|
||||
|
||||
### 1. Edge (AWS)
|
||||
Public-facing dashboard that receives metrics from hub.
|
||||
|
||||
```bash
|
||||
cd ctrl/edge
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
### 2. Hub (Local Server)
|
||||
Runs on your local network, receives from collectors, forwards to edge.
|
||||
|
||||
```bash
|
||||
cd ctrl/hub
|
||||
EDGE_URL=wss://sysmonstm.mcrn.ar/ws EDGE_API_KEY=xxx docker compose up -d
|
||||
```
|
||||
|
||||
### 3. Collectors (Each Machine)
|
||||
Run on each machine you want to monitor.
|
||||
|
||||
```bash
|
||||
docker run -d --name sysmonstm-collector --network host \
|
||||
-e HUB_URL=ws://hub-machine:8080/ws \
|
||||
-e MACHINE_ID=$(hostname) \
|
||||
-e API_KEY=xxx \
|
||||
registry.mcrn.ar/sysmonstm/collector:latest
|
||||
```
|
||||
|
||||
## Development (Full Stack)
|
||||
|
||||
For local development with the complete gRPC-based architecture:
|
||||
## Local Development
|
||||
|
||||
```bash
|
||||
# From repo root
|
||||
docker compose up
|
||||
```
|
||||
|
||||
This runs: aggregator, gateway, collector, alerts, redis, timescaledb
|
||||
Runs: aggregator, gateway, collector, alerts, redis, timescaledb
|
||||
|
||||
## Production Deployment
|
||||
|
||||
### 1. Deploy Edge to AWS
|
||||
|
||||
```bash
|
||||
cd ctrl/edge
|
||||
docker compose up -d
|
||||
```
|
||||
|
||||
### 2. Run Full Stack Locally with Edge Forwarding
|
||||
|
||||
```bash
|
||||
EDGE_URL=wss://sysmonstm.mcrn.ar/ws EDGE_API_KEY=xxx docker compose up
|
||||
```
|
||||
|
||||
### 3. Run Collectors on Other Machines
|
||||
|
||||
```bash
|
||||
docker run -d --name sysmonstm-collector --network host \
|
||||
-e AGGREGATOR_URL=<local-aggregator-ip>:50051 \
|
||||
-e MACHINE_ID=$(hostname) \
|
||||
registry.mcrn.ar/sysmonstm/collector:latest
|
||||
```
|
||||
|
||||
## Environment Variables
|
||||
|
||||
### Collector
|
||||
- `HUB_URL` - WebSocket URL of hub (default: ws://localhost:8080/ws)
|
||||
- `MACHINE_ID` - Identifier for this machine (default: hostname)
|
||||
- `API_KEY` - Authentication key
|
||||
- `INTERVAL` - Seconds between collections (default: 5)
|
||||
|
||||
### Hub
|
||||
- `API_KEY` - Key required from collectors
|
||||
- `EDGE_URL` - WebSocket URL of edge (optional, for forwarding)
|
||||
- `EDGE_API_KEY` - Key for authenticating to edge
|
||||
### Gateway (for edge forwarding)
|
||||
- `EDGE_URL` - WebSocket URL of edge (e.g., wss://sysmonstm.mcrn.ar/ws)
|
||||
- `EDGE_API_KEY` - Authentication key for edge
|
||||
|
||||
### Edge
|
||||
- `API_KEY` - Key required from hub
|
||||
- `API_KEY` - Key required from gateway
|
||||
|
||||
### Collector
|
||||
- `AGGREGATOR_URL` - gRPC URL of aggregator (e.g., localhost:50051)
|
||||
- `MACHINE_ID` - Identifier for this machine
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
# Image for the lightweight WebSocket metrics collector.
FROM python:3.11-slim

WORKDIR /app

RUN pip install --no-cache-dir psutil websockets

COPY collector.py .

# Default environment variables.
# MACHINE_ID deliberately has no default here: exporting it as an empty
# string would override collector.py's fallback to the machine hostname.
ENV HUB_URL=ws://localhost:8080/ws
ENV API_KEY=""
ENV INTERVAL=5
ENV LOG_LEVEL=INFO

CMD ["python", "collector.py"]
|
||||
@@ -1,136 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Lightweight WebSocket metrics collector for sysmonstm standalone deployment."""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import socket
|
||||
import time
|
||||
|
||||
import psutil
|
||||
|
||||
# Configuration from environment (read once, at import time).
HUB_URL = os.environ.get("HUB_URL", "ws://localhost:8080/ws")
# ``or`` rather than a ``.get`` default: a variable that is set but empty
# (e.g. exported as "" by a container image or compose file) must fall back
# to the hostname as well, not produce an empty machine id.
MACHINE_ID = os.environ.get("MACHINE_ID") or socket.gethostname()
API_KEY = os.environ.get("API_KEY", "")
# Same reasoning: an empty INTERVAL would otherwise crash in int("").
INTERVAL = int(os.environ.get("INTERVAL") or "5")
LOG_LEVEL = os.environ.get("LOG_LEVEL", "INFO")

# Logging setup; an unrecognised LOG_LEVEL name falls back to INFO.
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger("collector")
|
||||
|
||||
|
||||
def collect_metrics() -> dict:
    """Collect a snapshot of system metrics using psutil.

    Every probe is best-effort: when a probe fails its keys are simply left
    out of the snapshot, and the failure is recorded at DEBUG level instead
    of being swallowed silently.

    Returns:
        A dict that always contains ``type``, ``machine_id``, ``hostname``
        and ``timestamp``, plus whichever of ``cpu``, ``memory*``,
        ``disk*``, ``load_*``, ``connections`` and ``processes`` could be
        read.
    """
    gib = 1024**3

    metrics = {
        "type": "metrics",
        "machine_id": MACHINE_ID,
        "hostname": socket.gethostname(),
        "timestamp": time.time(),
    }

    # CPU
    try:
        metrics["cpu"] = psutil.cpu_percent(interval=None)
    except Exception as exc:
        log.debug("CPU probe failed: %s", exc)

    # Memory
    try:
        mem = psutil.virtual_memory()
        metrics["memory"] = mem.percent
        metrics["memory_used_gb"] = round(mem.used / gib, 2)
        metrics["memory_total_gb"] = round(mem.total / gib, 2)
    except Exception as exc:
        log.debug("Memory probe failed: %s", exc)

    # Disk (root filesystem only)
    try:
        disk = psutil.disk_usage("/")
        metrics["disk"] = disk.percent
        metrics["disk_used_gb"] = round(disk.used / gib, 2)
        metrics["disk_total_gb"] = round(disk.total / gib, 2)
    except Exception as exc:
        log.debug("Disk probe failed: %s", exc)

    # Load average (Unix only)
    try:
        load1, load5, load15 = psutil.getloadavg()
        metrics["load_1m"] = round(load1, 2)
        metrics["load_5m"] = round(load5, 2)
        metrics["load_15m"] = round(load15, 2)
    except (AttributeError, OSError) as exc:
        log.debug("Load average unavailable: %s", exc)

    # Network connections count
    try:
        metrics["connections"] = len(psutil.net_connections(kind="inet"))
    except (psutil.AccessDenied, PermissionError) as exc:
        log.debug("Connection count unavailable: %s", exc)

    # Process count
    try:
        metrics["processes"] = len(psutil.pids())
    except Exception as exc:
        log.debug("Process count probe failed: %s", exc)

    return metrics
|
||||
|
||||
|
||||
async def run_collector():
    """Main collector loop with auto-reconnect.

    Connects to ``HUB_URL`` (appending the API key as a ``key`` query
    parameter when one is configured) and sends one JSON metrics snapshot
    every ``INTERVAL`` seconds. Any error is logged and the connection is
    retried after 5 seconds; cancellation ends the loop.
    """
    from urllib.parse import quote

    import websockets

    # Build URL with API key if provided. The key is percent-encoded so that
    # characters such as '&', '#' or '+' cannot corrupt the query string.
    url = HUB_URL
    if API_KEY:
        separator = "&" if "?" in url else "?"
        url = f"{url}{separator}key={quote(API_KEY, safe='')}"

    # Prime CPU percent (first call always returns 0)
    psutil.cpu_percent(interval=None)

    while True:
        try:
            # Log the bare HUB_URL, not ``url``, so the key never hits the logs.
            log.info(f"Connecting to {HUB_URL}...")
            async with websockets.connect(url) as ws:
                log.info(
                    f"Connected. Sending metrics every {INTERVAL}s as '{MACHINE_ID}'"
                )

                while True:
                    metrics = collect_metrics()
                    await ws.send(json.dumps(metrics))
                    # Lazy %-formatting: nothing is built unless DEBUG is on.
                    log.debug(
                        "Sent: cpu=%s%% mem=%s%% disk=%s%%",
                        metrics.get("cpu", "?"),
                        metrics.get("memory", "?"),
                        metrics.get("disk", "?"),
                    )
                    await asyncio.sleep(INTERVAL)

        except asyncio.CancelledError:
            log.info("Collector stopped")
            break
        except Exception as e:
            log.warning(f"Connection error: {e}. Reconnecting in 5s...")
            await asyncio.sleep(5)
|
||||
|
||||
|
||||
def main():
    """Log the startup banner, then run the collector until interrupted."""
    banner = (
        "sysmonstm collector starting",
        f" Hub: {HUB_URL}",
        f" Machine: {MACHINE_ID}",
        f" Interval: {INTERVAL}s",
    )
    for line in banner:
        log.info(line)

    try:
        asyncio.run(run_collector())
    except KeyboardInterrupt:
        # Ctrl-C is the expected way to stop; exit without a traceback.
        log.info("Stopped")


if __name__ == "__main__":
    main()
|
||||
@@ -1,16 +0,0 @@
|
||||
# Image for the local hub (FastAPI app served by uvicorn on port 8080).
FROM python:3.11-slim

WORKDIR /app

# The extras specifier is quoted so the shell cannot treat "[standard]" as a
# glob character class.
RUN pip install --no-cache-dir fastapi "uvicorn[standard]" websockets

COPY hub.py .

ENV API_KEY=""
ENV EDGE_URL=""
ENV EDGE_API_KEY=""
ENV LOG_LEVEL=INFO

EXPOSE 8080

CMD ["uvicorn", "hub:app", "--host", "0.0.0.0", "--port", "8080"]
|
||||
@@ -1,12 +0,0 @@
|
||||
services:
|
||||
hub:
|
||||
build: .
|
||||
container_name: sysmonstm-hub
|
||||
restart: unless-stopped
|
||||
environment:
|
||||
- API_KEY=${API_KEY:-}
|
||||
- EDGE_URL=${EDGE_URL:-}
|
||||
- EDGE_API_KEY=${EDGE_API_KEY:-}
|
||||
- LOG_LEVEL=${LOG_LEVEL:-INFO}
|
||||
ports:
|
||||
- "8080:8080"
|
||||
151
ctrl/hub/hub.py
151
ctrl/hub/hub.py
@@ -1,151 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
sysmonstm hub - Local aggregator that receives from collectors and forwards to edge.
|
||||
|
||||
Runs on the local network, receives metrics from collectors via WebSocket,
|
||||
and forwards them to the cloud edge.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
|
||||
from fastapi import FastAPI, Query, WebSocket, WebSocketDisconnect
|
||||
|
||||
# Configuration, read once from the environment at import time.
API_KEY = os.getenv("API_KEY", "")
EDGE_URL = os.getenv("EDGE_URL", "")  # e.g., wss://sysmonstm.mcrn.ar/ws
EDGE_API_KEY = os.getenv("EDGE_API_KEY", "")
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO")

# Logging setup; an unrecognised LOG_LEVEL name falls back to INFO.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=getattr(logging, LOG_LEVEL.upper(), logging.INFO),
)
log = logging.getLogger("hub")

app = FastAPI(title="sysmonstm-hub")

# Shared in-process state
collector_connections: list[WebSocket] = []  # accepted collector sockets
machines: dict = {}  # machine_id -> most recent metrics message
edge_ws = None  # live edge connection, or None while disconnected
|
||||
|
||||
|
||||
async def connect_to_edge():
    """Maintain a persistent connection to the edge.

    Returns immediately when ``EDGE_URL`` is empty. Otherwise connects in a
    loop (appending ``EDGE_API_KEY`` as a ``key`` query parameter when set),
    publishes the live socket in the module-level ``edge_ws`` and reconnects
    5 seconds after any error. Messages coming from the edge are read and
    discarded; after 30 idle seconds a ping is sent. Cancellation ends the
    loop.
    """
    global edge_ws

    if not EDGE_URL:
        log.info("No EDGE_URL configured, running in local-only mode")
        return

    from urllib.parse import quote

    import websockets

    # Percent-encode the key so characters such as '&', '#' or '+' cannot
    # corrupt the query string.
    url = EDGE_URL
    if EDGE_API_KEY:
        separator = "&" if "?" in url else "?"
        url = f"{url}{separator}key={quote(EDGE_API_KEY, safe='')}"

    while True:
        try:
            # Log the bare EDGE_URL, not ``url``, so the key never hits the logs.
            log.info(f"Connecting to edge: {EDGE_URL}")
            async with websockets.connect(url) as ws:
                edge_ws = ws
                log.info("Connected to edge")

                while True:
                    try:
                        # Ignore messages from edge (pings, etc)
                        await asyncio.wait_for(ws.recv(), timeout=30)
                    except asyncio.TimeoutError:
                        await ws.ping()

        except asyncio.CancelledError:
            break
        except Exception as e:
            log.warning(f"Edge connection error: {e}. Reconnecting in 5s...")
            await asyncio.sleep(5)
        finally:
            # Whatever ended the connection (error or cancellation), never
            # leave a closed socket published for forwarding.
            edge_ws = None
|
||||
|
||||
|
||||
async def forward_to_edge(data: dict):
    """Send ``data`` as JSON over the edge socket, if one is connected.

    Does nothing while ``edge_ws`` is unset. Send failures are logged as
    warnings and not raised to the caller.
    """
    if not edge_ws:
        return

    try:
        payload = json.dumps(data)
        await edge_ws.send(payload)
        log.debug(f"Forwarded to edge: {data.get('machine_id')}")
    except Exception as exc:
        log.warning(f"Failed to forward to edge: {exc}")
|
||||
|
||||
|
||||
# Strong references to background tasks. The event loop itself only keeps
# weak references, so a task whose handle is dropped can be garbage-collected
# before it finishes.
_background_tasks: set = set()


@app.on_event("startup")
async def startup():
    """Start the edge-connection loop as a background task on app startup."""
    task = asyncio.create_task(connect_to_edge())
    _background_tasks.add(task)
    # Drop the reference once the task ends so the set does not grow.
    task.add_done_callback(_background_tasks.discard)
|
||||
|
||||
|
||||
@app.get("/health")
async def health():
    """Report status plus machine, collector and edge-connection state."""
    edge_connected = edge_ws is not None
    return dict(
        status="ok",
        machines=len(machines),
        collectors=len(collector_connections),
        edge_connected=edge_connected,
    )
|
||||
|
||||
|
||||
@app.get("/api/machines")
async def get_machines():
    """Return the most recent metrics message stored per machine id."""
    latest_by_machine = machines
    return latest_by_machine
|
||||
|
||||
|
||||
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket, key: str = Query(default="")):
    """Receive metrics from one collector and forward them to the edge.

    When ``API_KEY`` is configured, the ``key`` query parameter must match
    it or the socket is closed with code 4001. Each ``metrics`` message
    replaces the stored entry for its ``machine_id`` and is passed to
    ``forward_to_edge``. After 30 seconds without a message a
    ``{"type": "ping"}`` frame is sent. Frames that are not a JSON object
    are logged and skipped rather than ending the connection.
    """
    from secrets import compare_digest

    # Validate API key. compare_digest is constant-time, so response timing
    # does not leak how much of a guessed key was right; both sides are
    # encoded to bytes because it rejects non-ASCII str arguments.
    if API_KEY and not compare_digest(key.encode(), API_KEY.encode()):
        log.warning(f"Invalid API key from {websocket.client}")
        await websocket.close(code=4001, reason="Invalid API key")
        return

    await websocket.accept()
    collector_connections.append(websocket)
    client = websocket.client.host if websocket.client else "unknown"
    log.info(f"Collector connected: {client}")

    try:
        while True:
            try:
                msg = await asyncio.wait_for(websocket.receive_text(), timeout=30)
            except asyncio.TimeoutError:
                await websocket.send_json({"type": "ping"})
                continue

            # One bad frame should not tear down the collector link.
            try:
                data = json.loads(msg)
            except json.JSONDecodeError:
                log.warning(f"Ignoring malformed JSON from {client}")
                continue
            if not isinstance(data, dict):
                log.warning(f"Ignoring non-object message from {client}")
                continue

            if data.get("type") == "metrics":
                machine_id = data.get("machine_id", "unknown")
                machines[machine_id] = data
                log.debug(f"Metrics from {machine_id}: cpu={data.get('cpu')}%")

                # Forward to edge
                await forward_to_edge(data)

    except WebSocketDisconnect:
        log.info(f"Collector disconnected: {client}")
    except Exception as e:
        log.error(f"WebSocket error: {e}")
    finally:
        if websocket in collector_connections:
            collector_connections.remove(websocket)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import uvicorn

    # Describe the effective configuration without printing the key itself.
    api_key_state = "configured" if API_KEY else "not set (open)"
    edge_target = EDGE_URL or "not configured (local only)"

    log.info("Starting sysmonstm hub")
    log.info(f" API key: {api_key_state}")
    log.info(f" Edge URL: {edge_target}")
    uvicorn.run(app, host="0.0.0.0", port=8080)
|
||||
Reference in New Issue
Block a user