diff --git a/docs/architecture/01-system-overview.dot b/docs/architecture/01-system-overview.dot index c9bc4b0..6f6f56c 100644 --- a/docs/architecture/01-system-overview.dot +++ b/docs/architecture/01-system-overview.dot @@ -24,9 +24,19 @@ digraph SystemOverview { machines [label="Monitored\nMachines", fillcolor="#FFF3E0", shape=box3d]; } + // Edge (AWS) + subgraph cluster_edge { + label="AWS (sysmonstm.mcrn.ar)"; + style=filled; + color="#F3E5F5"; + fillcolor="#F3E5F5"; + + edge_relay [label="Edge\n(WebSocket Relay)", fillcolor="#E1BEE7"]; + } + // Core Services subgraph cluster_services { - label="Application Services"; + label="Local Stack"; style=filled; color="#E8F5E9"; fillcolor="#E8F5E9"; @@ -59,7 +69,10 @@ digraph SystemOverview { } // Connections - browser -> gateway [label="WebSocket\nREST", color="#1976D2"]; + browser -> edge_relay [label="WebSocket", color="#1976D2"]; + edge_relay -> gateway [label="WebSocket\nForward", color="#1976D2", dir=back]; + + browser -> gateway [label="WebSocket\n(local dev)", color="#1976D2", style=dashed]; gateway -> aggregator [label="gRPC", color="#388E3C"]; gateway -> redis [label="State\nQuery", style=dashed]; gateway -> timescale [label="Historical\nQuery", style=dashed]; @@ -73,6 +86,4 @@ digraph SystemOverview { events -> alerts [label="Subscribe", color="#7B1FA2"]; events -> gateway [label="Subscribe", color="#7B1FA2"]; - - alerts -> timescale [label="Store\nAlerts", style=dashed]; } diff --git a/docs/architecture/01-system-overview.svg b/docs/architecture/01-system-overview.svg index aefe71c..424bd8d 100644 --- a/docs/architecture/01-system-overview.svg +++ b/docs/architecture/01-system-overview.svg @@ -1,193 +1,212 @@ - - - + + SystemOverview - -System Monitoring Platform - Architecture Overview + +System Monitoring Platform - Architecture Overview cluster_external - -External + +External -cluster_services - -Application Services +cluster_edge + +AWS (sysmonstm.mcrn.ar) -cluster_data - -Data Layer +cluster_services + +Local Stack +cluster_data + +Data Layer + + cluster_events - -Event Stream + +Event Stream browser - -Browser -(Dashboard) + +Browser +(Dashboard) + + + +edge_relay + +Edge +(WebSocket Relay) + + + +browser->edge_relay + + +WebSocket - + gateway - -Gateway -(FastAPI) + +Gateway +(FastAPI) - + browser->gateway - - -WebSocket -REST + + +WebSocket +(local dev) machines - - - - -Monitored -Machines + + + + +Monitored +Machines - + collector - -Collector -(gRPC Client) + +Collector +(gRPC Client) - + machines->collector - - -psutil + + +psutil + + + +edge_relay->gateway + + +WebSocket +Forward - + aggregator - -Aggregator -(gRPC Server) + +Aggregator +(gRPC Server) - + gateway->aggregator - - -gRPC + + +gRPC - + redis - - -Redis -(Pub/Sub + State) + + +Redis +(Pub/Sub + State) - + gateway->redis - - -State -Query + + +State +Query - + timescale - - -TimescaleDB -(Time-series) + + +TimescaleDB +(Time-series) - + gateway->timescale - - -Historical -Query + + +Historical +Query - + aggregator->redis - - -Current -State + + +Current +State - + aggregator->timescale - - -Store -Metrics + + +Store +Metrics - + events - -Redis Pub/Sub -(Events) + +Redis Pub/Sub +(Events) - + aggregator->events - - -Publish + + +Publish - + alerts - -Alerts -Service - - - -alerts->timescale - - -Store -Alerts + +Alerts +Service - + collector->aggregator - - -gRPC -Stream + + +gRPC +Stream - + events->gateway - - -Subscribe + + +Subscribe - + events->alerts - - -Subscribe + + +Subscribe diff --git a/docs/architecture/02-data-flow.dot b/docs/architecture/02-data-flow.dot index ac77851..dc88712 100644 --- a/docs/architecture/02-data-flow.dot +++ b/docs/architecture/02-data-flow.dot @@ -60,7 +60,16 @@ digraph DataFlow { alerts [label="Alert\nService", fillcolor="#C5CAE9"]; gateway [label="Gateway\n(WebSocket)", fillcolor="#9FA8DA"]; - lambda [label="Lambda\nAggregator", fillcolor="#7986CB", style="rounded,filled,dashed"]; + } + + // Edge + Browser + subgraph cluster_delivery { + label="Delivery (AWS)"; + style=filled; + fillcolor="#F3E5F5"; + + edge_relay [label="Edge\n(WS Relay)", fillcolor="#E1BEE7"]; + browser [label="Browser\n(Dashboard)", fillcolor="#CE93D8"]; } // Flow @@ -75,9 +84,9 @@ digraph DataFlow { redis_pubsub -> alerts [label="metrics.*"]; redis_pubsub -> gateway [label="metrics.*"]; + gateway -> edge_relay [label="WebSocket\nForward"]; + edge_relay -> browser [label="WebSocket"]; + raw -> agg_1m [label="Continuous\nAggregate", style=dashed]; agg_1m -> agg_1h [label="Hourly\nJob", style=dashed]; - - raw -> lambda [label="SQS\nTrigger", style=dotted]; - lambda -> agg_1m [label="Batch\nWrite", style=dotted]; } diff --git a/docs/architecture/02-data-flow.svg b/docs/architecture/02-data-flow.svg index 5735a45..da799fc 100644 --- a/docs/architecture/02-data-flow.svg +++ b/docs/architecture/02-data-flow.svg @@ -1,134 +1,139 @@ - - - + + DataFlow - -Metrics Data Flow Pipeline + +Metrics Data Flow Pipeline cluster_collect - -Collection (5s) + +Collection (5s) cluster_ingest - -Ingestion + +Ingestion cluster_hot - -Hot Path (Real-time) + +Hot Path (Real-time) cluster_warm - -Warm Path (Historical) + +Warm Path (Historical) cluster_consume - -Consumers + +Consumers + + +cluster_delivery + +Delivery (AWS) psutil - - + -psutil -(CPU, Mem, Disk) + +psutil +(CPU, Mem, Disk) collector - -Collector -Service + +Collector +Service psutil->collector - - -Metrics + + +Metrics aggregator - -Aggregator -(gRPC) + +Aggregator +(gRPC) collector->aggregator - - -gRPC -Stream + + +gRPC +Stream validate - -Validate & -Normalize + +Validate & +Normalize aggregator->validate - - + + redis_state - - -Redis -Current State + + +Redis +Current State validate->redis_state - - -Upsert + + +Upsert redis_pubsub - -Redis -Pub/Sub + +Redis +Pub/Sub validate->redis_pubsub - - -Publish + + +Publish raw - - -metrics_raw -(5s, 24h) + + +metrics_raw +(5s, 24h) validate->raw - - -Insert + + +Insert @@ -140,9 +145,9 @@ redis_pubsub->alerts - - -metrics.* + + +metrics.* @@ -154,64 +159,70 @@ redis_pubsub->gateway - - -metrics.* + + +metrics.* agg_1m - - -metrics_1m -(1m, 7d) + + +metrics_1m +(1m, 7d) - -raw->agg_1m - - -Continuous -Aggregate - - - -lambda - -Lambda -Aggregator - - -raw->lambda - - -SQS -Trigger +raw->agg_1m + + +Continuous +Aggregate agg_1h - - -metrics_1h -(1h, 90d) + + +metrics_1h +(1h, 90d) - -agg_1m->agg_1h - - -Hourly -Job - - -lambda->agg_1m - - -Batch -Write +agg_1m->agg_1h + + +Hourly +Job + + + +edge_relay + +Edge +(WS Relay) + + + +gateway->edge_relay + + +WebSocket +Forward + + + +browser + +Browser +(Dashboard) + + + +edge_relay->browser + + +WebSocket diff --git a/docs/architecture/03-deployment.dot b/docs/architecture/03-deployment.dot index fe3b29d..775726f 100644 --- a/docs/architecture/03-deployment.dot +++ b/docs/architecture/03-deployment.dot @@ -11,60 +11,36 @@ digraph Deployment { node [shape=box, style="rounded,filled"]; - // Local Development + // Local Stack subgraph cluster_local { - label="Local Development"; - style=filled; - fillcolor="#E3F2FD"; - - subgraph cluster_kind { - label="Kind Cluster"; - style=filled; - fillcolor="#BBDEFB"; - - tilt [label="Tilt\n(Live Reload)", shape=component, fillcolor="#90CAF9"]; - k8s_local [label="K8s Pods\n(via Kustomize)", fillcolor="#64B5F6"]; - } - - compose [label="Docker Compose\n(Alternative)", fillcolor="#90CAF9", style="rounded,dashed"]; - } - - // AWS Staging/Demo - subgraph cluster_aws { - label="AWS (sysmonstm.mcrn.ar)"; + label="Local Stack (Docker Compose)"; style=filled; fillcolor="#E8F5E9"; - subgraph cluster_ec2 { - label="EC2 t2.small"; - style=filled; - fillcolor="#C8E6C9"; + aggregator [label="Aggregator\n(gRPC Server)", fillcolor="#A5D6A7"]; + gateway [label="Gateway\n(FastAPI)", fillcolor="#A5D6A7"]; + alerts [label="Alerts\nService", fillcolor="#A5D6A7"]; + redis [label="Redis", shape=cylinder, fillcolor="#C8E6C9"]; + timescaledb [label="TimescaleDB", shape=cylinder, fillcolor="#C8E6C9"]; + } - compose_ec2 [label="Docker Compose\n(All Services)", fillcolor="#A5D6A7"]; - nginx [label="Nginx\n(SSL Termination)", fillcolor="#81C784"]; - } + // AWS Edge + subgraph cluster_aws { + label="AWS (sysmonstm.mcrn.ar)"; + style=filled; + fillcolor="#F3E5F5"; - subgraph cluster_lambda { - label="Lambda (Data Processing)"; - style=filled; - fillcolor="#DCEDC8"; - - lambda_agg [label="Aggregator\nLambda", fillcolor="#AED581"]; - lambda_compact [label="Compactor\nLambda", fillcolor="#9CCC65"]; - } - - sqs [label="SQS\n(Buffer)", shape=hexagon, fillcolor="#FFE082"]; - s3 [label="S3\n(Backup)", shape=cylinder, fillcolor="#FFE082"]; + edge_relay [label="Edge\n(WebSocket Relay)", fillcolor="#CE93D8"]; } // CI/CD subgraph cluster_cicd { label="CI/CD"; style=filled; - fillcolor="#F3E5F5"; + fillcolor="#E3F2FD"; - woodpecker [label="Woodpecker CI", fillcolor="#CE93D8"]; - registry [label="Container\nRegistry", shape=cylinder, fillcolor="#BA68C8"]; + woodpecker [label="Woodpecker CI", fillcolor="#90CAF9"]; + registry [label="Container\nRegistry", shape=cylinder, fillcolor="#64B5F6"]; } // Collectors (External) @@ -78,18 +54,22 @@ digraph Deployment { coll3 [label="Collector\n(Machine N)", fillcolor="#FFCCBC"]; } + // Browser + browser [label="Browser\n(Dashboard)", fillcolor="#FFF3E0"]; + // Connections - tilt -> k8s_local [style=invis]; + coll1 -> aggregator [label="gRPC"]; + coll2 -> aggregator [label="gRPC"]; + coll3 -> aggregator [label="gRPC"]; + + aggregator -> redis [label="State"]; + aggregator -> timescaledb [label="Store"]; + gateway -> aggregator [label="gRPC"]; + gateway -> edge_relay [label="WebSocket\nForward"]; + + edge_relay -> browser [label="WebSocket", dir=both]; + woodpecker -> registry [label="Push"]; - registry -> compose_ec2 [label="Pull"]; - registry -> k8s_local [label="Pull", style=dashed]; - - nginx -> compose_ec2 [label="Proxy"]; - compose_ec2 -> sqs [label="Events"]; - sqs -> lambda_agg [label="Trigger"]; - lambda_compact -> s3 [label="Archive"]; - - coll1 -> compose_ec2 [label="gRPC", lhead=cluster_ec2]; - coll2 -> compose_ec2 [label="gRPC", lhead=cluster_ec2]; - coll3 -> compose_ec2 [label="gRPC", lhead=cluster_ec2]; + registry -> edge_relay [label="Pull", style=dashed]; + registry -> aggregator [label="Pull", style=dashed, lhead=cluster_local]; } diff --git a/docs/architecture/03-deployment.svg b/docs/architecture/03-deployment.svg index cc1cf45..d4afb16 100644 --- a/docs/architecture/03-deployment.svg +++ b/docs/architecture/03-deployment.svg @@ -1,221 +1,197 @@ - - - + + Deployment - -Deployment Architecture + +Deployment Architecture cluster_local - -Local Development + +Local Stack (Docker Compose) -cluster_kind - -Kind Cluster +cluster_aws + +AWS (sysmonstm.mcrn.ar) -cluster_aws - -AWS (sysmonstm.mcrn.ar) +cluster_cicd + +CI/CD -cluster_ec2 - -EC2 t2.small - - -cluster_lambda - -Lambda (Data Processing) - - -cluster_cicd - -CI/CD - - cluster_collectors - -Monitored Machines + +Monitored Machines - + -tilt - - - -Tilt -(Live Reload) +aggregator + +Aggregator +(gRPC Server) - - -k8s_local - -K8s Pods -(via Kustomize) - - - - -compose - -Docker Compose -(Alternative) - - + -compose_ec2 - -Docker Compose -(All Services) +redis + + +Redis - - -sqs - -SQS -(Buffer) + + +aggregator->redis + + +State - - -compose_ec2->sqs - - -Events - - + -nginx - -Nginx -(SSL Termination) +timescaledb + + +TimescaleDB - + -nginx->compose_ec2 - - -Proxy +aggregator->timescaledb + + +Store - + + +gateway + +Gateway +(FastAPI) + + + +gateway->aggregator + + +gRPC + + -lambda_agg - -Aggregator -Lambda +edge_relay + +Edge +(WebSocket Relay) - - -lambda_compact - -Compactor -Lambda - - - -s3 - - -S3 -(Backup) - - - -lambda_compact->s3 - - -Archive - - + -sqs->lambda_agg - - -Trigger +gateway->edge_relay + + +WebSocket +Forward + + + +alerts + +Alerts +Service + + + +browser + +Browser +(Dashboard) + + + +edge_relay->browser + + + +WebSocket - + woodpecker - -Woodpecker CI + +Woodpecker CI - + registry - - -Container -Registry + + +Container +Registry - + woodpecker->registry - - -Push + + +Push - - -registry->k8s_local - - -Pull + + +registry->aggregator + + +Pull - - -registry->compose_ec2 - - -Pull + + +registry->edge_relay + + +Pull - + coll1 - -Collector -(Machine 1) + +Collector +(Machine 1) - - -coll1->compose_ec2 - - -gRPC + + +coll1->aggregator + + +gRPC - + coll2 - -Collector -(Machine 2) + +Collector +(Machine 2) - - -coll2->compose_ec2 - - -gRPC + + +coll2->aggregator + + +gRPC - + coll3 - -Collector -(Machine N) + +Collector +(Machine N) - - -coll3->compose_ec2 - - -gRPC + + +coll3->aggregator + + +gRPC diff --git a/docs/architecture/styles.css b/docs/architecture/styles.css index 4f251b9..558f464 100644 --- a/docs/architecture/styles.css +++ b/docs/architecture/styles.css @@ -86,8 +86,7 @@ main { border-radius: 4px; padding: 1rem; margin-bottom: 1rem; - overflow: auto; - max-height: 400px; + overflow: visible; } .graph-preview img { diff --git a/docs/explainer/sysmonstm-from-start-to-finish.md b/docs/explainer/sysmonstm-from-start-to-finish.md index 9f08b9c..9e1ab2f 100644 --- a/docs/explainer/sysmonstm-from-start-to-finish.md +++ b/docs/explainer/sysmonstm-from-start-to-finish.md @@ -231,6 +231,31 @@ machine_metrics_cache[machine_id].update(incoming_metrics) New metrics merge with existing. The broadcast includes the full merged state. +### Edge Relay - Public Dashboard Without the Cost + +The full stack (aggregator, Redis, TimescaleDB) runs on local hardware. But the dashboard needs to be publicly accessible at `sysmonstm.mcrn.ar`. Running the full stack on AWS would be expensive and unnecessary. + +The solution is an edge relay (`ctrl/edge/edge.py`). It's a minimal FastAPI app that does one thing: relay WebSocket messages. The gateway forwards metrics to the edge via WebSocket, and the edge broadcasts them to connected browsers: + +```python +# Gateway forwards to edge when EDGE_URL is configured +async def forward_to_edge(data: dict): + if edge_ws: + await edge_ws.send(json.dumps(data)) +``` + +The edge receives these and broadcasts to all dashboard viewers: + +```python +@app.websocket("/ws") +async def dashboard_ws(websocket: WebSocket): + await websocket.accept() + clients.add(websocket) + # ... broadcasts incoming metrics to all clients +``` + +This keeps heavy processing (gRPC, storage, event evaluation) on local hardware and puts only a lightweight relay in the cloud. The AWS instance has no databases, no gRPC, no storage — just WebSocket in, WebSocket out. + ## Phase 3: Alerts - Adding Intelligence The alerts service subscribes to metric events and evaluates them against rules. @@ -402,7 +427,8 @@ Set `COLLECTOR_AGGREGATOR_URL=192.168.1.100:50051` and it overrides the default. | Redis events | `shared/events/redis_pubsub.py` | Redis Pub/Sub implementation | | Configuration | `shared/config.py` | Pydantic settings for all services | | DB initialization | `scripts/init-db.sql` | TimescaleDB schema, hypertables | -| Docker setup | `docker-compose.yml` | Full stack orchestration | +| Edge relay | `ctrl/edge/edge.py` | WebSocket relay for AWS dashboard | +| Docker setup | `ctrl/dev/docker-compose.yml` | Full stack orchestration | ## Running It diff --git a/docs/index.html b/docs/index.html index fb9651e..43088fd 100644 --- a/docs/index.html +++ b/docs/index.html @@ -80,39 +80,6 @@
- -

Explainer Articles

- - - -
-
@@ -155,6 +122,11 @@ Alerts: Subscribes to events, evaluates thresholds, triggers actions +
  • + Edge: Lightweight WebSocket + relay on AWS, serves public dashboard at + sysmonstm.mcrn.ar +
  • @@ -245,16 +217,17 @@

    Environments

    @@ -301,7 +274,7 @@
    -

    Interview Talking Points

    +

    Key Design Decisions

    Domain Mapping

    @@ -366,16 +339,13 @@

    Infrastructure

    • Docker
    • -
    • Kubernetes
    • -
    • Kind + Tilt
    • -
    • Terraform
    • +
    • Docker Compose

    CI/CD

    @@ -386,7 +356,7 @@