first claude draft

This commit is contained in:
buenosairesam
2025-12-29 14:40:06 -03:00
commit 116d4032e2
69 changed files with 5020 additions and 0 deletions

1
.gitignore vendored Normal file
View File

@@ -0,0 +1 @@
def

184
.woodpecker.yml Normal file
View File

@@ -0,0 +1,184 @@
# Woodpecker CI Pipeline
# https://woodpecker-ci.org/docs/usage/pipeline-syntax
# Image references shared by the steps below through the YAML aliases
# *python_image and *docker_image, so each image tag is written only once.
variables:
- &python_image python:3.11-slim
- &docker_image docker:24-dind
# Clone settings
clone:
git:
image: woodpeckerci/plugin-git
settings:
# Passed to the git plugin as its clone depth (presumably a shallow clone of
# the 50 most recent commits — confirm against the plugin docs).
depth: 50
# Pipeline steps
steps:
# ==========================================================================
# Lint and Test
# ==========================================================================
lint:
  # Static checks on services/ and shared/: ruff lint rules, then a
  # formatting check that fails if any file would be reformatted.
  image: *python_image
  commands:
    # Only ruff is invoked in this step, so only ruff is installed
    # (mypy used to be installed here without ever being run).
    - pip install ruff
    - ruff check services/ shared/
    - ruff format --check services/ shared/
  when:
    event: [push, pull_request]
test-shared:
  # Runs the test suite under shared/ on every push and pull request.
  image: *python_image
  commands:
    - pip install pytest pytest-asyncio redis asyncpg
    # Install the optional requirements file only when it exists. A plain
    # "|| true" would also hide genuine install errors (bad pin, network
    # failure), leaving pytest to fail later with a misleading ImportError.
    - |
      if [ -f shared/events/requirements.txt ]; then
        pip install -r shared/events/requirements.txt
      fi
    - pytest shared/ -v --tb=short
  when:
    event: [push, pull_request]
test-services:
  # Installs each service's requirements (when present) and runs the
  # tests under services/.
  image: *python_image
  commands:
    - pip install pytest pytest-asyncio grpcio grpcio-tools
    - |
      for svc in collector aggregator gateway alerts; do
        if [ -f "services/$svc/requirements.txt" ]; then
          pip install -r "services/$svc/requirements.txt"
        fi
      done
    - pytest services/ -v --tb=short
  # The step stays non-blocking, as it was with "|| true", but a failing
  # run is now reported as failed instead of being shown as green.
  failure: ignore
  when:
    event: [push, pull_request]
# ==========================================================================
# Build Docker Images
# ==========================================================================
# One step per service image. Each builds the "production" Dockerfile
# stage from the repo root, tags it with the short commit SHA, then
# re-tags that image as :latest. The Docker socket mount and the
# main-branch push filter are identical for all four steps, so they are
# declared once on the first step and aliased by the others.
build-aggregator:
  image: *docker_image
  commands:
    - docker build -t sysmonstm/aggregator:${CI_COMMIT_SHA:0:7} -f services/aggregator/Dockerfile --target production .
    - docker tag sysmonstm/aggregator:${CI_COMMIT_SHA:0:7} sysmonstm/aggregator:latest
  volumes: &docker_socket
    - /var/run/docker.sock:/var/run/docker.sock
  when: &main_push
    event: push
    branch: main
build-gateway:
  image: *docker_image
  commands:
    - docker build -t sysmonstm/gateway:${CI_COMMIT_SHA:0:7} -f services/gateway/Dockerfile --target production .
    - docker tag sysmonstm/gateway:${CI_COMMIT_SHA:0:7} sysmonstm/gateway:latest
  volumes: *docker_socket
  when: *main_push
build-collector:
  image: *docker_image
  commands:
    - docker build -t sysmonstm/collector:${CI_COMMIT_SHA:0:7} -f services/collector/Dockerfile --target production .
    - docker tag sysmonstm/collector:${CI_COMMIT_SHA:0:7} sysmonstm/collector:latest
  volumes: *docker_socket
  when: *main_push
build-alerts:
  image: *docker_image
  commands:
    - docker build -t sysmonstm/alerts:${CI_COMMIT_SHA:0:7} -f services/alerts/Dockerfile --target production .
    - docker tag sysmonstm/alerts:${CI_COMMIT_SHA:0:7} sysmonstm/alerts:latest
  volumes: *docker_socket
  when: *main_push
# ==========================================================================
# Push to Registry
# ==========================================================================
push-images:
  # Logs in to the registry named by the registry_* secrets and pushes
  # every service image under both its short-SHA tag and :latest.
  image: *docker_image
  commands:
    - echo "$REGISTRY_PASSWORD" | docker login -u "$REGISTRY_USER" --password-stdin "$REGISTRY_URL"
    # The SHA-tagged registry image is derived from the SHA-tagged local
    # image produced by the build-* steps. Deriving it from the mutable
    # :latest tag could publish another commit's image under this
    # commit's SHA, because the Docker socket is shared with the host.
    - |
      for img in aggregator gateway collector alerts; do
        docker tag sysmonstm/$img:${CI_COMMIT_SHA:0:7} $REGISTRY_URL/sysmonstm/$img:${CI_COMMIT_SHA:0:7}
        docker tag sysmonstm/$img:latest $REGISTRY_URL/sysmonstm/$img:latest
        docker push $REGISTRY_URL/sysmonstm/$img:${CI_COMMIT_SHA:0:7}
        docker push $REGISTRY_URL/sysmonstm/$img:latest
      done
  secrets: [registry_user, registry_password, registry_url]
  volumes:
    - /var/run/docker.sock:/var/run/docker.sock
  when:
    event: push
    branch: main
# ==========================================================================
# Deploy to EC2
# ==========================================================================
deploy-staging:
  # Connects over SSH (host, user and key come from secrets) and
  # refreshes the compose stack in the checkout on the target machine.
  image: appleboy/drone-ssh
  settings:
    host:
      from_secret: deploy_host
    username:
      from_secret: deploy_user
    key:
      from_secret: deploy_key
    script:
      - cd /home/ec2-user/sysmonstm
      - git pull origin main
      # "-f docker-compose.yml" keeps the auto-loaded development
      # override (docker-compose.override.yml) out of the staging stack.
      - docker-compose -f docker-compose.yml pull
      # The application services in docker-compose.yml declare "build:"
      # only, so without --build the freshly pulled sources would never
      # be rebuilt and the old containers would keep running.
      - docker-compose -f docker-compose.yml up -d --build --remove-orphans
      - docker system prune -f
  when:
    event: push
    branch: main
# ==========================================================================
# Notifications
# ==========================================================================
notify-success:
  # Posts a JSON message to the webhook stored in the webhook_url secret
  # when a push to main finishes successfully.
  image: plugins/webhook
  settings:
    urls:
      from_secret: webhook_url
    content_type: application/json
    # CI_PIPELINE_NUMBER is the current name of the run counter.
    # CI_BUILD_NUMBER is the legacy name and expands to an empty string
    # on Woodpecker releases that dropped the CI_BUILD_* variables.
    template: |
      {
        "text": "✅ Build succeeded: ${CI_REPO_NAME}#${CI_PIPELINE_NUMBER}",
        "commit": "${CI_COMMIT_SHA:0:7}",
        "branch": "${CI_COMMIT_BRANCH}",
        "author": "${CI_COMMIT_AUTHOR}"
      }
  when:
    status: success
    event: push
    branch: main
notify-failure:
  # Posts a JSON message to the webhook stored in the webhook_url secret
  # when a push to main ends in failure.
  image: plugins/webhook
  settings:
    urls:
      from_secret: webhook_url
    content_type: application/json
    # CI_PIPELINE_NUMBER is the current name of the run counter.
    # CI_BUILD_NUMBER is the legacy name and expands to an empty string
    # on Woodpecker releases that dropped the CI_BUILD_* variables.
    template: |
      {
        "text": "❌ Build failed: ${CI_REPO_NAME}#${CI_PIPELINE_NUMBER}",
        "commit": "${CI_COMMIT_SHA:0:7}",
        "branch": "${CI_COMMIT_BRANCH}",
        "author": "${CI_COMMIT_AUTHOR}"
      }
  when:
    status: failure
    event: push
    branch: main

43
.woodpecker/build.yml Normal file
View File

@@ -0,0 +1,43 @@
# Woodpecker CI - Build Pipeline (runs on main branch pushes)
steps:
build-images:
  # Builds the "production" stage of every service Dockerfile from the
  # repo root, tagged with the short commit SHA. Only after all four
  # builds have succeeded is each image re-tagged as :latest.
  image: docker:24-dind
  commands:
    - echo "=== Building Docker images ==="
    - |
      for svc in aggregator gateway collector alerts; do
        docker build -t sysmonstm/$svc:${CI_COMMIT_SHA:0:7} -f services/$svc/Dockerfile --target production .
      done
    - echo "=== Tagging as latest ==="
    - |
      for svc in aggregator gateway collector alerts; do
        docker tag sysmonstm/$svc:${CI_COMMIT_SHA:0:7} sysmonstm/$svc:latest
      done
  volumes:
    - /var/run/docker.sock:/var/run/docker.sock
push-to-registry:
  # Authenticates with the registry_* secrets, then publishes each
  # service image under its short-SHA tag and under :latest.
  image: docker:24-dind
  commands:
    - echo "=== Logging into registry ==="
    - echo "$REGISTRY_PASSWORD" | docker login -u "$REGISTRY_USER" --password-stdin "$REGISTRY_URL"
    - echo "=== Pushing images ==="
    - |
      for svc in aggregator gateway collector alerts; do
        src=sysmonstm/$svc
        dst=$REGISTRY_URL/sysmonstm/$svc
        docker tag $src:${CI_COMMIT_SHA:0:7} $dst:${CI_COMMIT_SHA:0:7}
        docker tag $src:latest $dst:latest
        docker push $dst:${CI_COMMIT_SHA:0:7}
        docker push $dst:latest
        echo "Pushed $svc"
      done
  secrets: [registry_user, registry_password, registry_url]
  volumes:
    - /var/run/docker.sock:/var/run/docker.sock
# NOTE(review): indentation is not visible in this view. These keys look like
# workflow-level settings (run after the "test" workflow, only for pushes to
# main) rather than part of the step above — confirm against the real file.
depends_on:
- test
when:
event: push
branch: main

61
.woodpecker/deploy.yml Normal file
View File

@@ -0,0 +1,61 @@
# Woodpecker CI - Deploy Pipeline
steps:
deploy-to-staging:
  # Connects over SSH (host, user and key come from secrets), hard-syncs
  # the checkout to origin/main and refreshes the compose stack.
  image: appleboy/drone-ssh
  settings:
    host:
      from_secret: deploy_host
    username:
      from_secret: deploy_user
    key:
      from_secret: deploy_key
    port: 22
    script:
      - echo "=== Deploying to staging ==="
      - cd /home/ec2-user/sysmonstm
      - git fetch origin main
      - git reset --hard origin/main
      - echo "=== Pulling new images ==="
      # "-f docker-compose.yml" keeps the auto-loaded development
      # override (docker-compose.override.yml) out of the staging stack.
      - docker-compose -f docker-compose.yml pull
      - echo "=== Restarting services ==="
      # The application services in docker-compose.yml declare "build:"
      # only, so without --build the updated sources would never be
      # rebuilt and the old containers would keep running.
      - docker-compose -f docker-compose.yml up -d --build --remove-orphans
      - echo "=== Cleaning up ==="
      - docker system prune -f
      - echo "=== Deployment complete ==="
      - docker-compose -f docker-compose.yml ps
health-check:
  # Probes the gateway's /health endpoint on the deploy host and fails
  # the step if it never answers successfully.
  image: curlimages/curl
  commands:
    - echo "=== Checking gateway health ==="
    # Retry instead of sleeping a fixed 10 seconds and probing once: a
    # slow start no longer fails the deploy and a fast start no longer
    # waits. --retry-connrefused covers the window before the port is
    # open. --fail already makes curl exit non-zero on HTTP errors, so a
    # trailing "|| exit 1" is unnecessary.
    - curl --fail --silent --show-error --retry 10 --retry-delay 3 --retry-connrefused "http://$DEPLOY_HOST:8000/health"
    - echo "=== Health check passed ==="
  secrets: [deploy_host]
# Posts a JSON "deployed" message to the webhook stored in the webhook_url
# secret; the step is limited to successful runs by its when/status filter.
notify:
image: plugins/webhook
settings:
urls:
from_secret: webhook_url
content_type: application/json
# NOTE(review): ${CI_COMMIT_MESSAGE} is substituted into the JSON body as-is.
# A commit message containing double quotes or newlines would presumably
# produce an invalid payload — confirm, and escape or drop the field if so.
template: |
{
"text": "🚀 Deployed to staging",
"repo": "${CI_REPO_NAME}",
"commit": "${CI_COMMIT_SHA:0:7}",
"message": "${CI_COMMIT_MESSAGE}",
"author": "${CI_COMMIT_AUTHOR}",
"url": "https://sysmonstm.mcrn.ar"
}
when:
status: success
# NOTE(review): indentation is not visible in this view. The notify step above
# already has its own "when", so these keys are presumably workflow-level:
# run after the "build" workflow, only for pushes to main — confirm.
depends_on:
- build
when:
event: push
branch: main

40
.woodpecker/test.yml Normal file
View File

@@ -0,0 +1,40 @@
# Woodpecker CI - Test Pipeline (runs on PRs and pushes)
# Separate file for cleaner organization
steps:
lint:
  # Static checks on services/ and shared/: ruff lint rules (GitHub
  # annotation output format), then a formatting check.
  image: python:3.11-slim
  commands:
    # Only ruff is invoked in this step. mypy runs in the separate
    # typecheck step, which installs it itself.
    - pip install --quiet ruff
    - echo "=== Linting with ruff ==="
    - ruff check services/ shared/ --output-format=github
    - echo "=== Checking formatting ==="
    - ruff format --check services/ shared/
typecheck:
  # Runs mypy over shared/ with missing third-party stubs ignored.
  image: python:3.11-slim
  commands:
    - pip install --quiet mypy types-redis
    - echo "=== Type checking shared/ ==="
    - mypy shared/ --ignore-missing-imports
  # Type errors remain non-blocking, as they were with "|| true", but the
  # step is now reported as failed instead of being shown as green.
  failure: ignore
unit-tests:
  # Runs pytest over shared/ and services/ with a terminal coverage
  # report that lists uncovered lines.
  image: python:3.11-slim
  commands:
    - pip install --quiet pytest pytest-asyncio pytest-cov
    - pip install --quiet redis asyncpg grpcio grpcio-tools psutil pydantic pydantic-settings structlog
    - echo "=== Running unit tests ==="
    - pytest shared/ services/ -v --tb=short --cov=shared --cov=services --cov-report=term-missing
  # Test failures remain non-blocking, as they were with "|| true", but
  # the step is now reported as failed instead of being shown as green.
  failure: ignore
proto-check:
  # Compiles proto/metrics.proto with grpc_tools.protoc to prove that the
  # definitions are valid. The generated modules are written to /tmp.
  image: python:3.11-slim
  commands:
    - pip install --quiet grpcio-tools
    - echo "=== Validating proto definitions ==="
    # Folded scalar: the lines below are joined with single spaces into
    # one command line.
    - >-
      python -m grpc_tools.protoc
      -I./proto
      --python_out=/tmp
      --grpc_python_out=/tmp
      ./proto/metrics.proto
    - echo "Proto compilation successful"
# NOTE(review): indentation is not visible in this view. These keys are
# presumably workflow-level: no upstream workflows, triggered by pushes and
# pull requests — confirm against the real file.
depends_on: []
when:
event: [push, pull_request]

492
CLAUDE.md Normal file
View File

@@ -0,0 +1,492 @@
# Distributed System Monitoring Platform
## Project Overview
A real-time system monitoring platform that streams metrics from multiple machines to a central hub with live web dashboard. Built to demonstrate production microservices patterns (gRPC, FastAPI, streaming, event-driven architecture) while solving a real problem: monitoring development infrastructure across multiple machines.
**Primary Goal:** Interview demonstration project for Python Microservices Engineer position
**Secondary Goal:** Actually useful tool for managing multi-machine development environment
**Time Investment:** Phased approach - MVP in a weekend, polish over 2-3 weeks
## Why This Project
**Interview Alignment:**
- Demonstrates gRPC-based microservices architecture (core requirement)
- Shows streaming patterns (server-side and bidirectional)
- Real-time data aggregation and processing
- Alert/threshold monitoring (maps to fraud detection)
- Event-driven patterns
- Multiple data sources requiring normalization (maps to multiple payment processors)
**Personal Utility:**
- Monitors existing multi-machine dev setup
- Dashboard stays open, provides real value
- Solves actual pain point
- Will continue running post-interview
**Domain Mapping for Interview:**
- Machine = Payment Processor
- Metrics Stream = Transaction Stream
- Resource Thresholds = Fraud/Limit Detection
- Alert System = Risk Management
- Aggregation Service = Payment Processing Hub
## Technical Stack
### Core Technologies (Must Use - From JD)
- **Python 3.11+** - Primary language
- **FastAPI** - Web gateway, REST endpoints, WebSocket streaming
- **gRPC** - Inter-service communication, metric streaming
- **PostgreSQL/TimescaleDB** - Time-series historical data
- **Redis** - Current state, caching, alert rules
- **Docker Compose** - Orchestration
### Supporting Technologies
- **Protocol Buffers** - gRPC message definitions
- **WebSockets** - Browser streaming
- **htmx + Alpine.js** - Lightweight reactive frontend (avoid heavy SPA)
- **Chart.js or Apache ECharts** - Real-time graphs
- **asyncio** - Async patterns throughout
### Development Tools
- **grpcio & grpcio-tools** - Python gRPC
- **psutil** - System metrics collection
- **uvicorn** - FastAPI server
- **pytest** - Testing
- **docker-compose** - Local orchestration
## Architecture
```
┌─────────────────────────────────────────────────────────────┐
│ Browser │
│ ┌──────────────────────────────────────────────────────┐ │
│ │ Dashboard (htmx + Alpine.js + WebSockets) │ │
│ └──────────────────────────────────────────────────────┘ │
└────────────────────────┬────────────────────────────────────┘
│ WebSocket
┌─────────────────────────────────────────────────────────────┐
│ Web Gateway Service │
│ (FastAPI + WebSockets) │
│ - Serves dashboard │
│ - Streams updates to browser │
│ - REST API for historical queries │
└────────────────────────┬────────────────────────────────────┘
│ gRPC
┌─────────────────────────────────────────────────────────────┐
│ Aggregator Service (gRPC) │
│ - Receives metric streams from all collectors │
│ - Normalizes data from different sources │
│ - Enriches with machine context │
│ - Publishes to event stream │
│ - Checks alert thresholds │
└─────┬───────────────────────────────────┬───────────────────┘
│ │
│ Stores │ Publishes events
▼ ▼
┌──────────────┐ ┌────────────────┐
│ TimescaleDB │ │ Event Stream │
│ (historical)│ │ (Redis Pub/Sub│
└──────────────┘ │ or RabbitMQ) │
└────────┬───────┘
┌──────────────┐ │
│ Redis │ │ Subscribes
│ (current │◄───────────────────────────┘
│ state) │ │
└──────────────┘ ▼
┌────────────────┐
▲ │ Alert Service │
│ │ - Processes │
│ │ events │
│ gRPC Streaming │ - Triggers │
│ │ actions │
┌─────┴────────────────────────────┴────────────────┘
│ Multiple Collector Services (one per machine)
│ ┌───────────────────────────────────────┐
│ │ Metrics Collector (gRPC Client) │
│ │ - Gathers system metrics (psutil) │
│ │ - Streams to Aggregator via gRPC │
│ │ - CPU, Memory, Disk, Network │
│ │ - Process list │
│ │ - Docker container stats (optional) │
│ └───────────────────────────────────────┘
└──► Machine 1, Machine 2, Machine 3, ...
```
## Implementation Phases
### Phase 1: MVP - Core Streaming (Weekend - 8-12 hours)
**Goal:** Prove the gRPC streaming works end-to-end
**Deliverables:**
1. Metrics Collector Service (gRPC client)
- Collects CPU, memory, disk on localhost
- Streams to aggregator every 5 seconds
2. Aggregator Service (gRPC server)
- Receives metric stream
- Stores current state in Redis
- Logs to console
3. Proto definitions for metric messages
4. Docker Compose setup
**Success Criteria:**
- Run collector, see metrics flowing to aggregator
- Redis contains current state
- Can query Redis manually for latest metrics
### Phase 2: Web Dashboard (1 week)
**Goal:** Make it visible and useful
**Deliverables:**
1. Web Gateway Service (FastAPI)
- WebSocket endpoint for streaming
- REST endpoints for current/historical data
2. Dashboard UI
- Real-time CPU/Memory graphs per machine
- Current state table
- Simple, clean design
3. WebSocket bridge (Gateway ↔ Aggregator)
4. TimescaleDB integration
- Store historical metrics
- Query endpoints for time ranges
**Success Criteria:**
- Open dashboard, see live graphs updating
- Graphs show last hour of data
- Multiple machines displayed separately
### Phase 3: Alerts & Intelligence (1 week)
**Goal:** Add decision-making layer (interview focus)
**Deliverables:**
1. Alert Service
- Subscribes to event stream
- Evaluates threshold rules
- Triggers notifications
2. Configuration Service (gRPC)
- Dynamic threshold management
- Alert rule CRUD
- Stored in PostgreSQL
3. Event Stream implementation (Redis Pub/Sub or RabbitMQ)
4. Enhanced dashboard
- Alert indicators
- Alert history
- Threshold configuration UI
**Success Criteria:**
- Set CPU threshold at 80%
- Generate load (stress-ng)
- See alert trigger in dashboard
- Alert logged to database
### Phase 4: Interview Polish (Final week)
**Goal:** Demo-ready, production patterns visible
**Deliverables:**
1. Observability
- OpenTelemetry tracing (optional)
- Structured logging
- Health check endpoints
2. "Synthetic Transactions"
- Simulate business operations through system
- Track end-to-end latency
- Maps directly to payment processing demo
3. Documentation
- Architecture diagram
- Service interaction flows
- Deployment guide
4. Demo script
- Story to walk through
- Key talking points
- Domain mapping explanations
**Success Criteria:**
- Can deploy entire stack with one command
- Can explain every service's role
- Can map architecture to payment processing
- Demo runs smoothly without hiccups
## Key Technical Patterns to Demonstrate
### 1. gRPC Streaming Patterns
**Server-Side Streaming:**
```protobuf
// Collector streams metrics to aggregator
service MetricsService {
rpc StreamMetrics(MetricsRequest) returns (stream Metric) {}
}
```
**Bidirectional Streaming:**
```protobuf
// Two-way communication between services
service ControlService {
rpc ManageStream(stream Command) returns (stream Response) {}
}
```
### 2. Service Communication Patterns
- **Synchronous (gRPC):** Query current state, configuration
- **Asynchronous (Events):** Metric updates, alerts, audit logs
- **Streaming (gRPC + WebSocket):** Real-time data flow
### 3. Data Storage Patterns
- **Hot data (Redis):** Current state, recent metrics (last 5 minutes)
- **Warm data (TimescaleDB):** Historical metrics (last 30 days)
- **Cold data (Optional):** Archive to S3-compatible storage
### 4. Error Handling & Resilience
- gRPC retry logic with exponential backoff
- Circuit breaker pattern for service calls
- Graceful degradation (continue if one collector fails)
- Dead letter queue for failed events
## Proto Definitions (Starting Point)
```protobuf
syntax = "proto3";
package monitoring;
service MetricsService {
rpc StreamMetrics(MetricsRequest) returns (stream Metric) {}
rpc GetCurrentState(StateRequest) returns (MachineState) {}
}
message MetricsRequest {
string machine_id = 1;
int32 interval_seconds = 2;
}
message Metric {
string machine_id = 1;
int64 timestamp = 2;
MetricType type = 3;
double value = 4;
map<string, string> labels = 5;
}
enum MetricType {
CPU_PERCENT = 0;
MEMORY_PERCENT = 1;
MEMORY_USED_GB = 2;
DISK_PERCENT = 3;
NETWORK_SENT_MBPS = 4;
NETWORK_RECV_MBPS = 5;
}
message MachineState {
string machine_id = 1;
int64 last_seen = 2;
repeated Metric current_metrics = 3;
HealthStatus health = 4;
}
enum HealthStatus {
HEALTHY = 0;
WARNING = 1;
CRITICAL = 2;
UNKNOWN = 3;
}
```
## Project Structure
```
system-monitor/
├── docker-compose.yml
├── proto/
│ └── metrics.proto
├── services/
│ ├── collector/
│ │ ├── Dockerfile
│ │ ├── requirements.txt
│ │ ├── main.py
│ │ └── metrics.py
│ ├── aggregator/
│ │ ├── Dockerfile
│ │ ├── requirements.txt
│ │ ├── main.py
│ │ └── storage.py
│ ├── gateway/
│ │ ├── Dockerfile
│ │ ├── requirements.txt
│ │ ├── main.py
│ │ └── websocket.py
│ └── alerts/
│ ├── Dockerfile
│ ├── requirements.txt
│ ├── main.py
│ └── rules.py
├── web/
│ ├── static/
│ │ ├── css/
│ │ └── js/
│ └── templates/
│ └── dashboard.html
└── README.md
```
## Interview Talking Points
### Domain Mapping to Payments
**What you say:**
- "I built this to monitor my dev machines, but the architecture directly maps to payment processing"
- "Each machine streaming metrics is like a payment processor streaming transactions"
- "The aggregator normalizes data from different sources - same as aggregating from Stripe, PayPal, bank APIs"
- "Alert thresholds on resource usage are structurally identical to fraud detection thresholds"
- "The event stream for audit trails maps directly to payment audit logs"
### Technical Decisions to Highlight
**gRPC vs REST:**
- "I use gRPC between services for efficiency and strong typing"
- "FastAPI gateway exposes REST/WebSocket for browser clients"
- "This pattern is common - internal gRPC, external REST"
**Streaming vs Polling:**
- "Server-side streaming reduces network overhead"
- "Bidirectional streaming allows dynamic configuration updates"
- "WebSocket to browser maintains single connection"
**State Management:**
- "Redis for hot data - current state, needs fast access"
- "TimescaleDB for historical analysis - optimized for time-series"
- "This tiered storage approach scales to payment transaction volumes"
**Resilience:**
- "Each collector is independent - one failing doesn't affect others"
- "Circuit breaker prevents cascade failures"
- "Event stream decouples alert processing from metric ingestion"
### What NOT to Say
- Don't call it a "toy project" or "learning exercise"
- Don't apologize for running locally vs AWS
- Don't over-explain obvious things
- Don't claim it's production-ready when it's not
### What TO Say
- "I built this to solve a real problem I have"
- "Locally it uses PostgreSQL/Redis, in production these become Aurora/ElastiCache"
- "I focused on the architectural patterns since those transfer directly"
- "I'd keep developing this - it's genuinely useful"
## Development Guidelines
### Code Quality Standards
- Type hints throughout (Python 3.11+ syntax)
- Async/await patterns consistently
- Structured logging (JSON format)
- Error handling at all boundaries
- Unit tests for business logic
- Integration tests for service interactions
### Docker Best Practices
- Multi-stage builds
- Non-root users
- Health checks
- Resource limits
- Volume mounts for development
### Configuration Management
- Environment variables for all config
- Sensible defaults
- Config validation on startup
- No secrets in code
## AWS Mapping (For Interview Discussion)
**What you have → What it becomes:**
- PostgreSQL → Aurora PostgreSQL
- Redis → ElastiCache
- Docker Containers → ECS/Fargate or Lambda
- RabbitMQ/Redis Pub/Sub → SQS/SNS
- Docker Compose → CloudFormation/Terraform
- Local networking → VPC, Security Groups
**Key point:** "The architecture and patterns are production-ready, the infrastructure is local for development convenience"
## Common Pitfalls to Avoid
1. **Over-engineering Phase 1** - Resist adding features, just get streaming working
2. **Polished UI** - Don't waste time on design, htmx + basic CSS is fine
3. **Perfect metrics** - Mock data is OK early on, real psutil data comes later
4. **Complete coverage** - Better to have 3 services working perfectly than 10 half-done
5. **AWS deployment** - Local is fine, AWS costs money and adds complexity
## Success Metrics
**For Yourself:**
- [ ] Actually use the dashboard daily
- [ ] Catches a real issue before you notice
- [ ] Runs stable for 1+ week without intervention
**For Interview:**
- [ ] Can demo end-to-end in 5 minutes
- [ ] Can explain every service interaction
- [ ] Can map to payment domain fluently
- [ ] Shows understanding of production patterns
## Next Steps
1. Set up project structure
2. Define proto messages
3. Build Phase 1 MVP
4. Iterate based on what feels useful
5. Polish for demo when interview approaches
## Resources
- gRPC Python docs: https://grpc.io/docs/languages/python/
- FastAPI WebSockets: https://fastapi.tiangolo.com/advanced/websockets/
- TimescaleDB: https://docs.timescale.com/
- htmx: https://htmx.org/
## Questions to Ask Yourself During Development
- "Would I actually use this feature?"
- "How does this map to payments?"
- "Can I explain why I built it this way?"
- "What would break if X service failed?"
- "How would this scale to 1000 machines?"
---
## Final Note
This project works because it's:
1. **Real** - You'll use it
2. **Focused** - Shows specific patterns they care about
3. **Mappable** - Clear connection to their domain
4. **Yours** - Not a tutorial copy, demonstrates your thinking
Build it in phases, use it daily, and by interview time you'll have natural stories about trade-offs, failures, and learnings. That authenticity is more valuable than perfect code.
Good luck! 🚀

119
Tiltfile Normal file
View File

@@ -0,0 +1,119 @@
# -*- mode: Python -*-
# Tiltfile for sysmonstm - local Kubernetes development
# Load extensions
load('ext://restart_process', 'docker_build_with_restart')
load('ext://namespace', 'namespace_create')
# Command-line configuration: a single boolean flag, "no-volumes",
# defaulting to False when it is not passed.
# NOTE(review): no_volumes is not read anywhere else in the visible part
# of this Tiltfile — confirm whether it is still needed.
config.define_bool('no-volumes')
cfg = config.parse()
no_volumes = cfg.get('no-volumes', False)

# Ensure the target namespace exists, then register the manifests
# rendered from the local kustomize overlay.
namespace_create('sysmonstm')
k8s_yaml(kustomize('k8s/overlays/local'))
# ============================================================================
# Docker builds with live reload
# ============================================================================
# All service images use the repo root as build context and the
# 'development' Dockerfile stage. They differ only in which Dockerfile is
# used and which directories are live-synced into the container.
def _dev_image(service, extra_dirs):
    """Registers the live-updating development image for one service.

    Args:
      service: directory name under services/, also used as the suffix of
        the image name 'sysmonstm-<service>'.
      extra_dirs: repo-relative directories synced to /app/<dir>, in
        addition to the service's own source directory.
    """
    service_dir = 'services/' + service
    docker_build(
        'sysmonstm-' + service,
        context='.',
        dockerfile=service_dir + '/Dockerfile',
        target='development',
        live_update=[
            sync('./' + path, '/app/' + path)
            for path in [service_dir] + extra_dirs
        ],
    )

_dev_image('aggregator', ['shared', 'proto'])
_dev_image('gateway', ['shared', 'proto', 'web'])
_dev_image('alerts', ['shared'])
# ============================================================================
# Resource configuration
# ============================================================================
# Data stores: grouped under the 'infra' label.
k8s_resource('redis', labels=['infra'])
k8s_resource('timescaledb', labels=['infra'])

# Application workloads: grouped under 'app'. resource_deps lists the
# resources each one waits for; port_forwards exposes it on localhost.
k8s_resource(
    'aggregator',
    resource_deps=['redis', 'timescaledb'],
    port_forwards=['50051:50051'],
    labels=['app'],
)
k8s_resource(
    'gateway',
    resource_deps=['aggregator', 'redis'],
    port_forwards=['8000:8000'],
    labels=['app'],
)
k8s_resource(
    'alerts',
    resource_deps=['redis', 'timescaledb'],
    labels=['app'],
)
# ============================================================================
# Local resources (optional - for running collector locally)
# ============================================================================
# Runs the collector as a long-lived local process. It is not started
# automatically (trigger it from the Tilt UI), it waits for the
# aggregator, and it restarts when the collector or shared sources change.
local_resource(
    'collector-local',
    serve_cmd='cd services/collector && python main.py',
    deps=['services/collector', 'shared'],
    resource_deps=['aggregator'],
    labels=['collector'],
    auto_init=False,  # Don't start automatically
    # serve_env is the argument Tilt passes to serve_cmd. The plain `env`
    # argument only applies to `cmd`, which this resource does not set,
    # so with `env` these variables never reached the collector process.
    # NOTE(review): serve_cmd changes into services/collector first, so
    # PYTHONPATH='.' resolves to that directory, not the repo root.
    # Confirm that the collector can still import shared/.
    serve_env={
        'AGGREGATOR_URL': 'localhost:50051',
        'MACHINE_ID': 'tilt-dev',
        'COLLECTION_INTERVAL': '5',
        'LOG_LEVEL': 'DEBUG',
        'PYTHONPATH': '.',
    },
)
# ============================================================================
# Convenience buttons
# ============================================================================
# Manual "button": regenerates the Python gRPC modules from
# proto/metrics.proto into ./shared. It does not run on `tilt up`.
local_resource(
    'proto-gen',
    cmd=' '.join([
        'python -m grpc_tools.protoc',
        '-I./proto',
        '--python_out=./shared',
        '--grpc_python_out=./shared',
        './proto/metrics.proto',
    ]),
    deps=['proto/metrics.proto'],
    auto_init=False,
    labels=['tools'],
)

32
ctlptl.yaml Normal file
View File

@@ -0,0 +1,32 @@
# ctlptl configuration for Kind cluster
# Usage: ctlptl apply -f ctlptl.yaml
# Local image registry on port 5005; the Cluster document below refers to it
# by name through its "registry" field.
apiVersion: ctlptl.dev/v1alpha1
kind: Registry
name: sysmonstm-registry
port: 5005
---
# Kind cluster "sysmonstm" with a single control-plane node, attached to the
# sysmonstm-registry registry declared in the first document.
apiVersion: ctlptl.dev/v1alpha1
kind: Cluster
product: kind
registry: sysmonstm-registry
kindV1Alpha4Cluster:
name: sysmonstm
nodes:
- role: control-plane
# Publish two node ports on the host.
# NOTE(review): host port 50051 is also used by the Tiltfile's aggregator
# port_forwards entry ('50051:50051') — confirm the two do not collide.
extraPortMappings:
# Gateway HTTP
- containerPort: 30080
hostPort: 8080
protocol: TCP
# Aggregator gRPC
- containerPort: 30051
hostPort: 50051
protocol: TCP
# Resource limits for t2.small compatibility
# (sets the kubelet's system-reserved memory to 256Mi)
kubeadmConfigPatches:
- |
kind: InitConfiguration
nodeRegistration:
kubeletExtraArgs:
system-reserved: memory=256Mi

View File

@@ -0,0 +1,48 @@
# Development overrides - hot reload, mounted volumes, debug settings
# Usage: docker compose up (automatically includes this file)
version: "3.8"

# Every service is switched to the "development" build stage and gets its
# sources bind-mounted read-only. The repeated build target and the shared
# mounts are declared once as anchors and aliased afterwards.
services:
  aggregator:
    build: &dev_build
      target: development
    volumes:
      - ./services/aggregator:/app/services/aggregator:ro
      - &shared_src ./shared:/app/shared:ro
      - &proto_src ./proto:/app/proto:ro
    environment:
      LOG_LEVEL: DEBUG
      RELOAD: "true"

  gateway:
    build: *dev_build
    volumes:
      - ./services/gateway:/app/services/gateway:ro
      - *shared_src
      - *proto_src
      - ./web:/app/web:ro
    environment:
      LOG_LEVEL: DEBUG
      RELOAD: "true"

  alerts:
    build: *dev_build
    volumes:
      - ./services/alerts:/app/services/alerts:ro
      - *shared_src
    environment:
      LOG_LEVEL: DEBUG

  collector:
    build: *dev_build
    volumes:
      - ./services/collector:/app/services/collector:ro
      - *shared_src
      - *proto_src
    environment:
      LOG_LEVEL: DEBUG
      COLLECTION_INTERVAL: 2

154
docker-compose.yml Normal file
View File

@@ -0,0 +1,154 @@
version: "3.8"
# This file works both locally and on EC2 for demo purposes.
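# For local dev with hot-reload, a plain `docker compose up` is enough: docker-compose.override.yml is layered on top automatically.
# To run WITHOUT the development overrides (e.g. on EC2), name this file explicitly: docker compose -f docker-compose.yml up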
# Extension fields (ignored as services by Compose) that exist only to carry
# anchors: services below merge them in with "<<: *common-env" and
# "<<: *healthcheck-defaults".
x-common-env: &common-env
REDIS_URL: redis://redis:6379
TIMESCALE_URL: postgresql://monitor:monitor@timescaledb:5432/monitor
EVENTS_BACKEND: redis_pubsub
# Overridable from the shell or .env; falls back to INFO.
LOG_LEVEL: ${LOG_LEVEL:-INFO}
LOG_FORMAT: json
# Timing shared by every healthcheck; each service supplies its own "test".
x-healthcheck-defaults: &healthcheck-defaults
interval: 10s
timeout: 5s
retries: 3
start_period: 10s
services:
# =============================================================================
# Infrastructure
# =============================================================================
redis:
  # Redis with a named volume for /data and a redis-cli ping healthcheck.
  image: redis:7-alpine
  ports:
    # Published on loopback only. No password is configured for this
    # Redis and the file is also meant to run on EC2, so exposing 6379
    # on every interface would leave the instance open to the network.
    # Other containers still reach it as redis:6379 on the compose
    # network; host tools connect via localhost.
    - "127.0.0.1:${REDIS_PORT:-6379}:6379"
  volumes:
    - redis-data:/data
  healthcheck:
    <<: *healthcheck-defaults
    test: ["CMD", "redis-cli", "ping"]
  deploy:
    resources:
      limits:
        memory: 128M
timescaledb:
  # TimescaleDB on PostgreSQL 15. scripts/init-db.sql is mounted into the
  # image's init directory and pg_isready serves as the healthcheck.
  image: timescale/timescaledb:latest-pg15
  environment:
    POSTGRES_USER: monitor
    POSTGRES_PASSWORD: monitor
    POSTGRES_DB: monitor
  ports:
    # Published on loopback only. The credentials above are fixed
    # defaults and the file is also meant to run on EC2, so exposing
    # 5432 on every interface would invite unauthorised access. Other
    # containers still reach it as timescaledb:5432 on the compose
    # network; host tools connect via localhost.
    - "127.0.0.1:${TIMESCALE_PORT:-5432}:5432"
  volumes:
    - timescale-data:/var/lib/postgresql/data
    - ./scripts/init-db.sql:/docker-entrypoint-initdb.d/init.sql:ro
  healthcheck:
    <<: *healthcheck-defaults
    test: ["CMD-SHELL", "pg_isready -U monitor -d monitor"]
  deploy:
    resources:
      limits:
        memory: 512M
# =============================================================================
# Application Services
# =============================================================================
aggregator:
  # Built from services/aggregator/Dockerfile with the repo root as
  # context. Publishes port 50051 and starts only once redis and
  # timescaledb report healthy.
  build:
    context: .
    dockerfile: services/aggregator/Dockerfile
  environment:
    <<: *common-env
    SERVICE_NAME: aggregator
    GRPC_PORT: 50051
  ports:
    - "${AGGREGATOR_GRPC_PORT:-50051}:50051"
  depends_on:
    redis: {condition: service_healthy}
    timescaledb: {condition: service_healthy}
  healthcheck:
    <<: *healthcheck-defaults
    test:
      - CMD
      - /bin/grpc_health_probe
      - "-addr=:50051"
  deploy:
    resources:
      limits:
        memory: 256M
gateway:
  # Built from services/gateway/Dockerfile with the repo root as context.
  # Publishes port 8000 and is told where to find the aggregator via
  # AGGREGATOR_URL.
  build:
    context: .
    dockerfile: services/gateway/Dockerfile
  environment:
    <<: *common-env
    SERVICE_NAME: gateway
    HTTP_PORT: 8000
    AGGREGATOR_URL: aggregator:50051
  ports:
    - "${GATEWAY_PORT:-8000}:8000"
  # Long form of the short list syntax: wait for the containers to be
  # started, not for them to be healthy.
  depends_on:
    aggregator: {condition: service_started}
    redis: {condition: service_started}
  healthcheck:
    <<: *healthcheck-defaults
    test:
      - CMD
      - curl
      - "-f"
      - http://localhost:8000/health
  deploy:
    resources:
      limits:
        memory: 256M
alerts:
  # Built from services/alerts/Dockerfile with the repo root as context.
  # Publishes no ports and starts only once redis and timescaledb report
  # healthy.
  build:
    context: .
    dockerfile: services/alerts/Dockerfile
  environment:
    <<: *common-env
    SERVICE_NAME: alerts
  depends_on:
    redis: {condition: service_healthy}
    timescaledb: {condition: service_healthy}
  healthcheck:
    <<: *healthcheck-defaults
    # NOTE(review): this command exits 0 unconditionally, so it only
    # proves that a Python interpreter can start inside the container,
    # not that the alerts process is working.
    test:
      - CMD
      - python
      - "-c"
      - "import sys; sys.exit(0)"
  deploy:
    resources:
      limits:
        memory: 128M
# One collector is meant to run on every monitored machine. This single
# instance exists for local testing.
collector:
  build:
    context: .
    dockerfile: services/collector/Dockerfile
  environment:
    <<: *common-env
    SERVICE_NAME: collector
    AGGREGATOR_URL: aggregator:50051
    # Both values can be overridden from the shell or .env.
    MACHINE_ID: ${MACHINE_ID:-local-dev}
    COLLECTION_INTERVAL: ${COLLECTION_INTERVAL:-5}
  # Long form of the short list syntax: wait for the aggregator container
  # to be started, not for it to be healthy.
  depends_on:
    aggregator: {condition: service_started}
  deploy:
    resources:
      limits:
        memory: 64M
  # For actual system metrics, you might need:
  # privileged: true
  # pid: host
# Named volumes backing the redis and timescaledb services above.
volumes:
redis-data:
timescale-data:
# Gives the default network the fixed name "sysmonstm".
networks:
default:
name: sysmonstm

View File

@@ -0,0 +1,78 @@
// Architecture overview: four clusters (external actors, application
// services, data stores, event stream) and the edges between them.
// Statement order is left untouched because Graphviz layout depends on it.
digraph SystemOverview {
// Graph settings
rankdir=TB;
compound=true;
fontname="Helvetica";
node [fontname="Helvetica", fontsize=11];
edge [fontname="Helvetica", fontsize=10];
// Title
labelloc="t";
label="System Monitoring Platform - Architecture Overview";
fontsize=16;
// Styling
node [shape=box, style="rounded,filled"];
// External
subgraph cluster_external {
label="External";
style=dashed;
color=gray;
browser [label="Browser\n(Dashboard)", fillcolor="#E3F2FD"];
machines [label="Monitored\nMachines", fillcolor="#FFF3E0", shape=box3d];
}
// Core Services
subgraph cluster_services {
label="Application Services";
style=filled;
color="#E8F5E9";
fillcolor="#E8F5E9";
gateway [label="Gateway\n(FastAPI)", fillcolor="#C8E6C9"];
aggregator [label="Aggregator\n(gRPC Server)", fillcolor="#C8E6C9"];
alerts [label="Alerts\nService", fillcolor="#C8E6C9"];
collector [label="Collector\n(gRPC Client)", fillcolor="#DCEDC8"];
}
// Data Layer
subgraph cluster_data {
label="Data Layer";
style=filled;
color="#FFF8E1";
fillcolor="#FFF8E1";
redis [label="Redis\n(Pub/Sub + State)", fillcolor="#FFECB3", shape=cylinder];
timescale [label="TimescaleDB\n(Time-series)", fillcolor="#FFECB3", shape=cylinder];
}
// Event Stream
// (drawn as its own node, separate from the "redis" state node above)
subgraph cluster_events {
label="Event Stream";
style=filled;
color="#F3E5F5";
fillcolor="#F3E5F5";
events [label="Redis Pub/Sub\n(Events)", fillcolor="#E1BEE7", shape=hexagon];
}
// Connections
// Browser-facing traffic and the gateway's reads
browser -> gateway [label="WebSocket\nREST", color="#1976D2"];
gateway -> aggregator [label="gRPC", color="#388E3C"];
gateway -> redis [label="State\nQuery", style=dashed];
gateway -> timescale [label="Historical\nQuery", style=dashed];
// Metric ingestion path
machines -> collector [label="psutil", color="#F57C00", style=dotted];
collector -> aggregator [label="gRPC\nStream", color="#388E3C"];
// Aggregator writes
aggregator -> redis [label="Current\nState", color="#FFA000"];
aggregator -> timescale [label="Store\nMetrics", color="#FFA000"];
aggregator -> events [label="Publish", color="#7B1FA2"];
// Event consumers
events -> alerts [label="Subscribe", color="#7B1FA2"];
events -> gateway [label="Subscribe", color="#7B1FA2"];
alerts -> timescale [label="Store\nAlerts", style=dashed];
}

View File

@@ -0,0 +1,193 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 14.1.1 (0)
-->
<!-- Title: SystemOverview Pages: 1 -->
<svg width="444pt" height="508pt"
viewBox="0.00 0.00 444.00 508.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 503.78)">
<title>SystemOverview</title>
<polygon fill="white" stroke="none" points="-4,4 -4,-503.78 440,-503.78 440,4 -4,4"/>
<text xml:space="preserve" text-anchor="middle" x="218" y="-480.58" font-family="Helvetica,sans-Serif" font-size="16.00">System Monitoring Platform &#45; Architecture Overview</text>
<g id="clust1" class="cluster">
<title>cluster_external</title>
<polygon fill="none" stroke="gray" stroke-dasharray="5,2" points="45.5,-374.2 45.5,-453.7 235.5,-453.7 235.5,-374.2 45.5,-374.2"/>
<text xml:space="preserve" text-anchor="middle" x="140.5" y="-434.5" font-family="Helvetica,sans-Serif" font-size="16.00">External</text>
</g>
<g id="clust2" class="cluster">
<title>cluster_services</title>
<polygon fill="#e8f5e9" stroke="#e8f5e9" points="101.5,-143.12 101.5,-320.12 363.5,-320.12 363.5,-143.12 101.5,-143.12"/>
<text xml:space="preserve" text-anchor="middle" x="232.5" y="-300.93" font-family="Helvetica,sans-Serif" font-size="16.00">Application Services</text>
</g>
<g id="clust3" class="cluster">
<title>cluster_data</title>
<polygon fill="#fff8e1" stroke="#fff8e1" points="22.5,-8 22.5,-99.62 260.5,-99.62 260.5,-8 22.5,-8"/>
<text xml:space="preserve" text-anchor="middle" x="141.5" y="-80.42" font-family="Helvetica,sans-Serif" font-size="16.00">Data Layer</text>
</g>
<g id="clust4" class="cluster">
<title>cluster_events</title>
<polygon fill="#f3e5f5" stroke="#f3e5f5" points="243.5,-363.62 243.5,-464.28 413.5,-464.28 413.5,-363.62 243.5,-363.62"/>
<text xml:space="preserve" text-anchor="middle" x="328.5" y="-445.08" font-family="Helvetica,sans-Serif" font-size="16.00">Event Stream</text>
</g>
<!-- browser -->
<g id="node1" class="node">
<title>browser</title>
<path fill="#e3f2fd" stroke="black" d="M125.62,-418.2C125.62,-418.2 65.38,-418.2 65.38,-418.2 59.38,-418.2 53.38,-412.2 53.38,-406.2 53.38,-406.2 53.38,-394.2 53.38,-394.2 53.38,-388.2 59.38,-382.2 65.38,-382.2 65.38,-382.2 125.62,-382.2 125.62,-382.2 131.62,-382.2 137.62,-388.2 137.62,-394.2 137.62,-394.2 137.62,-406.2 137.62,-406.2 137.62,-412.2 131.62,-418.2 125.62,-418.2"/>
<text xml:space="preserve" text-anchor="middle" x="95.5" y="-403.25" font-family="Helvetica,sans-Serif" font-size="11.00">Browser</text>
<text xml:space="preserve" text-anchor="middle" x="95.5" y="-389.75" font-family="Helvetica,sans-Serif" font-size="11.00">(Dashboard)</text>
</g>
<!-- gateway -->
<g id="node3" class="node">
<title>gateway</title>
<path fill="#c8e6c9" stroke="black" d="M161.88,-284.62C161.88,-284.62 121.12,-284.62 121.12,-284.62 115.12,-284.62 109.12,-278.62 109.12,-272.62 109.12,-272.62 109.12,-260.62 109.12,-260.62 109.12,-254.62 115.12,-248.62 121.12,-248.62 121.12,-248.62 161.88,-248.62 161.88,-248.62 167.88,-248.62 173.88,-254.62 173.88,-260.62 173.88,-260.62 173.88,-272.62 173.88,-272.62 173.88,-278.62 167.88,-284.62 161.88,-284.62"/>
<text xml:space="preserve" text-anchor="middle" x="141.5" y="-269.68" font-family="Helvetica,sans-Serif" font-size="11.00">Gateway</text>
<text xml:space="preserve" text-anchor="middle" x="141.5" y="-256.18" font-family="Helvetica,sans-Serif" font-size="11.00">(FastAPI)</text>
</g>
<!-- browser&#45;&gt;gateway -->
<g id="edge1" class="edge">
<title>browser&#45;&gt;gateway</title>
<path fill="none" stroke="#1976d2" d="M92.73,-381.75C91.08,-367.05 90.32,-345.66 96.25,-328.12 100.5,-315.57 108.45,-303.5 116.51,-293.49"/>
<polygon fill="#1976d2" stroke="#1976d2" points="119.02,-295.94 122.86,-286.06 113.7,-291.39 119.02,-295.94"/>
<text xml:space="preserve" text-anchor="middle" x="122.88" y="-344.12" font-family="Helvetica,sans-Serif" font-size="10.00">WebSocket</text>
<text xml:space="preserve" text-anchor="middle" x="122.88" y="-331.38" font-family="Helvetica,sans-Serif" font-size="10.00">REST</text>
</g>
<!-- machines -->
<g id="node2" class="node">
<title>machines</title>
<polygon fill="#fff3e0" stroke="black" points="227.25,-418.2 159.75,-418.2 155.75,-414.2 155.75,-382.2 223.25,-382.2 227.25,-386.2 227.25,-418.2"/>
<polyline fill="none" stroke="black" points="223.25,-414.2 155.75,-414.2"/>
<polyline fill="none" stroke="black" points="223.25,-414.2 223.25,-382.2"/>
<polyline fill="none" stroke="black" points="223.25,-414.2 227.25,-418.2"/>
<text xml:space="preserve" text-anchor="middle" x="191.5" y="-403.25" font-family="Helvetica,sans-Serif" font-size="11.00">Monitored</text>
<text xml:space="preserve" text-anchor="middle" x="191.5" y="-389.75" font-family="Helvetica,sans-Serif" font-size="11.00">Machines</text>
</g>
<!-- collector -->
<g id="node6" class="node">
<title>collector</title>
<path fill="#dcedc8" stroke="black" d="M343.88,-284.62C343.88,-284.62 279.12,-284.62 279.12,-284.62 273.12,-284.62 267.12,-278.62 267.12,-272.62 267.12,-272.62 267.12,-260.62 267.12,-260.62 267.12,-254.62 273.12,-248.62 279.12,-248.62 279.12,-248.62 343.88,-248.62 343.88,-248.62 349.88,-248.62 355.88,-254.62 355.88,-260.62 355.88,-260.62 355.88,-272.62 355.88,-272.62 355.88,-278.62 349.88,-284.62 343.88,-284.62"/>
<text xml:space="preserve" text-anchor="middle" x="311.5" y="-269.68" font-family="Helvetica,sans-Serif" font-size="11.00">Collector</text>
<text xml:space="preserve" text-anchor="middle" x="311.5" y="-256.18" font-family="Helvetica,sans-Serif" font-size="11.00">(gRPC Client)</text>
</g>
<!-- machines&#45;&gt;collector -->
<g id="edge5" class="edge">
<title>machines&#45;&gt;collector</title>
<path fill="none" stroke="#f57c00" stroke-dasharray="1,5" d="M210.81,-381.83C219.12,-375.21 229.26,-368.17 239.5,-363.62 260.21,-354.43 273.06,-369.22 289.5,-353.62 304.98,-338.94 310.15,-314.98 311.64,-296.08"/>
<polygon fill="#f57c00" stroke="#f57c00" points="315.12,-296.47 312.08,-286.32 308.13,-296.15 315.12,-296.47"/>
<text xml:space="preserve" text-anchor="middle" x="318.1" y="-337.75" font-family="Helvetica,sans-Serif" font-size="10.00">psutil</text>
</g>
<!-- aggregator -->
<g id="node4" class="node">
<title>aggregator</title>
<path fill="#c8e6c9" stroke="black" d="M343.12,-187.12C343.12,-187.12 273.88,-187.12 273.88,-187.12 267.88,-187.12 261.88,-181.12 261.88,-175.12 261.88,-175.12 261.88,-163.12 261.88,-163.12 261.88,-157.12 267.88,-151.12 273.88,-151.12 273.88,-151.12 343.12,-151.12 343.12,-151.12 349.12,-151.12 355.12,-157.12 355.12,-163.12 355.12,-163.12 355.12,-175.12 355.12,-175.12 355.12,-181.12 349.12,-187.12 343.12,-187.12"/>
<text xml:space="preserve" text-anchor="middle" x="308.5" y="-172.18" font-family="Helvetica,sans-Serif" font-size="11.00">Aggregator</text>
<text xml:space="preserve" text-anchor="middle" x="308.5" y="-158.68" font-family="Helvetica,sans-Serif" font-size="11.00">(gRPC Server)</text>
</g>
<!-- gateway&#45;&gt;aggregator -->
<g id="edge2" class="edge">
<title>gateway&#45;&gt;aggregator</title>
<path fill="none" stroke="#388e3c" d="M171.74,-248.33C198.77,-232.88 238.56,-210.12 268.26,-193.13"/>
<polygon fill="#388e3c" stroke="#388e3c" points="269.66,-196.37 276.6,-188.36 266.19,-190.29 269.66,-196.37"/>
<text xml:space="preserve" text-anchor="middle" x="257.62" y="-214.75" font-family="Helvetica,sans-Serif" font-size="10.00">gRPC</text>
</g>
<!-- redis -->
<g id="node7" class="node">
<title>redis</title>
<path fill="#ffecb3" stroke="black" d="M146,-59.75C146,-62.16 120.23,-64.12 88.5,-64.12 56.77,-64.12 31,-62.16 31,-59.75 31,-59.75 31,-20.38 31,-20.38 31,-17.96 56.77,-16 88.5,-16 120.23,-16 146,-17.96 146,-20.38 146,-20.38 146,-59.75 146,-59.75"/>
<path fill="none" stroke="black" d="M146,-59.75C146,-57.34 120.23,-55.38 88.5,-55.38 56.77,-55.38 31,-57.34 31,-59.75"/>
<text xml:space="preserve" text-anchor="middle" x="88.5" y="-43.11" font-family="Helvetica,sans-Serif" font-size="11.00">Redis</text>
<text xml:space="preserve" text-anchor="middle" x="88.5" y="-29.61" font-family="Helvetica,sans-Serif" font-size="11.00">(Pub/Sub + State)</text>
</g>
<!-- gateway&#45;&gt;redis -->
<g id="edge3" class="edge">
<title>gateway&#45;&gt;redis</title>
<path fill="none" stroke="black" stroke-dasharray="5,2" d="M122.74,-248.35C108.28,-233.68 89.42,-211.2 81.25,-187.12 68.86,-150.62 73.72,-106.03 79.72,-75.79"/>
<polygon fill="black" stroke="black" points="83.14,-76.56 81.82,-66.04 76.29,-75.08 83.14,-76.56"/>
<text xml:space="preserve" text-anchor="middle" x="95.88" y="-172.38" font-family="Helvetica,sans-Serif" font-size="10.00">State</text>
<text xml:space="preserve" text-anchor="middle" x="95.88" y="-159.62" font-family="Helvetica,sans-Serif" font-size="10.00">Query</text>
</g>
<!-- timescale -->
<g id="node8" class="node">
<title>timescale</title>
<path fill="#ffecb3" stroke="black" d="M252.88,-59.75C252.88,-62.16 232.99,-64.12 208.5,-64.12 184.01,-64.12 164.12,-62.16 164.12,-59.75 164.12,-59.75 164.12,-20.38 164.12,-20.38 164.12,-17.96 184.01,-16 208.5,-16 232.99,-16 252.88,-17.96 252.88,-20.38 252.88,-20.38 252.88,-59.75 252.88,-59.75"/>
<path fill="none" stroke="black" d="M252.88,-59.75C252.88,-57.34 232.99,-55.38 208.5,-55.38 184.01,-55.38 164.12,-57.34 164.12,-59.75"/>
<text xml:space="preserve" text-anchor="middle" x="208.5" y="-43.11" font-family="Helvetica,sans-Serif" font-size="11.00">TimescaleDB</text>
<text xml:space="preserve" text-anchor="middle" x="208.5" y="-29.61" font-family="Helvetica,sans-Serif" font-size="11.00">(Time&#45;series)</text>
</g>
<!-- gateway&#45;&gt;timescale -->
<g id="edge4" class="edge">
<title>gateway&#45;&gt;timescale</title>
<path fill="none" stroke="black" stroke-dasharray="5,2" d="M143.41,-248.29C146.34,-224.28 152.82,-179.73 164,-143.12 171.19,-119.57 182.25,-94.18 191.54,-74.62"/>
<polygon fill="black" stroke="black" points="194.62,-76.29 195.83,-65.76 188.32,-73.24 194.62,-76.29"/>
<text xml:space="preserve" text-anchor="middle" x="187.25" y="-172.38" font-family="Helvetica,sans-Serif" font-size="10.00">Historical</text>
<text xml:space="preserve" text-anchor="middle" x="187.25" y="-159.62" font-family="Helvetica,sans-Serif" font-size="10.00">Query</text>
</g>
<!-- aggregator&#45;&gt;redis -->
<g id="edge7" class="edge">
<title>aggregator&#45;&gt;redis</title>
<path fill="none" stroke="#ffa000" d="M267.27,-150.69C261,-148.11 254.59,-145.52 248.5,-143.12 236.59,-138.44 233.22,-138.25 221.5,-133.12 191.36,-119.95 182.76,-118.04 155.5,-99.62 143.6,-91.59 131.5,-81.66 120.93,-72.28"/>
<polygon fill="#ffa000" stroke="#ffa000" points="123.32,-69.73 113.56,-65.6 118.62,-74.91 123.32,-69.73"/>
<text xml:space="preserve" text-anchor="middle" x="239.5" y="-123.62" font-family="Helvetica,sans-Serif" font-size="10.00">Current</text>
<text xml:space="preserve" text-anchor="middle" x="239.5" y="-110.88" font-family="Helvetica,sans-Serif" font-size="10.00">State</text>
</g>
<!-- aggregator&#45;&gt;timescale -->
<g id="edge8" class="edge">
<title>aggregator&#45;&gt;timescale</title>
<path fill="none" stroke="#ffa000" d="M294.81,-150.72C279.15,-130.84 253.2,-97.86 233.84,-73.25"/>
<polygon fill="#ffa000" stroke="#ffa000" points="236.64,-71.16 227.71,-65.47 231.14,-75.49 236.64,-71.16"/>
<text xml:space="preserve" text-anchor="middle" x="296.95" y="-123.62" font-family="Helvetica,sans-Serif" font-size="10.00">Store</text>
<text xml:space="preserve" text-anchor="middle" x="296.95" y="-110.88" font-family="Helvetica,sans-Serif" font-size="10.00">Metrics</text>
</g>
<!-- events -->
<g id="node9" class="node">
<title>events</title>
<path fill="#e1bee7" stroke="black" d="M395.63,-407.37C395.63,-407.37 376.5,-421.61 376.5,-421.61 371.69,-425.2 360.88,-428.78 354.88,-428.78 354.88,-428.78 302.12,-428.78 302.12,-428.78 296.12,-428.78 285.31,-425.2 280.5,-421.61 280.5,-421.61 261.37,-407.37 261.37,-407.37 256.56,-403.79 256.56,-396.62 261.37,-393.04 261.37,-393.04 280.5,-378.79 280.5,-378.79 285.31,-375.21 296.12,-371.62 302.12,-371.62 302.12,-371.62 354.88,-371.62 354.88,-371.62 360.88,-371.62 371.69,-375.21 376.5,-378.79 376.5,-378.79 395.63,-393.04 395.63,-393.04 400.44,-396.62 400.44,-403.79 395.63,-407.37"/>
<text xml:space="preserve" text-anchor="middle" x="328.5" y="-403.25" font-family="Helvetica,sans-Serif" font-size="11.00">Redis Pub/Sub</text>
<text xml:space="preserve" text-anchor="middle" x="328.5" y="-389.75" font-family="Helvetica,sans-Serif" font-size="11.00">(Events)</text>
</g>
<!-- aggregator&#45;&gt;events -->
<g id="edge9" class="edge">
<title>aggregator&#45;&gt;events</title>
<path fill="none" stroke="#7b1fa2" d="M333.16,-187.49C339.14,-192.63 345.07,-198.63 349.5,-205.12 361.02,-222.03 361.12,-228.46 364.5,-248.62 369.75,-279.97 371.24,-289.07 364.5,-320.12 361.48,-334.06 355.78,-348.49 349.79,-361.14"/>
<polygon fill="#7b1fa2" stroke="#7b1fa2" points="346.73,-359.44 345.42,-369.95 353,-362.55 346.73,-359.44"/>
<text xml:space="preserve" text-anchor="middle" x="386.64" y="-263.5" font-family="Helvetica,sans-Serif" font-size="10.00">Publish</text>
</g>
<!-- alerts -->
<g id="node5" class="node">
<title>alerts</title>
<path fill="#c8e6c9" stroke="black" d="M236.75,-284.62C236.75,-284.62 204.25,-284.62 204.25,-284.62 198.25,-284.62 192.25,-278.62 192.25,-272.62 192.25,-272.62 192.25,-260.62 192.25,-260.62 192.25,-254.62 198.25,-248.62 204.25,-248.62 204.25,-248.62 236.75,-248.62 236.75,-248.62 242.75,-248.62 248.75,-254.62 248.75,-260.62 248.75,-260.62 248.75,-272.62 248.75,-272.62 248.75,-278.62 242.75,-284.62 236.75,-284.62"/>
<text xml:space="preserve" text-anchor="middle" x="220.5" y="-269.68" font-family="Helvetica,sans-Serif" font-size="11.00">Alerts</text>
<text xml:space="preserve" text-anchor="middle" x="220.5" y="-256.18" font-family="Helvetica,sans-Serif" font-size="11.00">Service</text>
</g>
<!-- alerts&#45;&gt;timescale -->
<g id="edge12" class="edge">
<title>alerts&#45;&gt;timescale</title>
<path fill="none" stroke="black" stroke-dasharray="5,2" d="M219.58,-248.38C217.61,-211.47 212.94,-124.24 210.34,-75.51"/>
<polygon fill="black" stroke="black" points="213.85,-75.6 209.82,-65.8 206.86,-75.97 213.85,-75.6"/>
<text xml:space="preserve" text-anchor="middle" x="230.53" y="-172.38" font-family="Helvetica,sans-Serif" font-size="10.00">Store</text>
<text xml:space="preserve" text-anchor="middle" x="230.53" y="-159.62" font-family="Helvetica,sans-Serif" font-size="10.00">Alerts</text>
</g>
<!-- collector&#45;&gt;aggregator -->
<g id="edge6" class="edge">
<title>collector&#45;&gt;aggregator</title>
<path fill="none" stroke="#388e3c" d="M310.96,-248.55C310.53,-234.65 309.9,-214.73 309.39,-198.45"/>
<polygon fill="#388e3c" stroke="#388e3c" points="312.9,-198.77 309.09,-188.89 305.91,-198.99 312.9,-198.77"/>
<text xml:space="preserve" text-anchor="middle" x="327.98" y="-221.12" font-family="Helvetica,sans-Serif" font-size="10.00">gRPC</text>
<text xml:space="preserve" text-anchor="middle" x="327.98" y="-208.38" font-family="Helvetica,sans-Serif" font-size="10.00">Stream</text>
</g>
<!-- events&#45;&gt;gateway -->
<g id="edge11" class="edge">
<title>events&#45;&gt;gateway</title>
<path fill="none" stroke="#7b1fa2" d="M281.13,-378.02C267.86,-372.71 253.29,-367.44 239.5,-363.62 212.49,-356.16 199.25,-370.98 177.25,-353.62 159.49,-339.61 150.46,-315.21 145.93,-295.98"/>
<polygon fill="#7b1fa2" stroke="#7b1fa2" points="149.38,-295.39 143.95,-286.29 142.52,-296.79 149.38,-295.39"/>
<text xml:space="preserve" text-anchor="middle" x="200.88" y="-337.75" font-family="Helvetica,sans-Serif" font-size="10.00">Subscribe</text>
</g>
<!-- events&#45;&gt;alerts -->
<g id="edge10" class="edge">
<title>events&#45;&gt;alerts</title>
<path fill="none" stroke="#7b1fa2" d="M277.27,-380.98C264.23,-374.18 251.36,-365.21 242.25,-353.62 229.43,-337.32 224.08,-314.36 221.89,-296.26"/>
<polygon fill="#7b1fa2" stroke="#7b1fa2" points="225.38,-296.07 220.98,-286.43 218.41,-296.71 225.38,-296.07"/>
<text xml:space="preserve" text-anchor="middle" x="265.88" y="-337.75" font-family="Helvetica,sans-Serif" font-size="10.00">Subscribe</text>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 16 KiB

View File

@@ -0,0 +1,83 @@
// Metrics data flow: collection, ingestion, the real-time ("hot") path, the
// historical ("warm") path, and the consumers. Statement order influences
// Graphviz layout, so re-render and review the generated image after
// reordering anything in this file.
digraph DataFlow {
// rankdir=LR lays the pipeline out left to right.
rankdir=LR;
// compound=true enables cluster-clipped edges (lhead/ltail); no edge below
// currently uses those attributes.
compound=true;
fontname="Helvetica";
node [fontname="Helvetica", fontsize=10];
edge [fontname="Helvetica", fontsize=9];
labelloc="t";
label="Metrics Data Flow Pipeline";
fontsize=14;
// Default node look; individual nodes below override shape where needed.
node [shape=box, style="rounded,filled"];
// Collection
subgraph cluster_collect {
label="Collection (5s)";
style=filled;
fillcolor="#E3F2FD";
psutil [label="psutil\n(CPU, Mem, Disk)", shape=component, fillcolor="#BBDEFB"];
collector [label="Collector\nService", fillcolor="#90CAF9"];
}
// Ingestion
subgraph cluster_ingest {
label="Ingestion";
style=filled;
fillcolor="#E8F5E9";
aggregator [label="Aggregator\n(gRPC)", fillcolor="#A5D6A7"];
validate [label="Validate &\nNormalize", shape=diamond, fillcolor="#C8E6C9"];
}
// Storage Hot
subgraph cluster_hot {
label="Hot Path (Real-time)";
style=filled;
fillcolor="#FFF3E0";
redis_state [label="Redis\nCurrent State", shape=cylinder, fillcolor="#FFCC80"];
redis_pubsub [label="Redis\nPub/Sub", shape=hexagon, fillcolor="#FFB74D"];
}
// Storage Warm
// The pairs in the labels below presumably read as (resolution, retention) —
// TODO confirm against the database schema/retention policies.
subgraph cluster_warm {
label="Warm Path (Historical)";
style=filled;
fillcolor="#FCE4EC";
raw [label="metrics_raw\n(5s, 24h)", shape=cylinder, fillcolor="#F8BBD9"];
agg_1m [label="metrics_1m\n(1m, 7d)", shape=cylinder, fillcolor="#F48FB1"];
agg_1h [label="metrics_1h\n(1h, 90d)", shape=cylinder, fillcolor="#EC407A"];
}
// Consumers
subgraph cluster_consume {
label="Consumers";
style=filled;
fillcolor="#E8EAF6";
alerts [label="Alert\nService", fillcolor="#C5CAE9"];
gateway [label="Gateway\n(WebSocket)", fillcolor="#9FA8DA"];
// Drawn with a dashed outline, unlike the other consumers — presumably an
// optional or planned component; confirm.
lambda [label="Lambda\nAggregator", fillcolor="#7986CB", style="rounded,filled,dashed"];
}
// Flow
// Solid edges: the main pipeline. Dashed edges: roll-ups inside the warm
// path. Dotted edges: the Lambda path.
psutil -> collector [label="Metrics"];
collector -> aggregator [label="gRPC\nStream"];
aggregator -> validate;
validate -> redis_state [label="Upsert"];
validate -> redis_pubsub [label="Publish"];
validate -> raw [label="Insert"];
redis_pubsub -> alerts [label="metrics.*"];
redis_pubsub -> gateway [label="metrics.*"];
// NOTE(review): metrics_1m has two writers in this diagram — the continuous
// aggregate from metrics_raw and the Lambda batch write. Confirm whether these
// are alternatives or are meant to run side by side.
raw -> agg_1m [label="Continuous\nAggregate", style=dashed];
agg_1m -> agg_1h [label="Hourly\nJob", style=dashed];
raw -> lambda [label="SQS\nTrigger", style=dotted];
lambda -> agg_1m [label="Batch\nWrite", style=dotted];
}

View File

@@ -0,0 +1,217 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 14.1.1 (0)
-->
<!-- Title: DataFlow Pages: 1 -->
<svg width="1087pt" height="329pt"
viewBox="0.00 0.00 1087.00 329.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 325.25)">
<title>DataFlow</title>
<polygon fill="white" stroke="none" points="-4,4 -4,-325.25 1082.5,-325.25 1082.5,4 -4,4"/>
<text xml:space="preserve" text-anchor="middle" x="539.25" y="-303.95" font-family="Helvetica,sans-Serif" font-size="14.00">Metrics Data Flow Pipeline</text>
<g id="clust1" class="cluster">
<title>cluster_collect</title>
<polygon fill="#e3f2fd" stroke="black" points="8,-111 8,-188 254,-188 254,-111 8,-111"/>
<text xml:space="preserve" text-anchor="middle" x="131" y="-170.7" font-family="Helvetica,sans-Serif" font-size="14.00">Collection (5s)</text>
</g>
<g id="clust2" class="cluster">
<title>cluster_ingest</title>
<polygon fill="#e8f5e9" stroke="black" points="307,-95 307,-204 562.5,-204 562.5,-95 307,-95"/>
<text xml:space="preserve" text-anchor="middle" x="434.75" y="-186.7" font-family="Helvetica,sans-Serif" font-size="14.00">Ingestion</text>
</g>
<g id="clust3" class="cluster">
<title>cluster_hot</title>
<polygon fill="#fff3e0" stroke="black" points="614.75,-34 614.75,-193 769.5,-193 769.5,-34 614.75,-34"/>
<text xml:space="preserve" text-anchor="middle" x="692.12" y="-175.7" font-family="Helvetica,sans-Serif" font-size="14.00">Hot Path (Real&#45;time)</text>
</g>
<g id="clust4" class="cluster">
<title>cluster_warm</title>
<polygon fill="#fce4ec" stroke="black" points="645.62,-201 645.62,-288 1070.5,-288 1070.5,-201 645.62,-201"/>
<text xml:space="preserve" text-anchor="middle" x="858.06" y="-270.7" font-family="Helvetica,sans-Serif" font-size="14.00">Warm Path (Historical)</text>
</g>
<g id="clust5" class="cluster">
<title>cluster_consume</title>
<polygon fill="#e8eaf6" stroke="black" points="840.5,-8 840.5,-193 935.25,-193 935.25,-8 840.5,-8"/>
<text xml:space="preserve" text-anchor="middle" x="887.88" y="-175.7" font-family="Helvetica,sans-Serif" font-size="14.00">Consumers</text>
</g>
<!-- psutil -->
<g id="node1" class="node">
<title>psutil</title>
<polygon fill="#bbdefb" stroke="black" points="118.25,-155 16,-155 16,-151 12,-151 12,-147 16,-147 16,-127 12,-127 12,-123 16,-123 16,-119 118.25,-119 118.25,-155"/>
<polyline fill="none" stroke="black" points="16,-151 20,-151 20,-147 16,-147"/>
<polyline fill="none" stroke="black" points="16,-127 20,-127 20,-123 16,-123"/>
<text xml:space="preserve" text-anchor="middle" x="67.13" y="-140.25" font-family="Helvetica,sans-Serif" font-size="10.00">psutil</text>
<text xml:space="preserve" text-anchor="middle" x="67.13" y="-127.5" font-family="Helvetica,sans-Serif" font-size="10.00">(CPU, Mem, Disk)</text>
</g>
<!-- collector -->
<g id="node2" class="node">
<title>collector</title>
<path fill="#90caf9" stroke="black" d="M234,-155C234,-155 198.5,-155 198.5,-155 192.5,-155 186.5,-149 186.5,-143 186.5,-143 186.5,-131 186.5,-131 186.5,-125 192.5,-119 198.5,-119 198.5,-119 234,-119 234,-119 240,-119 246,-125 246,-131 246,-131 246,-143 246,-143 246,-149 240,-155 234,-155"/>
<text xml:space="preserve" text-anchor="middle" x="216.25" y="-140.25" font-family="Helvetica,sans-Serif" font-size="10.00">Collector</text>
<text xml:space="preserve" text-anchor="middle" x="216.25" y="-127.5" font-family="Helvetica,sans-Serif" font-size="10.00">Service</text>
</g>
<!-- psutil&#45;&gt;collector -->
<g id="edge1" class="edge">
<title>psutil&#45;&gt;collector</title>
<path fill="none" stroke="black" d="M118.35,-137C136.74,-137 157.31,-137 174.75,-137"/>
<polygon fill="black" stroke="black" points="174.75,-140.5 184.75,-137 174.75,-133.5 174.75,-140.5"/>
<text xml:space="preserve" text-anchor="middle" x="152.38" y="-139.7" font-family="Helvetica,sans-Serif" font-size="9.00">Metrics</text>
</g>
<!-- aggregator -->
<g id="node3" class="node">
<title>aggregator</title>
<path fill="#a5d6a7" stroke="black" d="M373,-155C373,-155 327,-155 327,-155 321,-155 315,-149 315,-143 315,-143 315,-131 315,-131 315,-125 321,-119 327,-119 327,-119 373,-119 373,-119 379,-119 385,-125 385,-131 385,-131 385,-143 385,-143 385,-149 379,-155 373,-155"/>
<text xml:space="preserve" text-anchor="middle" x="350" y="-140.25" font-family="Helvetica,sans-Serif" font-size="10.00">Aggregator</text>
<text xml:space="preserve" text-anchor="middle" x="350" y="-127.5" font-family="Helvetica,sans-Serif" font-size="10.00">(gRPC)</text>
</g>
<!-- collector&#45;&gt;aggregator -->
<g id="edge2" class="edge">
<title>collector&#45;&gt;aggregator</title>
<path fill="none" stroke="black" d="M246.49,-137C263.19,-137 284.49,-137 303.35,-137"/>
<polygon fill="black" stroke="black" points="303.2,-140.5 313.2,-137 303.2,-133.5 303.2,-140.5"/>
<text xml:space="preserve" text-anchor="middle" x="280.5" y="-150.95" font-family="Helvetica,sans-Serif" font-size="9.00">gRPC</text>
<text xml:space="preserve" text-anchor="middle" x="280.5" y="-139.7" font-family="Helvetica,sans-Serif" font-size="9.00">Stream</text>
</g>
<!-- validate -->
<g id="node4" class="node">
<title>validate</title>
<path fill="#c8e6c9" stroke="black" d="M477.54,-165.08C477.54,-165.08 432.71,-142.42 432.71,-142.42 427.35,-139.71 427.35,-134.29 432.71,-131.58 432.71,-131.58 477.54,-108.92 477.54,-108.92 482.9,-106.21 493.6,-106.21 498.96,-108.92 498.96,-108.92 543.79,-131.58 543.79,-131.58 549.15,-134.29 549.15,-139.71 543.79,-142.42 543.79,-142.42 498.96,-165.08 498.96,-165.08 493.6,-167.79 482.9,-167.79 477.54,-165.08"/>
<text xml:space="preserve" text-anchor="middle" x="488.25" y="-140.25" font-family="Helvetica,sans-Serif" font-size="10.00">Validate &amp;</text>
<text xml:space="preserve" text-anchor="middle" x="488.25" y="-127.5" font-family="Helvetica,sans-Serif" font-size="10.00">Normalize</text>
</g>
<!-- aggregator&#45;&gt;validate -->
<g id="edge3" class="edge">
<title>aggregator&#45;&gt;validate</title>
<path fill="none" stroke="black" d="M385.38,-137C392.95,-137 401.25,-137 409.76,-137"/>
<polygon fill="black" stroke="black" points="409.49,-140.5 419.49,-137 409.49,-133.5 409.49,-140.5"/>
</g>
<!-- redis_state -->
<g id="node5" class="node">
<title>redis_state</title>
<path fill="#ffcc80" stroke="black" d="M731.88,-155.84C731.88,-158.15 713.83,-160.03 691.62,-160.03 669.42,-160.03 651.38,-158.15 651.38,-155.84 651.38,-155.84 651.38,-118.16 651.38,-118.16 651.38,-115.85 669.42,-113.97 691.62,-113.97 713.83,-113.97 731.88,-115.85 731.88,-118.16 731.88,-118.16 731.88,-155.84 731.88,-155.84"/>
<path fill="none" stroke="black" d="M731.88,-155.84C731.88,-153.53 713.83,-151.66 691.62,-151.66 669.42,-151.66 651.38,-153.53 651.38,-155.84"/>
<text xml:space="preserve" text-anchor="middle" x="691.62" y="-140.25" font-family="Helvetica,sans-Serif" font-size="10.00">Redis</text>
<text xml:space="preserve" text-anchor="middle" x="691.62" y="-127.5" font-family="Helvetica,sans-Serif" font-size="10.00">Current State</text>
</g>
<!-- validate&#45;&gt;redis_state -->
<g id="edge4" class="edge">
<title>validate&#45;&gt;redis_state</title>
<path fill="none" stroke="black" d="M555.47,-137C582.9,-137 614.22,-137 639.8,-137"/>
<polygon fill="black" stroke="black" points="639.6,-140.5 649.6,-137 639.6,-133.5 639.6,-140.5"/>
<text xml:space="preserve" text-anchor="middle" x="588.62" y="-139.7" font-family="Helvetica,sans-Serif" font-size="9.00">Upsert</text>
</g>
<!-- redis_pubsub -->
<g id="node6" class="node">
<title>redis_pubsub</title>
<path fill="#ffb74d" stroke="black" d="M729.05,-78.12C729.05,-78.12 721.56,-87.24 721.56,-87.24 717.82,-91.79 708.18,-96.35 702.28,-96.35 702.28,-96.35 680.97,-96.35 680.97,-96.35 675.07,-96.35 665.43,-91.79 661.69,-87.24 661.69,-87.24 654.2,-78.12 654.2,-78.12 650.46,-73.56 650.46,-64.44 654.2,-59.88 654.2,-59.88 661.69,-50.76 661.69,-50.76 665.43,-46.21 675.07,-41.65 680.97,-41.65 680.97,-41.65 702.28,-41.65 702.28,-41.65 708.18,-41.65 717.82,-46.21 721.56,-50.76 721.56,-50.76 729.05,-59.88 729.05,-59.88 732.79,-64.44 732.79,-73.56 729.05,-78.12"/>
<text xml:space="preserve" text-anchor="middle" x="691.62" y="-72.25" font-family="Helvetica,sans-Serif" font-size="10.00">Redis</text>
<text xml:space="preserve" text-anchor="middle" x="691.62" y="-59.5" font-family="Helvetica,sans-Serif" font-size="10.00">Pub/Sub</text>
</g>
<!-- validate&#45;&gt;redis_pubsub -->
<g id="edge5" class="edge">
<title>validate&#45;&gt;redis_pubsub</title>
<path fill="none" stroke="black" d="M529.04,-123.57C562.44,-112.28 610.18,-96.17 645.1,-84.37"/>
<polygon fill="black" stroke="black" points="646.17,-87.71 654.53,-81.19 643.93,-81.07 646.17,-87.71"/>
<text xml:space="preserve" text-anchor="middle" x="588.62" y="-109.77" font-family="Helvetica,sans-Serif" font-size="9.00">Publish</text>
</g>
<!-- raw -->
<g id="node7" class="node">
<title>raw</title>
<path fill="#f8bbd9" stroke="black" d="M729.62,-250.84C729.62,-253.15 712.59,-255.03 691.62,-255.03 670.66,-255.03 653.62,-253.15 653.62,-250.84 653.62,-250.84 653.62,-213.16 653.62,-213.16 653.62,-210.85 670.66,-208.97 691.62,-208.97 712.59,-208.97 729.62,-210.85 729.62,-213.16 729.62,-213.16 729.62,-250.84 729.62,-250.84"/>
<path fill="none" stroke="black" d="M729.62,-250.84C729.62,-248.53 712.59,-246.66 691.62,-246.66 670.66,-246.66 653.62,-248.53 653.62,-250.84"/>
<text xml:space="preserve" text-anchor="middle" x="691.62" y="-235.25" font-family="Helvetica,sans-Serif" font-size="10.00">metrics_raw</text>
<text xml:space="preserve" text-anchor="middle" x="691.62" y="-222.5" font-family="Helvetica,sans-Serif" font-size="10.00">(5s, 24h)</text>
</g>
<!-- validate&#45;&gt;raw -->
<g id="edge6" class="edge">
<title>validate&#45;&gt;raw</title>
<path fill="none" stroke="black" d="M523.01,-153.3C548.24,-165.44 583.6,-182.37 614.75,-197 623.81,-201.26 633.5,-205.76 642.83,-210.07"/>
<polygon fill="black" stroke="black" points="641.22,-213.19 651.77,-214.2 644.16,-206.83 641.22,-213.19"/>
<text xml:space="preserve" text-anchor="middle" x="588.62" y="-194.9" font-family="Helvetica,sans-Serif" font-size="9.00">Insert</text>
</g>
<!-- alerts -->
<g id="node10" class="node">
<title>alerts</title>
<path fill="#c5cae9" stroke="black" d="M902.38,-106C902.38,-106 872.38,-106 872.38,-106 866.38,-106 860.38,-100 860.38,-94 860.38,-94 860.38,-82 860.38,-82 860.38,-76 866.38,-70 872.38,-70 872.38,-70 902.38,-70 902.38,-70 908.38,-70 914.38,-76 914.38,-82 914.38,-82 914.38,-94 914.38,-94 914.38,-100 908.38,-106 902.38,-106"/>
<text xml:space="preserve" text-anchor="middle" x="887.38" y="-91.25" font-family="Helvetica,sans-Serif" font-size="10.00">Alert</text>
<text xml:space="preserve" text-anchor="middle" x="887.38" y="-78.5" font-family="Helvetica,sans-Serif" font-size="10.00">Service</text>
</g>
<!-- redis_pubsub&#45;&gt;alerts -->
<g id="edge7" class="edge">
<title>redis_pubsub&#45;&gt;alerts</title>
<path fill="none" stroke="black" d="M733.71,-73.03C767.65,-76.36 815.43,-81.04 848.46,-84.28"/>
<polygon fill="black" stroke="black" points="848.11,-87.76 858.4,-85.26 848.79,-80.8 848.11,-87.76"/>
<text xml:space="preserve" text-anchor="middle" x="805" y="-85.09" font-family="Helvetica,sans-Serif" font-size="9.00">metrics.*</text>
</g>
<!-- gateway -->
<g id="node11" class="node">
<title>gateway</title>
<path fill="#9fa8da" stroke="black" d="M913.75,-52C913.75,-52 861,-52 861,-52 855,-52 849,-46 849,-40 849,-40 849,-28 849,-28 849,-22 855,-16 861,-16 861,-16 913.75,-16 913.75,-16 919.75,-16 925.75,-22 925.75,-28 925.75,-28 925.75,-40 925.75,-40 925.75,-46 919.75,-52 913.75,-52"/>
<text xml:space="preserve" text-anchor="middle" x="887.38" y="-37.25" font-family="Helvetica,sans-Serif" font-size="10.00">Gateway</text>
<text xml:space="preserve" text-anchor="middle" x="887.38" y="-24.5" font-family="Helvetica,sans-Serif" font-size="10.00">(WebSocket)</text>
</g>
<!-- redis_pubsub&#45;&gt;gateway -->
<g id="edge8" class="edge">
<title>redis_pubsub&#45;&gt;gateway</title>
<path fill="none" stroke="black" d="M731.37,-62C761.89,-56.49 804.64,-48.77 837.51,-42.83"/>
<polygon fill="black" stroke="black" points="837.98,-46.3 847.2,-41.08 836.74,-39.41 837.98,-46.3"/>
<text xml:space="preserve" text-anchor="middle" x="805" y="-55.25" font-family="Helvetica,sans-Serif" font-size="9.00">metrics.*</text>
</g>
<!-- agg_1m -->
<g id="node8" class="node">
<title>agg_1m</title>
<path fill="#f48fb1" stroke="black" d="M924.25,-250.84C924.25,-253.15 907.72,-255.03 887.38,-255.03 867.03,-255.03 850.5,-253.15 850.5,-250.84 850.5,-250.84 850.5,-213.16 850.5,-213.16 850.5,-210.85 867.03,-208.97 887.38,-208.97 907.72,-208.97 924.25,-210.85 924.25,-213.16 924.25,-213.16 924.25,-250.84 924.25,-250.84"/>
<path fill="none" stroke="black" d="M924.25,-250.84C924.25,-248.53 907.72,-246.66 887.38,-246.66 867.03,-246.66 850.5,-248.53 850.5,-250.84"/>
<text xml:space="preserve" text-anchor="middle" x="887.38" y="-235.25" font-family="Helvetica,sans-Serif" font-size="10.00">metrics_1m</text>
<text xml:space="preserve" text-anchor="middle" x="887.38" y="-222.5" font-family="Helvetica,sans-Serif" font-size="10.00">(1m, 7d)</text>
</g>
<!-- raw&#45;&gt;agg_1m -->
<g id="edge9" class="edge">
<title>raw&#45;&gt;agg_1m</title>
<path fill="none" stroke="black" stroke-dasharray="5,2" d="M729.98,-232C760.97,-232 805.22,-232 838.74,-232"/>
<polygon fill="black" stroke="black" points="838.6,-235.5 848.6,-232 838.6,-228.5 838.6,-235.5"/>
<text xml:space="preserve" text-anchor="middle" x="805" y="-245.95" font-family="Helvetica,sans-Serif" font-size="9.00">Continuous</text>
<text xml:space="preserve" text-anchor="middle" x="805" y="-234.7" font-family="Helvetica,sans-Serif" font-size="9.00">Aggregate</text>
</g>
<!-- lambda -->
<g id="node12" class="node">
<title>lambda</title>
<path fill="#7986cb" stroke="black" stroke-dasharray="5,2" d="M910.38,-160C910.38,-160 864.38,-160 864.38,-160 858.38,-160 852.38,-154 852.38,-148 852.38,-148 852.38,-136 852.38,-136 852.38,-130 858.38,-124 864.38,-124 864.38,-124 910.38,-124 910.38,-124 916.38,-124 922.38,-130 922.38,-136 922.38,-136 922.38,-148 922.38,-148 922.38,-154 916.38,-160 910.38,-160"/>
<text xml:space="preserve" text-anchor="middle" x="887.38" y="-145.25" font-family="Helvetica,sans-Serif" font-size="10.00">Lambda</text>
<text xml:space="preserve" text-anchor="middle" x="887.38" y="-132.5" font-family="Helvetica,sans-Serif" font-size="10.00">Aggregator</text>
</g>
<!-- raw&#45;&gt;lambda -->
<g id="edge11" class="edge">
<title>raw&#45;&gt;lambda</title>
<path fill="none" stroke="black" stroke-dasharray="1,5" d="M729.81,-215.18C742.43,-209.45 756.59,-202.98 769.5,-197 793.37,-185.95 819.91,-173.48 841.65,-163.21"/>
<polygon fill="black" stroke="black" points="843,-166.44 850.54,-159.01 840,-160.12 843,-166.44"/>
<text xml:space="preserve" text-anchor="middle" x="805" y="-205.05" font-family="Helvetica,sans-Serif" font-size="9.00">SQS</text>
<text xml:space="preserve" text-anchor="middle" x="805" y="-193.8" font-family="Helvetica,sans-Serif" font-size="9.00">Trigger</text>
</g>
<!-- agg_1h -->
<g id="node9" class="node">
<title>agg_1h</title>
<path fill="#ec407a" stroke="black" d="M1062.5,-250.84C1062.5,-253.15 1046.81,-255.03 1027.5,-255.03 1008.19,-255.03 992.5,-253.15 992.5,-250.84 992.5,-250.84 992.5,-213.16 992.5,-213.16 992.5,-210.85 1008.19,-208.97 1027.5,-208.97 1046.81,-208.97 1062.5,-210.85 1062.5,-213.16 1062.5,-213.16 1062.5,-250.84 1062.5,-250.84"/>
<path fill="none" stroke="black" d="M1062.5,-250.84C1062.5,-248.53 1046.81,-246.66 1027.5,-246.66 1008.19,-246.66 992.5,-248.53 992.5,-250.84"/>
<text xml:space="preserve" text-anchor="middle" x="1027.5" y="-235.25" font-family="Helvetica,sans-Serif" font-size="10.00">metrics_1h</text>
<text xml:space="preserve" text-anchor="middle" x="1027.5" y="-222.5" font-family="Helvetica,sans-Serif" font-size="10.00">(1h, 90d)</text>
</g>
<!-- agg_1m&#45;&gt;agg_1h -->
<g id="edge10" class="edge">
<title>agg_1m&#45;&gt;agg_1h</title>
<path fill="none" stroke="black" stroke-dasharray="5,2" d="M924.67,-232C941.93,-232 962.74,-232 981.04,-232"/>
<polygon fill="black" stroke="black" points="980.84,-235.5 990.84,-232 980.84,-228.5 980.84,-235.5"/>
<text xml:space="preserve" text-anchor="middle" x="959.88" y="-245.95" font-family="Helvetica,sans-Serif" font-size="9.00">Hourly</text>
<text xml:space="preserve" text-anchor="middle" x="959.88" y="-234.7" font-family="Helvetica,sans-Serif" font-size="9.00">Job</text>
</g>
<!-- lambda&#45;&gt;agg_1m -->
<g id="edge12" class="edge">
<title>lambda&#45;&gt;agg_1m</title>
<path fill="none" stroke="black" stroke-dasharray="1,5" d="M887.38,-160.21C887.38,-170.91 887.38,-184.78 887.38,-197.47"/>
<polygon fill="black" stroke="black" points="883.88,-197.16 887.38,-207.16 890.88,-197.16 883.88,-197.16"/>
<text xml:space="preserve" text-anchor="middle" x="873.12" y="-187.18" font-family="Helvetica,sans-Serif" font-size="9.00">Batch</text>
<text xml:space="preserve" text-anchor="middle" x="873.12" y="-175.93" font-family="Helvetica,sans-Serif" font-size="9.00">Write</text>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 17 KiB

View File

@@ -0,0 +1,95 @@
// Deployment architecture diagram: local development options, the AWS
// environment, the CI/CD path that feeds both, and the external collectors.
digraph Deployment {
    rankdir=TB;
    // compound=true is needed for the lhead= cluster targets on the
    // collector edges at the bottom of this file.
    compound=true;
    fontname="Helvetica";
    node [fontname="Helvetica", fontsize=10];
    edge [fontname="Helvetica", fontsize=9];
    labelloc="t";
    label="Deployment Architecture";
    fontsize=14;

    // Default node look. A node that sets its own style= replaces this whole
    // list, so it must repeat "filled" if it also sets a fillcolor.
    node [shape=box, style="rounded,filled"];

    // Local Development
    subgraph cluster_local {
        label="Local Development";
        style=filled;
        fillcolor="#E3F2FD";

        subgraph cluster_kind {
            label="Kind Cluster";
            style=filled;
            fillcolor="#BBDEFB";

            tilt [label="Tilt\n(Live Reload)", shape=component, fillcolor="#90CAF9"];
            k8s_local [label="K8s Pods\n(via Kustomize)", fillcolor="#64B5F6"];
        }

        // Dashed outline marks this as the alternative path. "filled" is
        // included so the fillcolor is actually painted; with only
        // "rounded,dashed" the node rendered with no fill.
        compose [label="Docker Compose\n(Alternative)", fillcolor="#90CAF9", style="rounded,dashed,filled"];
    }

    // AWS Staging/Demo
    subgraph cluster_aws {
        label="AWS (sysmonstm.mcrn.ar)";
        style=filled;
        fillcolor="#E8F5E9";

        subgraph cluster_ec2 {
            label="EC2 t2.small";
            style=filled;
            fillcolor="#C8E6C9";

            compose_ec2 [label="Docker Compose\n(All Services)", fillcolor="#A5D6A7"];
            nginx [label="Nginx\n(SSL Termination)", fillcolor="#81C784"];
        }

        subgraph cluster_lambda {
            label="Lambda (Data Processing)";
            style=filled;
            fillcolor="#DCEDC8";

            lambda_agg [label="Aggregator\nLambda", fillcolor="#AED581"];
            lambda_compact [label="Compactor\nLambda", fillcolor="#9CCC65"];
        }

        sqs [label="SQS\n(Buffer)", shape=hexagon, fillcolor="#FFE082"];
        s3 [label="S3\n(Backup)", shape=cylinder, fillcolor="#FFE082"];
    }

    // CI/CD
    subgraph cluster_cicd {
        label="CI/CD";
        style=filled;
        fillcolor="#F3E5F5";

        woodpecker [label="Woodpecker CI", fillcolor="#CE93D8"];
        registry [label="Container\nRegistry", shape=cylinder, fillcolor="#BA68C8"];
    }

    // Collectors (External)
    subgraph cluster_collectors {
        label="Monitored Machines";
        style=dashed;
        color=gray;

        coll1 [label="Collector\n(Machine 1)", fillcolor="#FFCCBC"];
        coll2 [label="Collector\n(Machine 2)", fillcolor="#FFCCBC"];
        coll3 [label="Collector\n(Machine N)", fillcolor="#FFCCBC"];
    }

    // Connections

    // Invisible edge: affects layout only (ranks tilt above k8s_local).
    tilt -> k8s_local [style=invis];

    // Image flow: CI pushes to the registry, environments pull from it.
    woodpecker -> registry [label="Push"];
    registry -> compose_ec2 [label="Pull"];
    registry -> k8s_local [label="Pull", style=dashed];

    // Traffic and data flow inside AWS.
    nginx -> compose_ec2 [label="Proxy"];
    compose_ec2 -> sqs [label="Events"];
    sqs -> lambda_agg [label="Trigger"];
    lambda_compact -> s3 [label="Archive"];

    // Collector edges target compose_ec2 but are clipped at the EC2 cluster
    // border via lhead, so they read as "to the EC2 host".
    coll1 -> compose_ec2 [label="gRPC", lhead=cluster_ec2];
    coll2 -> compose_ec2 [label="gRPC", lhead=cluster_ec2];
    coll3 -> compose_ec2 [label="gRPC", lhead=cluster_ec2];
}

View File

@@ -0,0 +1,221 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 14.1.1 (0)
-->
<!-- Title: Deployment Pages: 1 -->
<svg width="872pt" height="662pt"
viewBox="0.00 0.00 872.00 662.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 658.3)">
<title>Deployment</title>
<polygon fill="white" stroke="none" points="-4,4 -4,-658.3 868,-658.3 868,4 -4,4"/>
<text xml:space="preserve" text-anchor="middle" x="432" y="-637" font-family="Helvetica,sans-Serif" font-size="14.00">Deployment Architecture</text>
<g id="clust1" class="cluster">
<title>cluster_local</title>
<polygon fill="#e3f2fd" stroke="black" points="8,-307.77 8,-514.55 238,-514.55 238,-307.77 8,-307.77"/>
<text xml:space="preserve" text-anchor="middle" x="123" y="-497.25" font-family="Helvetica,sans-Serif" font-size="14.00">Local Development</text>
</g>
<g id="clust2" class="cluster">
<title>cluster_kind</title>
<polygon fill="#bbdefb" stroke="black" points="16,-315.77 16,-481.3 124,-481.3 124,-315.77 16,-315.77"/>
<text xml:space="preserve" text-anchor="middle" x="70" y="-464" font-family="Helvetica,sans-Serif" font-size="14.00">Kind Cluster</text>
</g>
<g id="clust3" class="cluster">
<title>cluster_aws</title>
<polygon fill="#e8f5e9" stroke="black" points="642,-8 642,-514.55 856,-514.55 856,-8 642,-8"/>
<text xml:space="preserve" text-anchor="middle" x="749" y="-497.25" font-family="Helvetica,sans-Serif" font-size="14.00">AWS (sysmonstm.mcrn.ar)</text>
</g>
<g id="clust4" class="cluster">
<title>cluster_ec2</title>
<polygon fill="#c8e6c9" stroke="black" points="650,-315.77 650,-481.3 768,-481.3 768,-315.77 650,-315.77"/>
<text xml:space="preserve" text-anchor="middle" x="709" y="-464" font-family="Helvetica,sans-Serif" font-size="14.00">EC2 t2.small</text>
</g>
<g id="clust5" class="cluster">
<title>cluster_lambda</title>
<polygon fill="#dcedc8" stroke="black" points="650,-101.31 650,-178.56 848,-178.56 848,-101.31 650,-101.31"/>
<text xml:space="preserve" text-anchor="middle" x="749" y="-161.26" font-family="Helvetica,sans-Serif" font-size="14.00">Lambda (Data Processing)</text>
</g>
<g id="clust6" class="cluster">
<title>cluster_cicd</title>
<polygon fill="#f3e5f5" stroke="black" points="246,-399.02 246,-621.05 350,-621.05 350,-399.02 246,-399.02"/>
<text xml:space="preserve" text-anchor="middle" x="298" y="-603.75" font-family="Helvetica,sans-Serif" font-size="14.00">CI/CD</text>
</g>
<g id="clust7" class="cluster">
<title>cluster_collectors</title>
<polygon fill="none" stroke="gray" stroke-dasharray="5,2" points="358,-404.05 358,-481.3 634,-481.3 634,-404.05 358,-404.05"/>
<text xml:space="preserve" text-anchor="middle" x="496" y="-464" font-family="Helvetica,sans-Serif" font-size="14.00">Monitored Machines</text>
</g>
<!-- tilt -->
<g id="node1" class="node">
<title>tilt</title>
<polygon fill="#90caf9" stroke="black" points="110.25,-448.05 29.75,-448.05 29.75,-444.05 25.75,-444.05 25.75,-440.05 29.75,-440.05 29.75,-420.05 25.75,-420.05 25.75,-416.05 29.75,-416.05 29.75,-412.05 110.25,-412.05 110.25,-448.05"/>
<polyline fill="none" stroke="black" points="29.75,-444.05 33.75,-444.05 33.75,-440.05 29.75,-440.05"/>
<polyline fill="none" stroke="black" points="29.75,-420.05 33.75,-420.05 33.75,-416.05 29.75,-416.05"/>
<text xml:space="preserve" text-anchor="middle" x="70" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Tilt</text>
<text xml:space="preserve" text-anchor="middle" x="70" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(Live Reload)</text>
</g>
<!-- k8s_local -->
<g id="node2" class="node">
<title>k8s_local</title>
<path fill="#64b5f6" stroke="black" d="M104.25,-359.77C104.25,-359.77 35.75,-359.77 35.75,-359.77 29.75,-359.77 23.75,-353.77 23.75,-347.77 23.75,-347.77 23.75,-335.77 23.75,-335.77 23.75,-329.77 29.75,-323.77 35.75,-323.77 35.75,-323.77 104.25,-323.77 104.25,-323.77 110.25,-323.77 116.25,-329.77 116.25,-335.77 116.25,-335.77 116.25,-347.77 116.25,-347.77 116.25,-353.77 110.25,-359.77 104.25,-359.77"/>
<text xml:space="preserve" text-anchor="middle" x="70" y="-345.02" font-family="Helvetica,sans-Serif" font-size="10.00">K8s Pods</text>
<text xml:space="preserve" text-anchor="middle" x="70" y="-332.27" font-family="Helvetica,sans-Serif" font-size="10.00">(via Kustomize)</text>
</g>
<!-- tilt&#45;&gt;k8s_local -->
<!-- compose -->
<g id="node3" class="node">
<title>compose</title>
<path fill="none" stroke="black" stroke-dasharray="5,2" d="M218.25,-448.05C218.25,-448.05 143.75,-448.05 143.75,-448.05 137.75,-448.05 131.75,-442.05 131.75,-436.05 131.75,-436.05 131.75,-424.05 131.75,-424.05 131.75,-418.05 137.75,-412.05 143.75,-412.05 143.75,-412.05 218.25,-412.05 218.25,-412.05 224.25,-412.05 230.25,-418.05 230.25,-424.05 230.25,-424.05 230.25,-436.05 230.25,-436.05 230.25,-442.05 224.25,-448.05 218.25,-448.05"/>
<text xml:space="preserve" text-anchor="middle" x="181" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Docker Compose</text>
<text xml:space="preserve" text-anchor="middle" x="181" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(Alternative)</text>
</g>
<!-- compose_ec2 -->
<g id="node4" class="node">
<title>compose_ec2</title>
<path fill="#a5d6a7" stroke="black" d="M744.25,-359.77C744.25,-359.77 669.75,-359.77 669.75,-359.77 663.75,-359.77 657.75,-353.77 657.75,-347.77 657.75,-347.77 657.75,-335.77 657.75,-335.77 657.75,-329.77 663.75,-323.77 669.75,-323.77 669.75,-323.77 744.25,-323.77 744.25,-323.77 750.25,-323.77 756.25,-329.77 756.25,-335.77 756.25,-335.77 756.25,-347.77 756.25,-347.77 756.25,-353.77 750.25,-359.77 744.25,-359.77"/>
<text xml:space="preserve" text-anchor="middle" x="707" y="-345.02" font-family="Helvetica,sans-Serif" font-size="10.00">Docker Compose</text>
<text xml:space="preserve" text-anchor="middle" x="707" y="-332.27" font-family="Helvetica,sans-Serif" font-size="10.00">(All Services)</text>
</g>
<!-- sqs -->
<g id="node8" class="node">
<title>sqs</title>
<path fill="#ffe082" stroke="black" d="M742.89,-252.28C742.89,-252.28 735.71,-261.4 735.71,-261.4 732.12,-265.96 722.73,-270.52 716.93,-270.52 716.93,-270.52 697.07,-270.52 697.07,-270.52 691.27,-270.52 681.88,-265.96 678.29,-261.4 678.29,-261.4 671.11,-252.28 671.11,-252.28 667.52,-247.72 667.52,-238.61 671.11,-234.05 671.11,-234.05 678.29,-224.93 678.29,-224.93 681.88,-220.37 691.27,-215.81 697.07,-215.81 697.07,-215.81 716.93,-215.81 716.93,-215.81 722.73,-215.81 732.12,-220.37 735.71,-224.93 735.71,-224.93 742.89,-234.05 742.89,-234.05 746.48,-238.61 746.48,-247.72 742.89,-252.28"/>
<text xml:space="preserve" text-anchor="middle" x="707" y="-246.42" font-family="Helvetica,sans-Serif" font-size="10.00">SQS</text>
<text xml:space="preserve" text-anchor="middle" x="707" y="-233.67" font-family="Helvetica,sans-Serif" font-size="10.00">(Buffer)</text>
</g>
<!-- compose_ec2&#45;&gt;sqs -->
<g id="edge6" class="edge">
<title>compose_ec2&#45;&gt;sqs</title>
<path fill="none" stroke="black" d="M707,-323.5C707,-311.94 707,-296.26 707,-281.89"/>
<polygon fill="black" stroke="black" points="710.5,-282.27 707,-272.27 703.5,-282.27 710.5,-282.27"/>
<text xml:space="preserve" text-anchor="middle" x="722.38" y="-291.22" font-family="Helvetica,sans-Serif" font-size="9.00">Events</text>
</g>
<!-- nginx -->
<g id="node5" class="node">
<title>nginx</title>
<path fill="#81c784" stroke="black" d="M747.75,-448.05C747.75,-448.05 670.25,-448.05 670.25,-448.05 664.25,-448.05 658.25,-442.05 658.25,-436.05 658.25,-436.05 658.25,-424.05 658.25,-424.05 658.25,-418.05 664.25,-412.05 670.25,-412.05 670.25,-412.05 747.75,-412.05 747.75,-412.05 753.75,-412.05 759.75,-418.05 759.75,-424.05 759.75,-424.05 759.75,-436.05 759.75,-436.05 759.75,-442.05 753.75,-448.05 747.75,-448.05"/>
<text xml:space="preserve" text-anchor="middle" x="709" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Nginx</text>
<text xml:space="preserve" text-anchor="middle" x="709" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(SSL Termination)</text>
</g>
<!-- nginx&#45;&gt;compose_ec2 -->
<g id="edge5" class="edge">
<title>nginx&#45;&gt;compose_ec2</title>
<path fill="none" stroke="black" d="M708.6,-411.59C708.33,-400.13 707.98,-384.86 707.67,-371.63"/>
<polygon fill="black" stroke="black" points="711.17,-371.63 707.44,-361.72 704.17,-371.79 711.17,-371.63"/>
<text xml:space="preserve" text-anchor="middle" x="720.43" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">Proxy</text>
</g>
<!-- lambda_agg -->
<g id="node6" class="node">
<title>lambda_agg</title>
<path fill="#aed581" stroke="black" d="M730,-145.31C730,-145.31 684,-145.31 684,-145.31 678,-145.31 672,-139.31 672,-133.31 672,-133.31 672,-121.31 672,-121.31 672,-115.31 678,-109.31 684,-109.31 684,-109.31 730,-109.31 730,-109.31 736,-109.31 742,-115.31 742,-121.31 742,-121.31 742,-133.31 742,-133.31 742,-139.31 736,-145.31 730,-145.31"/>
<text xml:space="preserve" text-anchor="middle" x="707" y="-130.56" font-family="Helvetica,sans-Serif" font-size="10.00">Aggregator</text>
<text xml:space="preserve" text-anchor="middle" x="707" y="-117.81" font-family="Helvetica,sans-Serif" font-size="10.00">Lambda</text>
</g>
<!-- lambda_compact -->
<g id="node7" class="node">
<title>lambda_compact</title>
<path fill="#9ccc65" stroke="black" d="M822.62,-145.31C822.62,-145.31 777.38,-145.31 777.38,-145.31 771.38,-145.31 765.38,-139.31 765.38,-133.31 765.38,-133.31 765.38,-121.31 765.38,-121.31 765.38,-115.31 771.38,-109.31 777.38,-109.31 777.38,-109.31 822.62,-109.31 822.62,-109.31 828.62,-109.31 834.62,-115.31 834.62,-121.31 834.62,-121.31 834.62,-133.31 834.62,-133.31 834.62,-139.31 828.62,-145.31 822.62,-145.31"/>
<text xml:space="preserve" text-anchor="middle" x="800" y="-130.56" font-family="Helvetica,sans-Serif" font-size="10.00">Compactor</text>
<text xml:space="preserve" text-anchor="middle" x="800" y="-117.81" font-family="Helvetica,sans-Serif" font-size="10.00">Lambda</text>
</g>
<!-- s3 -->
<g id="node9" class="node">
<title>s3</title>
<path fill="#ffe082" stroke="black" d="M829.38,-57.88C829.38,-60.19 816.21,-62.06 800,-62.06 783.79,-62.06 770.62,-60.19 770.62,-57.88 770.62,-57.88 770.62,-20.19 770.62,-20.19 770.62,-17.88 783.79,-16 800,-16 816.21,-16 829.38,-17.88 829.38,-20.19 829.38,-20.19 829.38,-57.88 829.38,-57.88"/>
<path fill="none" stroke="black" d="M829.38,-57.88C829.38,-55.56 816.21,-53.69 800,-53.69 783.79,-53.69 770.62,-55.56 770.62,-57.88"/>
<text xml:space="preserve" text-anchor="middle" x="800" y="-42.28" font-family="Helvetica,sans-Serif" font-size="10.00">S3</text>
<text xml:space="preserve" text-anchor="middle" x="800" y="-29.53" font-family="Helvetica,sans-Serif" font-size="10.00">(Backup)</text>
</g>
<!-- lambda_compact&#45;&gt;s3 -->
<g id="edge8" class="edge">
<title>lambda_compact&#45;&gt;s3</title>
<path fill="none" stroke="black" d="M800,-108.85C800,-98.81 800,-85.84 800,-73.88"/>
<polygon fill="black" stroke="black" points="803.5,-73.9 800,-63.9 796.5,-73.9 803.5,-73.9"/>
<text xml:space="preserve" text-anchor="middle" x="816.88" y="-82.76" font-family="Helvetica,sans-Serif" font-size="9.00">Archive</text>
</g>
<!-- sqs&#45;&gt;lambda_agg -->
<g id="edge7" class="edge">
<title>sqs&#45;&gt;lambda_agg</title>
<path fill="none" stroke="black" d="M707,-215.47C707,-197.96 707,-175.06 707,-157.13"/>
<polygon fill="black" stroke="black" points="710.5,-157.15 707,-147.15 703.5,-157.15 710.5,-157.15"/>
<text xml:space="preserve" text-anchor="middle" x="722.75" y="-189.26" font-family="Helvetica,sans-Serif" font-size="9.00">Trigger</text>
</g>
<!-- woodpecker -->
<g id="node10" class="node">
<title>woodpecker</title>
<path fill="#ce93d8" stroke="black" d="M330,-587.8C330,-587.8 266,-587.8 266,-587.8 260,-587.8 254,-581.8 254,-575.8 254,-575.8 254,-563.8 254,-563.8 254,-557.8 260,-551.8 266,-551.8 266,-551.8 330,-551.8 330,-551.8 336,-551.8 342,-557.8 342,-563.8 342,-563.8 342,-575.8 342,-575.8 342,-581.8 336,-587.8 330,-587.8"/>
<text xml:space="preserve" text-anchor="middle" x="298" y="-566.67" font-family="Helvetica,sans-Serif" font-size="10.00">Woodpecker CI</text>
</g>
<!-- registry -->
<g id="node11" class="node">
<title>registry</title>
<path fill="#ba68c8" stroke="black" d="M329.62,-448.89C329.62,-451.2 315.45,-453.08 298,-453.08 280.55,-453.08 266.38,-451.2 266.38,-448.89 266.38,-448.89 266.38,-411.21 266.38,-411.21 266.38,-408.89 280.55,-407.02 298,-407.02 315.45,-407.02 329.62,-408.89 329.62,-411.21 329.62,-411.21 329.62,-448.89 329.62,-448.89"/>
<path fill="none" stroke="black" d="M329.62,-448.89C329.62,-446.58 315.45,-444.71 298,-444.71 280.55,-444.71 266.38,-446.58 266.38,-448.89"/>
<text xml:space="preserve" text-anchor="middle" x="298" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Container</text>
<text xml:space="preserve" text-anchor="middle" x="298" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">Registry</text>
</g>
<!-- woodpecker&#45;&gt;registry -->
<g id="edge2" class="edge">
<title>woodpecker&#45;&gt;registry</title>
<path fill="none" stroke="black" d="M298,-551.35C298,-529.66 298,-492.15 298,-464.77"/>
<polygon fill="black" stroke="black" points="301.5,-464.88 298,-454.88 294.5,-464.88 301.5,-464.88"/>
<text xml:space="preserve" text-anchor="middle" x="308.88" y="-525.25" font-family="Helvetica,sans-Serif" font-size="9.00">Push</text>
</g>
<!-- registry&#45;&gt;k8s_local -->
<g id="edge4" class="edge">
<title>registry&#45;&gt;k8s_local</title>
<path fill="none" stroke="black" stroke-dasharray="5,2" d="M265.9,-410.59C258.2,-406.51 249.91,-402.4 242,-399.02 204.6,-383.02 161.03,-368.81 127.1,-358.68"/>
<polygon fill="black" stroke="black" points="128.47,-355.44 117.89,-355.97 126.49,-362.15 128.47,-355.44"/>
<text xml:space="preserve" text-anchor="middle" x="222.42" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">Pull</text>
</g>
<!-- registry&#45;&gt;compose_ec2 -->
<g id="edge3" class="edge">
<title>registry&#45;&gt;compose_ec2</title>
<path fill="none" stroke="black" d="M329.84,-409.93C337.55,-405.88 345.91,-401.95 354,-399.02 452.44,-363.35 574.46,-350.26 646.22,-345.49"/>
<polygon fill="black" stroke="black" points="646.02,-349.01 655.78,-344.88 645.58,-342.02 646.02,-349.01"/>
<text xml:space="preserve" text-anchor="middle" x="427.09" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">Pull</text>
</g>
<!-- coll1 -->
<g id="node12" class="node">
<title>coll1</title>
<path fill="#ffccbc" stroke="black" d="M521.88,-448.05C521.88,-448.05 472.12,-448.05 472.12,-448.05 466.12,-448.05 460.12,-442.05 460.12,-436.05 460.12,-436.05 460.12,-424.05 460.12,-424.05 460.12,-418.05 466.12,-412.05 472.12,-412.05 472.12,-412.05 521.88,-412.05 521.88,-412.05 527.88,-412.05 533.88,-418.05 533.88,-424.05 533.88,-424.05 533.88,-436.05 533.88,-436.05 533.88,-442.05 527.88,-448.05 521.88,-448.05"/>
<text xml:space="preserve" text-anchor="middle" x="497" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Collector</text>
<text xml:space="preserve" text-anchor="middle" x="497" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(Machine 1)</text>
</g>
<!-- coll1&#45;&gt;compose_ec2 -->
<g id="edge9" class="edge">
<title>coll1&#45;&gt;compose_ec2</title>
<path fill="none" stroke="black" d="M521.16,-411.67C528.02,-407.19 535.63,-402.62 543,-399.02 576.02,-382.89 614.85,-369.35 646.44,-359.6"/>
<polygon fill="black" stroke="black" points="640.37,-365.52 648.58,-358.82 637.98,-358.94 640.37,-365.52"/>
<text xml:space="preserve" text-anchor="middle" x="602.75" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">gRPC</text>
</g>
<!-- coll2 -->
<g id="node13" class="node">
<title>coll2</title>
<path fill="#ffccbc" stroke="black" d="M613.88,-448.05C613.88,-448.05 564.12,-448.05 564.12,-448.05 558.12,-448.05 552.12,-442.05 552.12,-436.05 552.12,-436.05 552.12,-424.05 552.12,-424.05 552.12,-418.05 558.12,-412.05 564.12,-412.05 564.12,-412.05 613.88,-412.05 613.88,-412.05 619.88,-412.05 625.88,-418.05 625.88,-424.05 625.88,-424.05 625.88,-436.05 625.88,-436.05 625.88,-442.05 619.88,-448.05 613.88,-448.05"/>
<text xml:space="preserve" text-anchor="middle" x="589" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Collector</text>
<text xml:space="preserve" text-anchor="middle" x="589" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(Machine 2)</text>
</g>
<!-- coll2&#45;&gt;compose_ec2 -->
<g id="edge10" class="edge">
<title>coll2&#45;&gt;compose_ec2</title>
<path fill="none" stroke="black" d="M612.88,-411.59C621.13,-405.55 630.83,-398.47 640.8,-391.17"/>
<polygon fill="black" stroke="black" points="642.77,-394.07 648.78,-385.34 638.64,-388.41 642.77,-394.07"/>
<text xml:space="preserve" text-anchor="middle" x="670.19" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">gRPC</text>
</g>
<!-- coll3 -->
<g id="node14" class="node">
<title>coll3</title>
<path fill="#ffccbc" stroke="black" d="M429.62,-448.05C429.62,-448.05 378.38,-448.05 378.38,-448.05 372.38,-448.05 366.38,-442.05 366.38,-436.05 366.38,-436.05 366.38,-424.05 366.38,-424.05 366.38,-418.05 372.38,-412.05 378.38,-412.05 378.38,-412.05 429.62,-412.05 429.62,-412.05 435.62,-412.05 441.62,-418.05 441.62,-424.05 441.62,-424.05 441.62,-436.05 441.62,-436.05 441.62,-442.05 435.62,-448.05 429.62,-448.05"/>
<text xml:space="preserve" text-anchor="middle" x="404" y="-433.3" font-family="Helvetica,sans-Serif" font-size="10.00">Collector</text>
<text xml:space="preserve" text-anchor="middle" x="404" y="-420.55" font-family="Helvetica,sans-Serif" font-size="10.00">(Machine N)</text>
</g>
<!-- coll3&#45;&gt;compose_ec2 -->
<g id="edge11" class="edge">
<title>coll3&#45;&gt;compose_ec2</title>
<path fill="none" stroke="black" d="M427.53,-411.82C434.78,-407.12 442.97,-402.41 451,-399.02 514.86,-372.07 593.36,-357.28 646.47,-349.71"/>
<polygon fill="black" stroke="black" points="639.16,-354.39 648.5,-349.4 638.08,-347.48 639.16,-354.39"/>
<text xml:space="preserve" text-anchor="middle" x="516.54" y="-380.47" font-family="Helvetica,sans-Serif" font-size="9.00">gRPC</text>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 18 KiB

View File

@@ -0,0 +1,67 @@
// gRPC service definitions diagram: one cluster per service holding the
// service record and its message records, plus a cluster of shared enums.
digraph GrpcServices {
    rankdir=LR;
    compound=true;
    fontname="Helvetica";
    node [fontname="Helvetica", fontsize=10];
    edge [fontname="Helvetica", fontsize=9];
    labelloc="t";
    label="gRPC Service Definitions";
    fontsize=14;

    // Record nodes: "|" separates fields inside a label and "\l" ends a
    // left-justified line. Every row ends with "\l" so rows align.
    node [shape=record, style=filled];

    // MetricsService
    subgraph cluster_metrics {
        label="MetricsService";
        style=filled;
        fillcolor="#E8F5E9";

        metrics_svc [label="{MetricsService|+ StreamMetrics(stream Metric) → StreamAck\l+ GetCurrentState(StateRequest) → MachineState\l+ GetAllStates(Empty) → AllMachinesState\l}", fillcolor="#C8E6C9"];
        metric_msg [label="{Metric|machine_id: string\lhostname: string\ltimestamp_ms: int64\ltype: MetricType\lvalue: double\llabels: map\l}", fillcolor="#A5D6A7"];
        machine_state [label="{MachineState|machine_id: string\lhostname: string\llast_seen_ms: int64\lcurrent_metrics: Metric[]\lhealth: HealthStatus\lmetadata: map\l}", fillcolor="#A5D6A7"];
    }

    // ControlService
    subgraph cluster_control {
        label="ControlService";
        style=filled;
        fillcolor="#E3F2FD";

        control_svc [label="{ControlService|+ Control(stream Command) → stream Response\l}", fillcolor="#90CAF9"];
        commands [label="{ControlCommand|command_id: string\l|UpdateIntervalCommand\lRestartCollectionCommand\lShutdownCommand\l}", fillcolor="#64B5F6"];
    }

    // ConfigService
    subgraph cluster_config {
        label="ConfigService";
        style=filled;
        fillcolor="#FFF3E0";

        config_svc [label="{ConfigService|+ GetConfig(ConfigRequest) → CollectorConfig\l+ WatchConfig(ConfigRequest) → stream CollectorConfig\l}", fillcolor="#FFE0B2"];
        collector_config [label="{CollectorConfig|collection_interval_seconds: int32\lenabled_metrics: MetricType[]\llabels: map\lthresholds: ThresholdConfig[]\l}", fillcolor="#FFCC80"];
    }

    // Enums
    subgraph cluster_enums {
        label="Enums";
        style=filled;
        fillcolor="#F3E5F5";

        // The trailing "..." row is terminated with "\l" like every other
        // row; without it Graphviz centres that one line.
        metric_type [label="{MetricType|CPU_PERCENT\lMEMORY_PERCENT\lDISK_PERCENT\lNETWORK_*\lLOAD_AVG_*\l...\l}", fillcolor="#E1BEE7"];
        health_status [label="{HealthStatus|HEALTHY\lWARNING\lCRITICAL\lUNKNOWN\lOFFLINE\l}", fillcolor="#CE93D8"];
    }

    // Relationships
    // Dashed: service -> message record drawn in the same cluster.
    metrics_svc -> metric_msg [style=dashed];
    metrics_svc -> machine_state [style=dashed];
    control_svc -> commands [style=dashed];
    config_svc -> collector_config [style=dashed];

    // Dotted: message -> enum record it references.
    metric_msg -> metric_type [style=dotted];
    machine_state -> health_status [style=dotted];
}

View File

@@ -0,0 +1,171 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
"http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<!-- Generated by graphviz version 14.1.1 (0)
-->
<!-- Title: GrpcServices Pages: 1 -->
<svg width="1030pt" height="486pt"
viewBox="0.00 0.00 1030.00 486.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 482.25)">
<title>GrpcServices</title>
<polygon fill="white" stroke="none" points="-4,4 -4,-482.25 1026.25,-482.25 1026.25,4 -4,4"/>
<text xml:space="preserve" text-anchor="middle" x="511.12" y="-460.95" font-family="Helvetica,sans-Serif" font-size="14.00">gRPC Service Definitions</text>
<g id="clust1" class="cluster">
<title>cluster_metrics</title>
<polygon fill="#e8f5e9" stroke="black" points="21.5,-8 21.5,-239 726.75,-239 726.75,-8 21.5,-8"/>
<text xml:space="preserve" text-anchor="middle" x="374.12" y="-221.7" font-family="Helvetica,sans-Serif" font-size="14.00">MetricsService</text>
</g>
<g id="clust2" class="cluster">
<title>cluster_control</title>
<polygon fill="#e3f2fd" stroke="black" points="23.38,-247 23.38,-336 799.25,-336 799.25,-247 23.38,-247"/>
<text xml:space="preserve" text-anchor="middle" x="411.31" y="-318.7" font-family="Helvetica,sans-Serif" font-size="14.00">ControlService</text>
</g>
<g id="clust3" class="cluster">
<title>cluster_config</title>
<polygon fill="#fff3e0" stroke="black" points="8,-344 8,-445 753,-445 753,-344 8,-344"/>
<text xml:space="preserve" text-anchor="middle" x="380.5" y="-427.7" font-family="Helvetica,sans-Serif" font-size="14.00">ConfigService</text>
</g>
<g id="clust4" class="cluster">
<title>cluster_enums</title>
<polygon fill="#f3e5f5" stroke="black" points="819.25,-11 819.25,-229 1014.25,-229 1014.25,-11 819.25,-11"/>
<text xml:space="preserve" text-anchor="middle" x="916.75" y="-211.7" font-family="Helvetica,sans-Serif" font-size="14.00">Enums</text>
</g>
<!-- metrics_svc -->
<g id="node1" class="node">
<title>metrics_svc</title>
<polygon fill="#c8e6c9" stroke="black" points="29.5,-87.88 29.5,-134.12 377.25,-134.12 377.25,-87.88 29.5,-87.88"/>
<text xml:space="preserve" text-anchor="middle" x="73.5" y="-107.88" font-family="Helvetica,sans-Serif" font-size="10.00">MetricsService</text>
<polyline fill="none" stroke="black" points="117.5,-87.88 117.5,-134.12"/>
<text xml:space="preserve" text-anchor="start" x="125.5" y="-120.62" font-family="Helvetica,sans-Serif" font-size="10.00">+ StreamMetrics(stream Metric) → StreamAck</text>
<text xml:space="preserve" text-anchor="start" x="125.5" y="-107.88" font-family="Helvetica,sans-Serif" font-size="10.00">+ GetCurrentState(StateRequest) → MachineState</text>
<text xml:space="preserve" text-anchor="start" x="125.5" y="-95.12" font-family="Helvetica,sans-Serif" font-size="10.00">+ GetAllStates(Empty) → AllMachinesState</text>
</g>
<!-- metric_msg -->
<g id="node2" class="node">
<title>metric_msg</title>
<polygon fill="#a5d6a7" stroke="black" points="525.5,-16.75 525.5,-101.25 692.5,-101.25 692.5,-16.75 525.5,-16.75"/>
<text xml:space="preserve" text-anchor="middle" x="548.88" y="-55.88" font-family="Helvetica,sans-Serif" font-size="10.00">Metric</text>
<polyline fill="none" stroke="black" points="572.25,-16.75 572.25,-101.25"/>
<text xml:space="preserve" text-anchor="start" x="580.25" y="-87.75" font-family="Helvetica,sans-Serif" font-size="10.00">machine_id: string</text>
<text xml:space="preserve" text-anchor="start" x="580.25" y="-75" font-family="Helvetica,sans-Serif" font-size="10.00">hostname: string</text>
<text xml:space="preserve" text-anchor="start" x="580.25" y="-62.25" font-family="Helvetica,sans-Serif" font-size="10.00">timestamp_ms: int64</text>
<text xml:space="preserve" text-anchor="start" x="580.25" y="-49.5" font-family="Helvetica,sans-Serif" font-size="10.00">type: MetricType</text>
<text xml:space="preserve" text-anchor="start" x="580.25" y="-36.75" font-family="Helvetica,sans-Serif" font-size="10.00">value: double</text>
<text xml:space="preserve" text-anchor="start" x="580.25" y="-24" font-family="Helvetica,sans-Serif" font-size="10.00">labels: map</text>
</g>
<!-- metrics_svc&#45;&gt;metric_msg -->
<g id="edge1" class="edge">
<title>metrics_svc&#45;&gt;metric_msg</title>
<path fill="none" stroke="black" stroke-dasharray="5,2" d="M377.6,-88.68C424.41,-82.65 473.31,-76.35 513.96,-71.12"/>
<polygon fill="black" stroke="black" points="514.22,-74.61 523.69,-69.86 513.33,-67.67 514.22,-74.61"/>
</g>
<!-- machine_state -->
<g id="node3" class="node">
<title>machine_state</title>
<polygon fill="#a5d6a7" stroke="black" points="499.25,-120.75 499.25,-205.25 718.75,-205.25 718.75,-120.75 499.25,-120.75"/>
<text xml:space="preserve" text-anchor="middle" x="540.62" y="-159.88" font-family="Helvetica,sans-Serif" font-size="10.00">MachineState</text>
<polyline fill="none" stroke="black" points="582,-120.75 582,-205.25"/>
<text xml:space="preserve" text-anchor="start" x="590" y="-191.75" font-family="Helvetica,sans-Serif" font-size="10.00">machine_id: string</text>
<text xml:space="preserve" text-anchor="start" x="590" y="-179" font-family="Helvetica,sans-Serif" font-size="10.00">hostname: string</text>
<text xml:space="preserve" text-anchor="start" x="590" y="-166.25" font-family="Helvetica,sans-Serif" font-size="10.00">last_seen_ms: int64</text>
<text xml:space="preserve" text-anchor="start" x="590" y="-153.5" font-family="Helvetica,sans-Serif" font-size="10.00">current_metrics: Metric[]</text>
<text xml:space="preserve" text-anchor="start" x="590" y="-140.75" font-family="Helvetica,sans-Serif" font-size="10.00">health: HealthStatus</text>
<text xml:space="preserve" text-anchor="start" x="590" y="-128" font-family="Helvetica,sans-Serif" font-size="10.00">metadata: map</text>
</g>
<!-- metrics_svc&#45;&gt;machine_state -->
<g id="edge2" class="edge">
<title>metrics_svc&#45;&gt;machine_state</title>
<path fill="none" stroke="black" stroke-dasharray="5,2" d="M377.6,-133.32C414.74,-138.1 453.2,-143.06 487.8,-147.51"/>
<polygon fill="black" stroke="black" points="487.03,-150.94 497.4,-148.75 487.93,-144 487.03,-150.94"/>
</g>
<!-- metric_type -->
<g id="node8" class="node">
<title>metric_type</title>
<polygon fill="#e1bee7" stroke="black" points="827.25,-19.75 827.25,-104.25 1006.25,-104.25 1006.25,-19.75 827.25,-19.75"/>
<text xml:space="preserve" text-anchor="middle" x="861.88" y="-58.88" font-family="Helvetica,sans-Serif" font-size="10.00">MetricType</text>
<polyline fill="none" stroke="black" points="896.5,-19.75 896.5,-104.25"/>
<text xml:space="preserve" text-anchor="start" x="904.5" y="-90.75" font-family="Helvetica,sans-Serif" font-size="10.00">CPU_PERCENT</text>
<text xml:space="preserve" text-anchor="start" x="904.5" y="-78" font-family="Helvetica,sans-Serif" font-size="10.00">MEMORY_PERCENT</text>
<text xml:space="preserve" text-anchor="start" x="904.5" y="-65.25" font-family="Helvetica,sans-Serif" font-size="10.00">DISK_PERCENT</text>
<text xml:space="preserve" text-anchor="start" x="904.5" y="-52.5" font-family="Helvetica,sans-Serif" font-size="10.00">NETWORK_*</text>
<text xml:space="preserve" text-anchor="start" x="904.5" y="-39.75" font-family="Helvetica,sans-Serif" font-size="10.00">LOAD_AVG_*</text>
<text xml:space="preserve" text-anchor="middle" x="951.38" y="-27" font-family="Helvetica,sans-Serif" font-size="10.00">...</text>
</g>
<!-- metric_msg&#45;&gt;metric_type -->
<g id="edge5" class="edge">
<title>metric_msg&#45;&gt;metric_type</title>
<path fill="none" stroke="black" stroke-dasharray="1,5" d="M692.74,-59.81C730.57,-60.18 775.71,-60.63 815.45,-61.02"/>
<polygon fill="black" stroke="black" points="815.23,-64.51 825.27,-61.11 815.3,-57.51 815.23,-64.51"/>
</g>
<!-- health_status -->
<g id="node9" class="node">
<title>health_status</title>
<polygon fill="#ce93d8" stroke="black" points="842.25,-123.12 842.25,-194.88 991.25,-194.88 991.25,-123.12 842.25,-123.12"/>
<text xml:space="preserve" text-anchor="middle" x="881.75" y="-155.88" font-family="Helvetica,sans-Serif" font-size="10.00">HealthStatus</text>
<polyline fill="none" stroke="black" points="921.25,-123.12 921.25,-194.88"/>
<text xml:space="preserve" text-anchor="start" x="929.25" y="-181.38" font-family="Helvetica,sans-Serif" font-size="10.00">HEALTHY</text>
<text xml:space="preserve" text-anchor="start" x="929.25" y="-168.62" font-family="Helvetica,sans-Serif" font-size="10.00">WARNING</text>
<text xml:space="preserve" text-anchor="start" x="929.25" y="-155.88" font-family="Helvetica,sans-Serif" font-size="10.00">CRITICAL</text>
<text xml:space="preserve" text-anchor="start" x="929.25" y="-143.12" font-family="Helvetica,sans-Serif" font-size="10.00">UNKNOWN</text>
<text xml:space="preserve" text-anchor="start" x="929.25" y="-130.38" font-family="Helvetica,sans-Serif" font-size="10.00">OFFLINE</text>
</g>
<!-- machine_state&#45;&gt;health_status -->
<g id="edge6" class="edge">
<title>machine_state&#45;&gt;health_status</title>
<path fill="none" stroke="black" stroke-dasharray="1,5" d="M719.09,-161.57C755.76,-161.09 796.1,-160.57 830.65,-160.11"/>
<polygon fill="black" stroke="black" points="830.67,-163.61 840.62,-159.98 830.58,-156.61 830.67,-163.61"/>
</g>
<!-- control_svc -->
<g id="node4" class="node">
<title>control_svc</title>
<polygon fill="#90caf9" stroke="black" points="31.38,-261 31.38,-297 375.38,-297 375.38,-261 31.38,-261"/>
<text xml:space="preserve" text-anchor="middle" x="75" y="-276" font-family="Helvetica,sans-Serif" font-size="10.00">ControlService</text>
<polyline fill="none" stroke="black" points="118.62,-261.25 118.62,-297"/>
<text xml:space="preserve" text-anchor="start" x="126.62" y="-276" font-family="Helvetica,sans-Serif" font-size="10.00">+ Control(stream Command) → stream Response</text>
</g>
<!-- commands -->
<g id="node5" class="node">
<title>commands</title>
<polygon fill="#64b5f6" stroke="black" points="426.75,-255.88 426.75,-302.12 791.25,-302.12 791.25,-255.88 426.75,-255.88"/>
<text xml:space="preserve" text-anchor="middle" x="477.5" y="-275.88" font-family="Helvetica,sans-Serif" font-size="10.00">ControlCommand</text>
<polyline fill="none" stroke="black" points="528.25,-255.88 528.25,-302.12"/>
<text xml:space="preserve" text-anchor="start" x="536.25" y="-275.88" font-family="Helvetica,sans-Serif" font-size="10.00">command_id: string</text>
<polyline fill="none" stroke="black" points="641,-255.88 641,-302.12"/>
<text xml:space="preserve" text-anchor="start" x="649" y="-288.62" font-family="Helvetica,sans-Serif" font-size="10.00">UpdateIntervalCommand</text>
<text xml:space="preserve" text-anchor="start" x="649" y="-275.88" font-family="Helvetica,sans-Serif" font-size="10.00">RestartCollectionCommand</text>
<text xml:space="preserve" text-anchor="start" x="649" y="-263.12" font-family="Helvetica,sans-Serif" font-size="10.00">ShutdownCommand</text>
</g>
<!-- control_svc&#45;&gt;commands -->
<g id="edge3" class="edge">
<title>control_svc&#45;&gt;commands</title>
<path fill="none" stroke="black" stroke-dasharray="5,2" d="M375.84,-279C388.79,-279 401.92,-279 414.99,-279"/>
<polygon fill="black" stroke="black" points="414.95,-282.5 424.95,-279 414.95,-275.5 414.95,-282.5"/>
</g>
<!-- config_svc -->
<g id="node6" class="node">
<title>config_svc</title>
<polygon fill="#ffe0b2" stroke="black" points="16,-364 16,-400 390.75,-400 390.75,-364 16,-364"/>
<text xml:space="preserve" text-anchor="middle" x="57.38" y="-379.12" font-family="Helvetica,sans-Serif" font-size="10.00">ConfigService</text>
<polyline fill="none" stroke="black" points="98.75,-364.5 98.75,-400"/>
<text xml:space="preserve" text-anchor="start" x="106.75" y="-385.5" font-family="Helvetica,sans-Serif" font-size="10.00">+ GetConfig(ConfigRequest) → CollectorConfig</text>
<text xml:space="preserve" text-anchor="start" x="106.75" y="-372.75" font-family="Helvetica,sans-Serif" font-size="10.00">+ WatchConfig(ConfigRequest) → stream CollectorConfig</text>
</g>
<!-- collector_config -->
<g id="node7" class="node">
<title>collector_config</title>
<polygon fill="#ffcc80" stroke="black" points="473,-352.5 473,-411.5 745,-411.5 745,-352.5 473,-352.5"/>
<text xml:space="preserve" text-anchor="middle" x="518.12" y="-378.88" font-family="Helvetica,sans-Serif" font-size="10.00">CollectorConfig</text>
<polyline fill="none" stroke="black" points="563.25,-352.5 563.25,-411.5"/>
<text xml:space="preserve" text-anchor="start" x="571.25" y="-398" font-family="Helvetica,sans-Serif" font-size="10.00">collection_interval_seconds: int32</text>
<text xml:space="preserve" text-anchor="start" x="571.25" y="-385.25" font-family="Helvetica,sans-Serif" font-size="10.00">enabled_metrics: MetricType[]</text>
<text xml:space="preserve" text-anchor="start" x="571.25" y="-372.5" font-family="Helvetica,sans-Serif" font-size="10.00">labels: map</text>
<text xml:space="preserve" text-anchor="start" x="571.25" y="-359.75" font-family="Helvetica,sans-Serif" font-size="10.00">thresholds: ThresholdConfig[]</text>
</g>
<!-- config_svc&#45;&gt;collector_config -->
<g id="edge4" class="edge">
<title>config_svc&#45;&gt;collector_config</title>
<path fill="none" stroke="black" stroke-dasharray="5,2" d="M391.12,-382C414.61,-382 438.36,-382 461.11,-382"/>
<polygon fill="black" stroke="black" points="461.03,-385.5 471.03,-382 461.03,-378.5 461.03,-385.5"/>
</g>
</g>
</svg>

After

Width:  |  Height:  |  Size: 13 KiB

View File

@@ -0,0 +1,120 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Graph Viewer - System Monitor</title>
<link rel="stylesheet" href="styles.css">
</head>
<body class="graph-viewer">
<header class="graph-header">
    <a href="index.html" class="back-link">← Index</a>
    <!-- Previous/next diagram. The buttons carry a visible glyph and an
         aria-label; previously they were empty, so they rendered as blank
         boxes and had no accessible name. -->
    <div class="nav-controls">
        <button type="button" onclick="navigate(-1)" id="btn-prev" title="Previous (←)" aria-label="Previous graph">←</button>
        <span id="nav-position">1 / 4</span>
        <button type="button" onclick="navigate(1)" id="btn-next" title="Next (→)" aria-label="Next graph">→</button>
    </div>
    <h1 id="graph-title">Loading...</h1>
    <!-- Sizing modes (applied as a class on #graph-container) and download. -->
    <div class="graph-controls">
        <button type="button" onclick="setMode('fit')">Fit</button>
        <button type="button" onclick="setMode('fit-width')">Width</button>
        <button type="button" onclick="setMode('fit-height')">Height</button>
        <button type="button" onclick="setMode('actual-size')">100%</button>
        <button type="button" onclick="downloadSvg()">↓ SVG</button>
    </div>
</header>
<div class="graph-container" id="graph-container">
<img id="graph-img" src="" alt="Graph">
</div>
<script>
// Display order of the diagrams; prev/next navigation walks this list.
const graphOrder = ['01-system-overview', '02-data-flow', '03-deployment', '04-grpc-services'];

// Title and SVG file for each diagram, keyed by the id used in `?g=<id>`.
const graphs = {
    '01-system-overview': { title: 'System Overview', file: '01-system-overview.svg' },
    '02-data-flow': { title: 'Data Flow Pipeline', file: '02-data-flow.svg' },
    '03-deployment': { title: 'Deployment Architecture', file: '03-deployment.svg' },
    '04-grpc-services': { title: 'gRPC Service Definitions', file: '04-grpc-services.svg' }
};

// Initial selection comes from the query string. A missing id defaults to the
// first diagram; an id that is not in graphOrder falls back to index 0.
const params = new URLSearchParams(window.location.search);
let graphKey = params.get('g') || '01-system-overview';
let currentIndex = Math.max(0, graphOrder.indexOf(graphKey));
// Show the diagram registered under `key`: swap the image, update the heading
// and document title, mirror the selection into the URL (?g=<key>) without
// adding a history entry, then refresh the prev/next controls.
// An unknown key is ignored so the current diagram stays on screen.
function loadGraph(key) {
    const graph = graphs[key];
    if (!graph) {
        // Previously this fell through and threw on `graph.title`.
        return;
    }
    const img = document.getElementById('graph-img');
    img.src = graph.file;
    // Keep the alt text in step with the diagram instead of the static "Graph".
    img.alt = graph.title;
    document.getElementById('graph-title').textContent = graph.title;
    document.title = graph.title + ' - System Monitor';
    // Encode defensively; the current ids encode to themselves.
    history.replaceState(null, '', '?g=' + encodeURIComponent(key));
    graphKey = key;
    updateNavHints();
}
// Sync the navigation controls with the current diagram: disable "previous"
// on the first entry, "next" on the last, and show "<position> / <total>".
function updateNavHints() {
    const position = graphOrder.indexOf(graphKey);
    const total = graphOrder.length;
    const previousButton = document.getElementById('btn-prev');
    const nextButton = document.getElementById('btn-next');
    previousButton.disabled = position === 0;
    nextButton.disabled = position === total - 1;
    document.getElementById('nav-position').textContent = `${position + 1} / ${total}`;
}
// Move `direction` steps through graphOrder (-1 = previous, 1 = next).
// Does nothing when the target would fall outside the list.
function navigate(direction) {
    const target = graphOrder.indexOf(graphKey) + direction;
    const inRange = target >= 0 && target < graphOrder.length;
    if (!inRange) {
        return;
    }
    currentIndex = target;
    loadGraph(graphOrder[target]);
}
// Select a sizing mode by replacing the container's class list with
// "graph-container <mode>"; styles.css defines the per-mode image rules.
function setMode(mode) {
    document.getElementById('graph-container').className = `graph-container ${mode}`;
}
// Download the current diagram's SVG by clicking a temporary <a download>
// element that points at the file and suggests the same file name.
function downloadSvg() {
    const { file } = graphs[graphKey];
    const anchor = document.createElement('a');
    Object.assign(anchor, { href: file, download: file });
    anchor.click();
}
// Keyboard navigation: ←/→ step through the diagrams, Escape returns to the index.
document.addEventListener('keydown', (e) => {
    // Leave modified keys to the browser/OS. Alt+← / Alt+→ (Cmd on macOS) are
    // history back/forward and previously also switched the diagram.
    if (e.altKey || e.ctrlKey || e.metaKey || e.shiftKey) {
        return;
    }
    if (e.key === 'ArrowLeft') {
        navigate(-1);
    } else if (e.key === 'ArrowRight') {
        navigate(1);
    } else if (e.key === 'Escape') {
        window.location.href = 'index.html';
    }
});
// Initialize: start in "fit" sizing mode and show the diagram selected from
// the query string (or the first one).
setMode('fit');
loadGraph(graphOrder[currentIndex]);
</script>
</body>
</html>

View File

@@ -0,0 +1,207 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>System Monitor - Architecture Documentation</title>
<link rel="stylesheet" href="styles.css">
</head>
<body>
<header>
<h1>System Monitoring Platform</h1>
<p class="subtitle">Architecture &amp; Design Documentation</p>
</header>
<main>
<section class="graph-section" id="overview">
<div class="graph-header-row">
<h2>System Overview</h2>
<a href="graph.html?g=01-system-overview" class="view-btn">View Full</a>
</div>
<a href="graph.html?g=01-system-overview" class="graph-preview">
<img src="01-system-overview.svg" alt="System Overview">
</a>
<div class="graph-details">
<p>High-level architecture showing all services, data stores, and communication patterns.</p>
<h4>Key Components</h4>
<ul>
<li><strong>Collector</strong>: Runs on each monitored machine, streams metrics via gRPC</li>
<li><strong>Aggregator</strong>: Central gRPC server, receives streams, normalizes data</li>
<li><strong>Gateway</strong>: FastAPI service, WebSocket for browser, REST for queries</li>
<li><strong>Alerts</strong>: Subscribes to events, evaluates thresholds, triggers actions</li>
</ul>
</div>
</section>
<section class="graph-section" id="data-flow">
<div class="graph-header-row">
<h2>Data Flow Pipeline</h2>
<a href="graph.html?g=02-data-flow" class="view-btn">View Full</a>
</div>
<a href="graph.html?g=02-data-flow" class="graph-preview">
<img src="02-data-flow.svg" alt="Data Flow">
</a>
<div class="graph-details">
<p>How metrics flow from collection through storage with different retention tiers.</p>
<h4>Storage Tiers</h4>
<table class="details-table">
<thead>
<tr><th>Tier</th><th>Resolution</th><th>Retention</th><th>Use Case</th></tr>
</thead>
<tbody>
<tr>
<td>Hot (Redis)</td>
<td>5s</td>
<td>5 min</td>
<td>Current state, live dashboard</td>
</tr>
<tr>
<td>Raw (TimescaleDB)</td>
<td>5s</td>
<td>24h</td>
<td>Recent detailed analysis</td>
</tr>
<tr>
<td>1-min Aggregates</td>
<td>1m</td>
<td>7d</td>
<td>Week view, trends</td>
</tr>
<tr>
<td>1-hour Aggregates</td>
<td>1h</td>
<td>90d</td>
<td>Long-term analysis</td>
</tr>
</tbody>
</table>
</div>
</section>
<section class="graph-section" id="deployment">
<div class="graph-header-row">
<h2>Deployment Architecture</h2>
<a href="graph.html?g=03-deployment" class="view-btn">View Full</a>
</div>
<a href="graph.html?g=03-deployment" class="graph-preview">
<img src="03-deployment.svg" alt="Deployment">
</a>
<div class="graph-details">
<p>Deployment options from local development to AWS production.</p>
<h4>Environments</h4>
<ul>
<li><strong>Local Dev</strong>: Kind + Tilt for K8s, or Docker Compose</li>
<li><strong>Demo (EC2)</strong>: Docker Compose on t2.small at sysmonstm.mcrn.ar</li>
<li><strong>Lambda Pipeline</strong>: SQS-triggered aggregation for data processing experience</li>
</ul>
</div>
</section>
<section class="graph-section" id="grpc">
<div class="graph-header-row">
<h2>gRPC Service Definitions</h2>
<a href="graph.html?g=04-grpc-services" class="view-btn">View Full</a>
</div>
<a href="graph.html?g=04-grpc-services" class="graph-preview">
<img src="04-grpc-services.svg" alt="gRPC Services">
</a>
<div class="graph-details">
<p>Protocol Buffer service and message definitions.</p>
<h4>Services</h4>
<ul>
<li><strong>MetricsService</strong>: Client-side streaming for metrics ingestion</li>
<li><strong>ControlService</strong>: Bidirectional streaming for collector control</li>
<li><strong>ConfigService</strong>: Server-side streaming for config updates</li>
</ul>
</div>
</section>
<section class="findings-section">
<h2>Interview Talking Points</h2>
<div class="findings-grid">
<article class="finding-card">
<h3>Domain Mapping</h3>
<ul>
<li>Machine = Payment Processor</li>
<li>Metrics Stream = Transaction Stream</li>
<li>Thresholds = Fraud Detection</li>
<li>Aggregator = Payment Hub</li>
</ul>
</article>
<article class="finding-card">
<h3>gRPC Patterns</h3>
<ul>
<li>Client streaming (metrics)</li>
<li>Server streaming (config)</li>
<li>Bidirectional (control)</li>
<li>Health checking</li>
</ul>
</article>
<article class="finding-card">
<h3>Event-Driven</h3>
<ul>
<li>Redis Pub/Sub (current)</li>
<li>Abstraction for Kafka switch</li>
<li>Decoupled alert processing</li>
<li>Real-time WebSocket push</li>
</ul>
</article>
<article class="finding-card">
<h3>Resilience</h3>
<ul>
<li>Collectors are independent</li>
<li>Graceful degradation</li>
<li>Retry with backoff</li>
<li>Health checks everywhere</li>
</ul>
</article>
</div>
</section>
<section class="tech-section">
<h2>Technology Stack</h2>
<div class="tech-grid">
<div class="tech-column">
<h3>Core</h3>
<ul>
<li>Python 3.11+</li>
<li>FastAPI</li>
<li>gRPC / protobuf</li>
<li>asyncio</li>
</ul>
</div>
<div class="tech-column">
<h3>Data</h3>
<ul>
<li>TimescaleDB</li>
<li>Redis</li>
<li>Redis Pub/Sub</li>
</ul>
</div>
<div class="tech-column">
<h3>Infrastructure</h3>
<ul>
<li>Docker</li>
<li>Kubernetes</li>
<li>Kind + Tilt</li>
<li>Terraform</li>
</ul>
</div>
<div class="tech-column">
<h3>CI/CD</h3>
<ul>
<li>Woodpecker CI</li>
<li>Kustomize</li>
<li>Container Registry</li>
</ul>
</div>
</div>
</section>
</main>
<footer>
<p>System Monitoring Platform - Architecture Documentation</p>
<p class="date">Generated: <time datetime="2025-12-29">December 2025</time></p>
</footer>
</body>
</html>

View File

@@ -0,0 +1,343 @@
:root {
--bg-primary: #1a1a2e;
--bg-secondary: #16213e;
--bg-card: #0f3460;
--text-primary: #eee;
--text-secondary: #a0a0a0;
--accent: #e94560;
--accent-secondary: #533483;
--border: #2a2a4a;
}
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: 'Segoe UI', system-ui, -apple-system, sans-serif;
background: var(--bg-primary);
color: var(--text-primary);
line-height: 1.6;
}
header {
background: linear-gradient(135deg, var(--bg-secondary), var(--accent-secondary));
padding: 2rem;
text-align: center;
border-bottom: 2px solid var(--accent);
}
header h1 {
font-size: 2rem;
margin-bottom: 0.5rem;
}
header .subtitle {
color: var(--text-secondary);
font-size: 1rem;
}
main {
max-width: 1400px;
margin: 0 auto;
padding: 2rem;
}
/* Graph sections */
.graph-section {
background: var(--bg-secondary);
border-radius: 8px;
padding: 1.5rem;
margin-bottom: 2rem;
border: 1px solid var(--border);
}
.graph-header-row {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 1rem;
}
.graph-header-row h2 {
font-size: 1.25rem;
color: var(--accent);
}
.view-btn {
background: var(--accent);
color: white;
padding: 0.5rem 1rem;
border-radius: 4px;
text-decoration: none;
font-size: 0.875rem;
transition: opacity 0.2s;
}
.view-btn:hover {
opacity: 0.8;
}
.graph-preview {
display: block;
background: white;
border-radius: 4px;
padding: 1rem;
margin-bottom: 1rem;
overflow: auto;
max-height: 400px;
}
.graph-preview img {
max-width: 100%;
height: auto;
}
.graph-details {
color: var(--text-secondary);
font-size: 0.9rem;
}
.graph-details h4 {
color: var(--text-primary);
margin: 1rem 0 0.5rem;
}
.graph-details ul {
margin-left: 1.5rem;
}
.graph-details li {
margin-bottom: 0.25rem;
}
/* Tech section */
.tech-section {
background: var(--bg-secondary);
border-radius: 8px;
padding: 1.5rem;
margin-bottom: 2rem;
border: 1px solid var(--border);
}
.tech-section h2 {
color: var(--accent);
margin-bottom: 1rem;
}
.tech-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(200px, 1fr));
gap: 1.5rem;
}
.tech-column h3 {
color: var(--text-primary);
font-size: 1rem;
margin-bottom: 0.75rem;
padding-bottom: 0.5rem;
border-bottom: 1px solid var(--border);
}
.tech-column ul {
list-style: none;
}
.tech-column li {
padding: 0.25rem 0;
color: var(--text-secondary);
}
/* Findings */
.findings-section {
margin-bottom: 2rem;
}
.findings-section h2 {
color: var(--accent);
margin-bottom: 1rem;
}
.findings-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
gap: 1rem;
}
.finding-card {
background: var(--bg-secondary);
border-radius: 8px;
padding: 1.25rem;
border: 1px solid var(--border);
}
.finding-card h3 {
color: var(--accent);
font-size: 1rem;
margin-bottom: 0.75rem;
}
.finding-card ul {
margin-left: 1rem;
color: var(--text-secondary);
}
.finding-card code {
background: var(--bg-primary);
padding: 0.125rem 0.375rem;
border-radius: 3px;
font-size: 0.85em;
}
/* Footer */
footer {
text-align: center;
padding: 2rem;
color: var(--text-secondary);
border-top: 1px solid var(--border);
}
footer .date {
font-size: 0.85rem;
}
/* Graph viewer page */
body.graph-viewer {
display: flex;
flex-direction: column;
height: 100vh;
}
.graph-header {
display: flex;
align-items: center;
gap: 1rem;
padding: 0.75rem 1rem;
background: var(--bg-secondary);
border-bottom: 1px solid var(--border);
flex-wrap: wrap;
}
.back-link {
color: var(--accent);
text-decoration: none;
}
.nav-controls {
display: flex;
align-items: center;
gap: 0.5rem;
}
.nav-controls button {
background: var(--bg-card);
color: var(--text-primary);
border: 1px solid var(--border);
padding: 0.25rem 0.75rem;
border-radius: 4px;
cursor: pointer;
}
.nav-controls button:disabled {
opacity: 0.3;
cursor: not-allowed;
}
#nav-position {
color: var(--text-secondary);
font-size: 0.85rem;
}
.graph-header h1 {
flex: 1;
font-size: 1rem;
text-align: center;
}
.graph-controls {
display: flex;
gap: 0.5rem;
}
.graph-controls button {
background: var(--bg-card);
color: var(--text-primary);
border: 1px solid var(--border);
padding: 0.375rem 0.75rem;
border-radius: 4px;
cursor: pointer;
font-size: 0.85rem;
}
.graph-controls button:hover {
background: var(--accent);
}
/* Scrollable white canvas that takes the space left under the viewer header. */
.graph-container {
    flex: 1;
    overflow: auto;
    background: white;
    display: flex;
    align-items: flex-start;
    padding: 1rem;
}
/* Centre the image with auto margins rather than `justify-content: center`.
   When the image is wider than the container (100% / fit-height modes),
   centred flex content overflows to the left as well, and that part cannot
   be scrolled to. Auto margins collapse to 0 on overflow, so the image starts
   at the left edge and stays fully scrollable. */
.graph-container img {
    margin: 0 auto;
}
/* Sizing modes: exactly one of these classes is set on .graph-container.
   NOTE(review): 60px presumably approximates the header height; the header
   can wrap (flex-wrap), so confirm on narrow screens. */

/* fit: scale down to fit both width and height, keeping the aspect ratio. */
.graph-container.fit img {
    object-fit: contain;
    max-height: calc(100vh - 60px);
    max-width: 100%;
}

/* fit-width: stretch to the container width; height follows. */
.graph-container.fit-width img {
    height: auto;
    width: 100%;
}

/* fit-height: fill the viewport height below the header; width follows. */
.graph-container.fit-height img {
    width: auto;
    height: calc(100vh - 60px);
}

/* actual-size: intentionally empty so the image keeps its natural size. */
.graph-container.actual-size img {
    /* No constraints */
}
/* Tables */
.details-table {
width: 100%;
border-collapse: collapse;
margin: 1rem 0;
font-size: 0.85rem;
}
.details-table th,
.details-table td {
padding: 0.5rem;
text-align: left;
border-bottom: 1px solid var(--border);
}
.details-table th {
color: var(--text-primary);
background: var(--bg-primary);
}
.details-table td {
color: var(--text-secondary);
}
.details-table code {
background: var(--bg-primary);
padding: 0.125rem 0.375rem;
border-radius: 3px;
}
.note {
font-style: italic;
font-size: 0.85rem;
color: var(--text-secondary);
margin-top: 0.5rem;
}

View File

@@ -0,0 +1 @@
placeholder

Binary file not shown.

View File

@@ -0,0 +1 @@
placeholder

Binary file not shown.

148
infra/aws/terraform/ec2.tf Normal file
View File

@@ -0,0 +1,148 @@
# EC2 Instance for Docker Compose deployment
# Security group for the demo host: public HTTP/HTTPS and gRPC ingress,
# optional SSH from an allow-list, and unrestricted egress.
resource "aws_security_group" "sysmonstm" {
  name_prefix = "${var.project_name}-"
  description = "Security group for System Monitor Platform"

  # HTTP/HTTPS
  ingress {
    from_port   = 80
    to_port     = 80
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
    description = "HTTP"
  }

  ingress {
    from_port   = 443
    to_port     = 443
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
    description = "HTTPS"
  }

  # gRPC for collectors
  # NOTE(review): open to the whole internet; narrow to the collectors' CIDRs
  # if they are known.
  ingress {
    from_port   = 50051
    to_port     = 50051
    protocol    = "tcp"
    cidr_blocks = ["0.0.0.0/0"]
    description = "gRPC Aggregator"
  }

  # SSH (restricted): the rule is only created when var.allowed_ssh_cidrs is
  # non-empty, so an empty list means no SSH ingress at all.
  dynamic "ingress" {
    for_each = length(var.allowed_ssh_cidrs) > 0 ? [1] : []
    content {
      from_port   = 22
      to_port     = 22
      protocol    = "tcp"
      cidr_blocks = var.allowed_ssh_cidrs
      description = "SSH"
    }
  }

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
    description = "Allow all outbound"
  }

  tags = {
    Name = "${var.project_name}-sg"
  }

  # With name_prefix, a change that forces replacement (e.g. editing the
  # description) must create the new group before removing the old one;
  # otherwise the delete fails while the group is still attached to the
  # instance.
  lifecycle {
    create_before_destroy = true
  }
}
# IAM role for the instance. The trust policy lets only the EC2 service
# assume it.
resource "aws_iam_role" "ec2" {
  name_prefix = "${var.project_name}-ec2-"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect    = "Allow"
      Principal = { Service = "ec2.amazonaws.com" }
      Action    = "sts:AssumeRole"
    }]
  })
}
# Attach the AWS-managed AmazonSSMManagedInstanceCore policy to the instance role.
resource "aws_iam_role_policy_attachment" "ec2_ssm" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore"
  role       = aws_iam_role.ec2.name
}
# Instance profile wrapping the EC2 role so it can be attached to the instance.
resource "aws_iam_instance_profile" "ec2" {
  role        = aws_iam_role.ec2.name
  name_prefix = "${var.project_name}-"
}
# Demo server: Amazon Linux 2023 host that installs Docker and Docker Compose
# on first boot, clones the repository and starts the stack.
resource "aws_instance" "sysmonstm" {
  ami           = data.aws_ami.amazon_linux_2023.id
  instance_type = var.ec2_instance_type
  # An empty key name means "no key pair".
  key_name               = var.ec2_key_name != "" ? var.ec2_key_name : null
  vpc_security_group_ids = [aws_security_group.sysmonstm.id]
  iam_instance_profile   = aws_iam_instance_profile.ec2.name

  root_block_device {
    volume_size = 20
    volume_type = "gp3"
    encrypted   = true
  }

  # First-boot script. "<<-" strips the common leading indentation, so the
  # script and the nested "EOL" terminator end up at column 0.
  user_data = <<-EOF
    #!/bin/bash
    set -e
    # Install Docker
    dnf update -y
    dnf install -y docker git
    systemctl enable docker
    systemctl start docker
    # Install Docker Compose (standalone binary).
    # -f makes curl exit non-zero on an HTTP error instead of saving the error
    # page as the "binary"; the OS name is lower-cased because the release
    # assets are named e.g. docker-compose-linux-x86_64.
    curl -fsSL "https://github.com/docker/compose/releases/latest/download/docker-compose-$(uname -s | tr '[:upper:]' '[:lower:]')-$(uname -m)" \
      -o /usr/local/bin/docker-compose
    chmod +x /usr/local/bin/docker-compose
    # Add ec2-user to docker group
    usermod -aG docker ec2-user
    # Clone and start the application
    # NOTE(review): placeholder repository URL - replace before deploying.
    cd /home/ec2-user
    git clone https://github.com/yourusername/sysmonstm.git || true
    cd sysmonstm
    # Create .env file
    cat > .env <<EOL
    LOG_LEVEL=INFO
    MACHINE_ID=aws-demo
    EOL
    # Start services
    docker-compose up -d
  EOF

  tags = {
    Name = "${var.project_name}-server"
  }

  # A newer AMI matched by the data source must not replace a running instance.
  lifecycle {
    ignore_changes = [ami]
  }
}
# Elastic IP attached to the server so it has a stable public address.
resource "aws_eip" "sysmonstm" {
  domain   = "vpc"
  instance = aws_instance.sysmonstm.id

  tags = {
    Name = "${var.project_name}-eip"
  }
}

View File

@@ -0,0 +1,203 @@
# Lambda Functions for Data Processing Pipeline
# These are optional and enabled via enable_lambda_pipeline variable
# SQS queue that buffers metrics; created only when the Lambda pipeline is on.
resource "aws_sqs_queue" "metrics" {
  count = var.enable_lambda_pipeline ? 1 : 0

  name                      = "${var.project_name}-metrics"
  message_retention_seconds = 86400 # 24 hours

  # Hidden from other consumers for twice the Lambda timeout.
  visibility_timeout_seconds = var.lambda_timeout * 2

  # After 3 receives without a delete, the message moves to the DLQ.
  redrive_policy = jsonencode({
    maxReceiveCount     = 3
    deadLetterTargetArn = aws_sqs_queue.metrics_dlq[0].arn
  })
}
# Dead-letter queue for messages the metrics queue could not process.
resource "aws_sqs_queue" "metrics_dlq" {
  count = var.enable_lambda_pipeline ? 1 : 0

  message_retention_seconds = 1209600 # 14 days
  name                      = "${var.project_name}-metrics-dlq"
}
# S3 bucket for metric backups; created only when S3 backup is enabled.
resource "aws_s3_bucket" "metrics" {
  count = var.enable_s3_backup ? 1 : 0

  # A prefix lets AWS generate a unique bucket name.
  bucket_prefix = "${var.project_name}-metrics-"
}
# Lifecycle for backed-up metrics: Standard-IA after 30 days, Glacier after
# 90 days, deleted after 365 days.
resource "aws_s3_bucket_lifecycle_configuration" "metrics" {
  count  = var.enable_s3_backup ? 1 : 0
  bucket = aws_s3_bucket.metrics[0].id

  rule {
    id     = "archive-old-metrics"
    status = "Enabled"

    # An empty filter applies the rule to every object in the bucket. The
    # provider expects each rule to declare `filter` or `prefix` explicitly.
    filter {}

    transition {
      days          = 30
      storage_class = "STANDARD_IA"
    }

    transition {
      days          = 90
      storage_class = "GLACIER"
    }

    expiration {
      days = 365
    }
  }
}
# Execution role shared by the pipeline Lambdas. The trust policy lets only
# the Lambda service assume it.
resource "aws_iam_role" "lambda" {
  count = var.enable_lambda_pipeline ? 1 : 0

  name_prefix = "${var.project_name}-lambda-"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [
      {
        Effect    = "Allow"
        Principal = { Service = "lambda.amazonaws.com" }
        Action    = "sts:AssumeRole"
      }
    ]
  })
}
# Inline permissions for the Lambda role: CloudWatch Logs, consuming the
# metrics queue, and (only when S3 backup is enabled) object access to the
# backup bucket.
resource "aws_iam_role_policy" "lambda" {
  count = var.enable_lambda_pipeline ? 1 : 0
  name  = "lambda-policy"
  role  = aws_iam_role.lambda[0].id

  policy = jsonencode({
    Version = "2012-10-17"
    Statement = concat(
      [
        {
          Effect = "Allow"
          Action = [
            "logs:CreateLogGroup",
            "logs:CreateLogStream",
            "logs:PutLogEvents"
          ]
          Resource = "arn:aws:logs:*:*:*"
        },
        {
          Effect = "Allow"
          Action = [
            "sqs:ReceiveMessage",
            "sqs:DeleteMessage",
            "sqs:GetQueueAttributes"
          ]
          Resource = aws_sqs_queue.metrics[0].arn
        }
      ],
      # The S3 statement is emitted only when the backup bucket exists.
      # Falling back to Resource = "*" would grant read/write on every bucket
      # in the account whenever backups are disabled.
      var.enable_s3_backup ? [
        {
          Effect = "Allow"
          Action = [
            "s3:PutObject",
            "s3:GetObject"
          ]
          Resource = "${aws_s3_bucket.metrics[0].arn}/*"
        }
      ] : []
    )
  })
}
# Lambda that aggregates metrics; triggered from the metrics SQS queue.
resource "aws_lambda_function" "aggregator" {
  count = var.enable_lambda_pipeline ? 1 : 0

  function_name = "${var.project_name}-aggregator"
  role          = aws_iam_role.lambda[0].arn

  runtime     = "python3.11"
  handler     = "main.handler"
  memory_size = var.lambda_memory_size
  timeout     = var.lambda_timeout

  # Placeholder - will be deployed via CI/CD
  filename         = "${path.module}/../lambdas/aggregator/placeholder.zip"
  source_code_hash = filebase64sha256("${path.module}/../lambdas/aggregator/placeholder.zip")

  environment {
    variables = {
      LOG_LEVEL = "INFO"
      # NOTE(review): this is a private IP and the function has no vpc_config;
      # confirm the Lambda can actually reach the instance.
      TIMESCALE_HOST = aws_instance.sysmonstm.private_ip
    }
  }

  # Code is deployed outside Terraform, so ignore package drift.
  lifecycle {
    ignore_changes = [filename, source_code_hash]
  }
}
# Feeds batches from the metrics queue to the aggregator Lambda.
resource "aws_lambda_event_source_mapping" "sqs_trigger" {
  count = var.enable_lambda_pipeline ? 1 : 0

  event_source_arn = aws_sqs_queue.metrics[0].arn
  function_name    = aws_lambda_function.aggregator[0].arn
  batch_size       = 100

  # Lambda requires a batching window of at least 1 second whenever an SQS
  # batch size is above 10. Tune this value against latency needs.
  maximum_batching_window_in_seconds = 5

  # Cap the number of concurrent invocations driven by this queue.
  scaling_config {
    maximum_concurrency = 5
  }
}
# Hourly schedule that triggers metric compaction.
resource "aws_cloudwatch_event_rule" "compactor" {
  count = var.enable_lambda_pipeline ? 1 : 0

  name                = "${var.project_name}-compactor-schedule"
  schedule_expression = "rate(1 hour)"
  description         = "Trigger metric compaction every hour"
}
# Lambda that compacts metrics; invoked by the hourly schedule rule.
resource "aws_lambda_function" "compactor" {
  count = var.enable_lambda_pipeline ? 1 : 0

  function_name = "${var.project_name}-compactor"
  role          = aws_iam_role.lambda[0].arn

  runtime     = "python3.11"
  handler     = "main.handler"
  memory_size = 512
  timeout     = 300

  # Placeholder package; ignored after creation (see lifecycle below).
  filename         = "${path.module}/../lambdas/compactor/placeholder.zip"
  source_code_hash = filebase64sha256("${path.module}/../lambdas/compactor/placeholder.zip")

  environment {
    variables = {
      LOG_LEVEL = "INFO"
      # Empty string when S3 backup is disabled.
      S3_BUCKET      = var.enable_s3_backup ? aws_s3_bucket.metrics[0].bucket : ""
      TIMESCALE_HOST = aws_instance.sysmonstm.private_ip
    }
  }

  lifecycle {
    ignore_changes = [filename, source_code_hash]
  }
}
# Points the compaction schedule at the compactor Lambda.
resource "aws_cloudwatch_event_target" "compactor" {
  count = var.enable_lambda_pipeline ? 1 : 0

  target_id = "compactor-lambda"
  rule      = aws_cloudwatch_event_rule.compactor[0].name
  arn       = aws_lambda_function.compactor[0].arn
}
# Lets the schedule rule (and only that rule) invoke the compactor Lambda.
resource "aws_lambda_permission" "compactor_cloudwatch" {
  count = var.enable_lambda_pipeline ? 1 : 0

  statement_id  = "AllowCloudWatchInvoke"
  principal     = "events.amazonaws.com"
  action        = "lambda:InvokeFunction"
  function_name = aws_lambda_function.compactor[0].function_name
  source_arn    = aws_cloudwatch_event_rule.compactor[0].arn
}

View File

@@ -0,0 +1,58 @@
# System Monitor Platform - AWS Infrastructure
#
# This Terraform configuration sets up:
# - EC2 instance for running Docker Compose (demo/staging)
# - Lambda functions for data processing pipeline
# - SQS queue for buffering metrics
# - S3 bucket for metric backups
# - Security groups and IAM roles
# Terraform and provider version constraints. State is local unless the
# commented S3 backend below is enabled.
terraform {
  required_version = ">= 1.0"

  required_providers {
    aws = {
      version = "~> 5.0"
      source  = "hashicorp/aws"
    }
  }

  # Uncomment for remote state
  # backend "s3" {
  #   bucket = "your-terraform-state-bucket"
  #   key    = "sysmonstm/terraform.tfstate"
  #   region = "us-east-1"
  # }
}
# AWS provider; default_tags are applied to the resources it manages.
provider "aws" {
  region = var.aws_region

  default_tags {
    tags = {
      ManagedBy   = "terraform"
      Environment = var.environment
      Project     = "sysmonstm"
    }
  }
}
# Data sources

# Availability zones currently in the "available" state for the region.
data "aws_availability_zones" "available" {
  state = "available"
}
# Newest standard Amazon Linux 2023 x86_64 AMI published by Amazon.
data "aws_ami" "amazon_linux_2023" {
  most_recent = true
  owners      = ["amazon"]

  filter {
    name = "name"
    # Anchor on the "2023." version segment. The looser "al2023-ami-*-x86_64"
    # also matches the minimal (al2023-ami-minimal-*) and ECS-optimized
    # (al2023-ami-ecs-*) images, which most_recent could then select.
    values = ["al2023-ami-2023.*-x86_64"]
  }

  filter {
    name   = "virtualization-type"
    values = ["hvm"]
  }
}

View File

@@ -0,0 +1,36 @@
# Outputs

# --- EC2 host ---

output "ec2_public_ip" {
  value       = aws_eip.sysmonstm.public_ip
  description = "Public IP of the EC2 instance"
}

output "ec2_instance_id" {
  value       = aws_instance.sysmonstm.id
  description = "EC2 instance ID"
}

# --- Service endpoints, built from the Elastic IP ---

output "dashboard_url" {
  value       = "http://${aws_eip.sysmonstm.public_ip}:8000"
  description = "URL for the monitoring dashboard"
}

output "grpc_endpoint" {
  value       = "${aws_eip.sysmonstm.public_ip}:50051"
  description = "gRPC endpoint for collectors"
}

# --- Optional resources: null when the feature flag is off ---

output "sqs_queue_url" {
  value       = var.enable_lambda_pipeline ? aws_sqs_queue.metrics[0].url : null
  description = "SQS queue URL for metrics"
}

output "s3_bucket" {
  value       = var.enable_s3_backup ? aws_s3_bucket.metrics[0].bucket : null
  description = "S3 bucket for metric backups"
}

# Without a key pair there is nothing to SSH with, so point at SSM instead.
output "ssh_command" {
  value       = var.ec2_key_name != "" ? "ssh -i ${var.ec2_key_name}.pem ec2-user@${aws_eip.sysmonstm.public_ip}" : "Use SSM Session Manager"
  description = "SSH command to connect to the instance"
}

View File

@@ -0,0 +1,16 @@
# Example Terraform variables
# Copy to terraform.tfvars and fill in your values
# Values left unset fall back to the defaults declared for each variable.
aws_region = "us-east-1"
environment = "staging"
project_name = "sysmonstm"
domain_name = "sysmonstm.mcrn.ar"
# EC2
ec2_instance_type = "t2.small"
# Placeholder: replace with the name of an existing EC2 key pair.
ec2_key_name = "your-key-pair-name"
# Placeholder: replace with a real CIDR such as your public IP followed by /32.
allowed_ssh_cidrs = ["YOUR.IP.ADDRESS/32"]
# Feature flags
enable_lambda_pipeline = false
enable_s3_backup = false

View File

@@ -0,0 +1,70 @@
# Variables for System Monitor Platform

# --- General ---

variable "aws_region" {
  type        = string
  default     = "us-east-1"
  description = "AWS region to deploy to"
}

variable "environment" {
  type        = string
  default     = "staging"
  description = "Environment name (dev, staging, prod)"
}

variable "project_name" {
  type        = string
  default     = "sysmonstm"
  description = "Project name for resource naming"
}

variable "domain_name" {
  type        = string
  default     = "sysmonstm.mcrn.ar"
  description = "Domain name for the service"
}

# --- EC2 Configuration ---

variable "ec2_instance_type" {
  type        = string
  default     = "t2.small"
  description = "EC2 instance type"
}

# Empty string means "no key pair".
variable "ec2_key_name" {
  type        = string
  default     = ""
  description = "SSH key pair name"
}

variable "allowed_ssh_cidrs" {
  type        = list(string)
  default     = [] # Set to your IP for security
  description = "CIDR blocks allowed to SSH"
}

# --- Lambda Configuration ---

variable "lambda_memory_size" {
  type        = number
  default     = 256
  description = "Lambda function memory in MB"
}

variable "lambda_timeout" {
  type        = number
  default     = 60
  description = "Lambda function timeout in seconds"
}

# --- Feature flags ---

variable "enable_lambda_pipeline" {
  type        = bool
  default     = false
  description = "Enable Lambda data processing pipeline"
}

variable "enable_s3_backup" {
  type        = bool
  default     = false
  description = "Enable S3 backup for metrics"
}

View File

@@ -0,0 +1,15 @@
# Non-secret settings for the aggregator. All values are strings.
apiVersion: v1
kind: ConfigMap
metadata:
  name: aggregator-config
data:
  # Service identity and gRPC listener
  SERVICE_NAME: "aggregator"
  GRPC_PORT: "50051"
  # Backing stores
  REDIS_URL: "redis://redis:6379"
  TIMESCALE_HOST: "timescaledb"
  TIMESCALE_PORT: "5432"
  TIMESCALE_USER: "monitor"
  TIMESCALE_DB: "monitor"
  # Events and logging
  EVENTS_BACKEND: "redis_pubsub"
  LOG_LEVEL: "INFO"
  LOG_FORMAT: "json"

View File

@@ -0,0 +1,46 @@
# Aggregator Deployment: single replica serving gRPC on port 50051.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: aggregator
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: aggregator
  template:
    metadata:
      labels:
        app.kubernetes.io/name: aggregator
    spec:
      containers:
        - name: aggregator
          image: sysmonstm/aggregator:latest
          ports:
            - name: grpc
              containerPort: 50051
          # Plain settings come from the ConfigMap; the DB password from a Secret.
          envFrom:
            - configMapRef:
                name: aggregator-config
          env:
            - name: TIMESCALE_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: timescaledb-secret
                  key: password
          resources:
            requests:
              cpu: "100m"
              memory: "128Mi"
            limits:
              cpu: "500m"
              memory: "256Mi"
          # Both probes run the grpc_health_probe binary against the gRPC port.
          livenessProbe:
            exec:
              command:
                - "/bin/grpc_health_probe"
                - "-addr=:50051"
            initialDelaySeconds: 10
            periodSeconds: 10
          readinessProbe:
            exec:
              command:
                - "/bin/grpc_health_probe"
                - "-addr=:50051"
            initialDelaySeconds: 5
            periodSeconds: 5

View File

@@ -0,0 +1,11 @@
# Kustomize entry point for the aggregator component.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - deployment.yaml
  - service.yaml
  - configmap.yaml
commonLabels:
  app.kubernetes.io/name: aggregator
  app.kubernetes.io/component: backend

View File

@@ -0,0 +1,11 @@
# Service exposing the aggregator's gRPC port.
apiVersion: v1
kind: Service
metadata:
  name: aggregator
spec:
  selector:
    app.kubernetes.io/name: aggregator
  ports:
    - name: grpc
      port: 50051
      # Refers to the container port named "grpc".
      targetPort: grpc

View File

@@ -0,0 +1,14 @@
# Non-secret settings for the alerts service. All values are strings.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alerts-config
data:
  SERVICE_NAME: "alerts"
  # Backing stores
  REDIS_URL: "redis://redis:6379"
  TIMESCALE_HOST: "timescaledb"
  TIMESCALE_PORT: "5432"
  TIMESCALE_USER: "monitor"
  TIMESCALE_DB: "monitor"
  # Events and logging
  EVENTS_BACKEND: "redis_pubsub"
  LOG_LEVEL: "INFO"
  LOG_FORMAT: "json"

View File

@@ -0,0 +1,33 @@
# Alerts Deployment: single replica; declares no ports and no probes.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alerts
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: alerts
  template:
    metadata:
      labels:
        app.kubernetes.io/name: alerts
    spec:
      containers:
        - name: alerts
          image: sysmonstm/alerts:latest
          # Plain settings come from the ConfigMap; the DB password from a Secret.
          envFrom:
            - configMapRef:
                name: alerts-config
          env:
            - name: TIMESCALE_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: timescaledb-secret
                  key: password
          resources:
            requests:
              cpu: "50m"
              memory: "64Mi"
            limits:
              cpu: "200m"
              memory: "128Mi"

View File

@@ -0,0 +1,10 @@
# Kustomize entry point for the alerts component.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - deployment.yaml
  - configmap.yaml
commonLabels:
  app.kubernetes.io/name: alerts
  app.kubernetes.io/component: backend

View File

@@ -0,0 +1,16 @@
# Non-secret settings for the gateway. All values are strings.
apiVersion: v1
kind: ConfigMap
metadata:
  name: gateway-config
data:
  # Service identity and HTTP listener
  SERVICE_NAME: "gateway"
  HTTP_PORT: "8000"
  # Upstream services and backing stores
  AGGREGATOR_URL: "aggregator:50051"
  REDIS_URL: "redis://redis:6379"
  TIMESCALE_HOST: "timescaledb"
  TIMESCALE_PORT: "5432"
  TIMESCALE_USER: "monitor"
  TIMESCALE_DB: "monitor"
  # Events and logging
  EVENTS_BACKEND: "redis_pubsub"
  LOG_LEVEL: "INFO"
  LOG_FORMAT: "json"

View File

@@ -0,0 +1,48 @@
# Gateway Deployment: single replica serving HTTP on port 8000.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gateway
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: gateway
  template:
    metadata:
      labels:
        app.kubernetes.io/name: gateway
    spec:
      containers:
        - name: gateway
          image: sysmonstm/gateway:latest
          ports:
            - name: http
              containerPort: 8000
          # Plain settings come from the ConfigMap; the DB password from a Secret.
          envFrom:
            - configMapRef:
                name: gateway-config
          env:
            - name: TIMESCALE_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: timescaledb-secret
                  key: password
          resources:
            requests:
              cpu: "100m"
              memory: "128Mi"
            limits:
              cpu: "500m"
              memory: "256Mi"
          # Both probes call GET /health on the named "http" port.
          livenessProbe:
            httpGet:
              port: http
              path: /health
            initialDelaySeconds: 10
            periodSeconds: 10
          readinessProbe:
            httpGet:
              port: http
              path: /health
            initialDelaySeconds: 5
            periodSeconds: 5

View File

@@ -0,0 +1,11 @@
# Kustomize entry point for the gateway component.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - deployment.yaml
  - service.yaml
  - configmap.yaml
commonLabels:
  app.kubernetes.io/name: gateway
  app.kubernetes.io/component: frontend

View File

@@ -0,0 +1,11 @@
# Service exposing the gateway's HTTP port.
apiVersion: v1
kind: Service
metadata:
  name: gateway
spec:
  selector:
    app.kubernetes.io/name: gateway
  ports:
    - name: http
      port: 8000
      # Refers to the container port named "http".
      targetPort: http

View File

@@ -0,0 +1,17 @@
# Base kustomization: assembles every in-cluster component into the
# sysmonstm namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: sysmonstm
resources:
  - namespace.yaml
  - redis/
  - timescaledb/
  - aggregator/
  - gateway/
  - alerts/
  # collector is deployed separately on each machine
commonLabels:
  app.kubernetes.io/part-of: sysmonstm
  app.kubernetes.io/managed-by: kustomize

6
k8s/base/namespace.yaml Normal file
View File

@@ -0,0 +1,6 @@
# Namespace that holds all sysmonstm resources.
apiVersion: v1
kind: Namespace
metadata:
  name: sysmonstm
  labels:
    app.kubernetes.io/name: sysmonstm

View File

@@ -0,0 +1,37 @@
# Redis Deployment: single replica with no volume declared.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: redis
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: redis
  template:
    metadata:
      labels:
        app.kubernetes.io/name: redis
    spec:
      containers:
        - name: redis
          image: redis:7-alpine
          ports:
            - name: redis
              containerPort: 6379
          resources:
            requests:
              cpu: "50m"
              memory: "64Mi"
            limits:
              cpu: "200m"
              memory: "128Mi"
          # Both probes run `redis-cli ping` inside the container.
          livenessProbe:
            exec:
              command:
                - "redis-cli"
                - "ping"
            initialDelaySeconds: 5
            periodSeconds: 10
          readinessProbe:
            exec:
              command:
                - "redis-cli"
                - "ping"
            initialDelaySeconds: 5
            periodSeconds: 5

View File

@@ -0,0 +1,10 @@
# Kustomize entry point for the redis component.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - deployment.yaml
  - service.yaml
commonLabels:
  app.kubernetes.io/name: redis
  app.kubernetes.io/component: cache

View File

@@ -0,0 +1,11 @@
# Service exposing Redis on port 6379.
apiVersion: v1
kind: Service
metadata:
  name: redis
spec:
  selector:
    app.kubernetes.io/name: redis
  ports:
    - name: redis
      port: 6379
      # Refers to the container port named "redis".
      targetPort: redis

View File

@@ -0,0 +1,94 @@
# ConfigMap carrying the database bootstrap SQL. Every statement uses
# IF NOT EXISTS / if_not_exists / ON CONFLICT, so re-running it is harmless.
# NOTE(review): this is shorter than scripts/init-db.sql (no metrics_1h, no
# CHECK constraints, no continuous aggregate) - confirm that is intentional.
apiVersion: v1
kind: ConfigMap
metadata:
  name: timescaledb-init
data:
  init.sql: |
    -- TimescaleDB initialization script
    CREATE EXTENSION IF NOT EXISTS timescaledb;
    CREATE TABLE IF NOT EXISTS metrics_raw (
        time TIMESTAMPTZ NOT NULL,
        machine_id TEXT NOT NULL,
        hostname TEXT NOT NULL,
        metric_type TEXT NOT NULL,
        value DOUBLE PRECISION NOT NULL,
        labels JSONB DEFAULT '{}'::jsonb
    );
    SELECT create_hypertable('metrics_raw', 'time',
        chunk_time_interval => INTERVAL '1 hour',
        if_not_exists => TRUE
    );
    CREATE INDEX IF NOT EXISTS idx_metrics_raw_machine
        ON metrics_raw (machine_id, time DESC);
    CREATE INDEX IF NOT EXISTS idx_metrics_raw_type
        ON metrics_raw (metric_type, time DESC);
    CREATE TABLE IF NOT EXISTS metrics_1m (
        time TIMESTAMPTZ NOT NULL,
        machine_id TEXT NOT NULL,
        hostname TEXT NOT NULL,
        metric_type TEXT NOT NULL,
        avg_value DOUBLE PRECISION NOT NULL,
        min_value DOUBLE PRECISION NOT NULL,
        max_value DOUBLE PRECISION NOT NULL,
        sample_count INTEGER NOT NULL
    );
    SELECT create_hypertable('metrics_1m', 'time',
        chunk_time_interval => INTERVAL '1 day',
        if_not_exists => TRUE
    );
    CREATE TABLE IF NOT EXISTS machines (
        machine_id TEXT PRIMARY KEY,
        hostname TEXT NOT NULL,
        first_seen TIMESTAMPTZ NOT NULL DEFAULT NOW(),
        last_seen TIMESTAMPTZ NOT NULL DEFAULT NOW(),
        metadata JSONB DEFAULT '{}'::jsonb,
        health TEXT NOT NULL DEFAULT 'UNKNOWN'
    );
    CREATE TABLE IF NOT EXISTS alert_rules (
        id SERIAL PRIMARY KEY,
        name TEXT NOT NULL UNIQUE,
        metric_type TEXT NOT NULL,
        operator TEXT NOT NULL,
        threshold DOUBLE PRECISION NOT NULL,
        severity TEXT NOT NULL,
        enabled BOOLEAN NOT NULL DEFAULT TRUE,
        created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
        updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
    );
    CREATE TABLE IF NOT EXISTS alerts (
        id SERIAL,
        time TIMESTAMPTZ NOT NULL DEFAULT NOW(),
        machine_id TEXT NOT NULL,
        rule_id INTEGER REFERENCES alert_rules(id),
        rule_name TEXT NOT NULL,
        metric_type TEXT NOT NULL,
        value DOUBLE PRECISION NOT NULL,
        threshold DOUBLE PRECISION NOT NULL,
        severity TEXT NOT NULL,
        resolved_at TIMESTAMPTZ,
        PRIMARY KEY (id, time)
    );
    SELECT create_hypertable('alerts', 'time',
        chunk_time_interval => INTERVAL '1 day',
        if_not_exists => TRUE
    );
    SELECT add_retention_policy('metrics_raw', INTERVAL '24 hours', if_not_exists => TRUE);
    SELECT add_retention_policy('alerts', INTERVAL '30 days', if_not_exists => TRUE);
    INSERT INTO alert_rules (name, metric_type, operator, threshold, severity)
    VALUES
        ('High CPU Usage', 'CPU_PERCENT', 'gt', 80.0, 'warning'),
        ('Critical CPU Usage', 'CPU_PERCENT', 'gt', 95.0, 'critical'),
        ('High Memory Usage', 'MEMORY_PERCENT', 'gt', 85.0, 'warning'),
        ('Critical Memory Usage', 'MEMORY_PERCENT', 'gt', 95.0, 'critical')
    ON CONFLICT (name) DO NOTHING;

View File

@@ -0,0 +1,11 @@
# Kustomize entry point for the timescaledb component.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - statefulset.yaml
  - service.yaml
  - configmap.yaml
commonLabels:
  app.kubernetes.io/name: timescaledb
  app.kubernetes.io/component: database

View File

@@ -0,0 +1,12 @@
# Headless Service for the TimescaleDB StatefulSet.
apiVersion: v1
kind: Service
metadata:
  name: timescaledb
spec:
  clusterIP: None  # Headless for StatefulSet
  selector:
    app.kubernetes.io/name: timescaledb
  ports:
    - name: postgres
      port: 5432
      # Refers to the container port named "postgres".
      targetPort: postgres

View File

@@ -0,0 +1,65 @@
# TimescaleDB StatefulSet: single replica with a 5Gi persistent volume and
# init scripts mounted from the timescaledb-init ConfigMap.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: timescaledb
spec:
  serviceName: timescaledb
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: timescaledb
  template:
    metadata:
      labels:
        app.kubernetes.io/name: timescaledb
    spec:
      containers:
        - name: timescaledb
          image: timescale/timescaledb:latest-pg15
          ports:
            - containerPort: 5432
              name: postgres
          env:
            - name: POSTGRES_USER
              value: monitor
            - name: POSTGRES_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: timescaledb-secret
                  key: password
            - name: POSTGRES_DB
              value: monitor
            # Keep the cluster in a subdirectory of the mounted volume. A
            # freshly formatted block volume has a lost+found directory at its
            # root, and initdb refuses to initialise a non-empty directory.
            - name: PGDATA
              value: /var/lib/postgresql/data/pgdata
          resources:
            requests:
              memory: "256Mi"
              cpu: "100m"
            limits:
              memory: "512Mi"
              cpu: "500m"
          volumeMounts:
            - name: data
              mountPath: /var/lib/postgresql/data
            # Scripts placed here run on first initialisation of the database.
            - name: init-scripts
              mountPath: /docker-entrypoint-initdb.d
          # Both probes check that the server accepts connections for the
          # monitor user and database.
          livenessProbe:
            exec:
              command: ["pg_isready", "-U", "monitor", "-d", "monitor"]
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            exec:
              command: ["pg_isready", "-U", "monitor", "-d", "monitor"]
            initialDelaySeconds: 5
            periodSeconds: 5
      volumes:
        - name: init-scripts
          configMap:
            name: timescaledb-init
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 5Gi

View File

@@ -0,0 +1,22 @@
# Local-dev overlay: base manifests plus a dev Secret, smaller resource
# requests, and image names/tags rewritten to the dev builds.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: sysmonstm
resources:
  - ../../base
  - secrets.yaml
patches:
  - path: patches/reduce-resources.yaml
# Rewrite the base image references to the dev image names and tag.
images:
  - name: sysmonstm/aggregator
    newName: sysmonstm-aggregator
    newTag: dev
  - name: sysmonstm/gateway
    newName: sysmonstm-gateway
    newTag: dev
  - name: sysmonstm/alerts
    newName: sysmonstm-alerts
    newTag: dev

View File

@@ -0,0 +1,50 @@
# Patches that shrink CPU/memory requests and limits for local development.
# Each document matches its target by kind and metadata.name.

# aggregator
apiVersion: apps/v1
kind: Deployment
metadata:
  name: aggregator
spec:
  template:
    spec:
      containers:
        - name: aggregator
          resources:
            requests:
              cpu: "50m"
              memory: "64Mi"
            limits:
              cpu: "200m"
              memory: "128Mi"
---
# gateway
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gateway
spec:
  template:
    spec:
      containers:
        - name: gateway
          resources:
            requests:
              cpu: "50m"
              memory: "64Mi"
            limits:
              cpu: "200m"
              memory: "128Mi"
---
# timescaledb
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: timescaledb
spec:
  template:
    spec:
      containers:
        - name: timescaledb
          resources:
            requests:
              cpu: "50m"
              memory: "128Mi"
            limits:
              cpu: "200m"
              memory: "256Mi"

View File

@@ -0,0 +1,8 @@
# Dev-only database credentials; referenced by the workloads as
# timescaledb-secret / password. Never reuse this value outside local dev.
apiVersion: v1
kind: Secret
type: Opaque
metadata:
  name: timescaledb-secret
  namespace: sysmonstm
stringData:
  password: "monitor"  # Only for local dev!

159
proto/metrics.proto Normal file
View File

@@ -0,0 +1,159 @@
syntax = "proto3";
package monitoring;
option go_package = "github.com/your-org/sysmonstm/proto";
// MetricsService handles streaming metrics from collectors to aggregator
// and exposes the aggregator's current view of each machine.
service MetricsService {
  // Client-side streaming: collector streams metrics to aggregator.
  // A single StreamAck is returned for the whole stream.
  rpc StreamMetrics(stream Metric) returns (StreamAck) {}
  // Get current state of a machine, selected by StateRequest.machine_id.
  rpc GetCurrentState(StateRequest) returns (MachineState) {}
  // Get current state of all machines
  rpc GetAllStates(Empty) returns (AllMachinesState) {}
}
// ControlService handles bidirectional control commands
service ControlService {
  // Bidirectional streaming for commands and responses.
  // Responses carry the command_id of the command they answer.
  rpc Control(stream ControlCommand) returns (stream ControlResponse) {}
}
// ConfigService handles dynamic configuration
service ConfigService {
  // Get current configuration for a collector (unary).
  rpc GetConfig(ConfigRequest) returns (CollectorConfig) {}
  // Stream configuration updates (server-side streaming).
  rpc WatchConfig(ConfigRequest) returns (stream CollectorConfig) {}
}
// Empty message for requests with no parameters (used by GetAllStates).
message Empty {}
// Basic metric message: one sample of one metric type from one machine.
message Metric {
  string machine_id = 1;
  string hostname = 2;
  // Sample time in milliseconds - presumably since the Unix epoch; confirm
  // against the collector.
  int64 timestamp_ms = 3;
  MetricType type = 4;
  double value = 5;
  // Free-form key/value labels attached to the sample.
  map<string, string> labels = 6;
}
// Batch of metrics for efficient transmission: machine identity and the
// timestamp are stated once and shared by every point in the batch.
// NOTE(review): no RPC in this file takes a MetricBatch - confirm it is used.
message MetricBatch {
  string machine_id = 1;
  string hostname = 2;
  int64 timestamp_ms = 3;
  repeated MetricPoint metrics = 4;
}
// One sample inside a MetricBatch.
message MetricPoint {
  MetricType type = 1;
  double value = 2;
  map<string, string> labels = 3;
}
// Kinds of metric a sample can carry. Units follow the name suffix
// (_PERCENT, _BYTES, _BYTES_SEC).
enum MetricType {
  // Zero value: what an unset field decodes to.
  METRIC_TYPE_UNSPECIFIED = 0;
  // CPU
  CPU_PERCENT = 1;
  CPU_PERCENT_PER_CORE = 2;
  // Memory
  MEMORY_PERCENT = 3;
  MEMORY_USED_BYTES = 4;
  MEMORY_AVAILABLE_BYTES = 5;
  // Disk
  DISK_PERCENT = 6;
  DISK_USED_BYTES = 7;
  DISK_READ_BYTES_SEC = 8;
  DISK_WRITE_BYTES_SEC = 9;
  // Network
  NETWORK_SENT_BYTES_SEC = 10;
  NETWORK_RECV_BYTES_SEC = 11;
  NETWORK_CONNECTIONS = 12;
  // Processes and load
  PROCESS_COUNT = 13;
  LOAD_AVG_1M = 14;
  LOAD_AVG_5M = 15;
  LOAD_AVG_15M = 16;
}
// Acknowledgment for streamed metrics; the response type of StreamMetrics.
message StreamAck {
  bool success = 1;
  // Number of metrics received - presumably over the whole stream; confirm
  // against the server implementation.
  int64 metrics_received = 2;
  string message = 3;
}
// Request for machine state; identifies the machine by its ID.
message StateRequest {
  string machine_id = 1;
}
// Current state of a single machine
message MachineState {
  string machine_id = 1;
  string hostname = 2;
  // Time the machine was last seen, in milliseconds.
  int64 last_seen_ms = 3;
  repeated Metric current_metrics = 4;
  HealthStatus health = 5;
  map<string, string> metadata = 6;
}
// State of all machines; the response type of GetAllStates.
message AllMachinesState {
  repeated MachineState machines = 1;
}
// Health classification reported in MachineState.health.
enum HealthStatus {
  // Zero value: what an unset field decodes to.
  HEALTH_STATUS_UNSPECIFIED = 0;
  HEALTHY = 1;
  WARNING = 2;
  CRITICAL = 3;
  // NOTE(review): distinct from the UNSPECIFIED zero value - confirm how the
  // two are meant to differ.
  UNKNOWN = 4;
  OFFLINE = 5;
}
// Control commands for collectors
message ControlCommand {
  // Echoed back in ControlResponse.command_id.
  string command_id = 1;
  // Exactly one command variant is set per message.
  oneof command {
    UpdateIntervalCommand update_interval = 2;
    RestartCollectionCommand restart = 3;
    ShutdownCommand shutdown = 4;
  }
}
// Sets a new interval, in seconds.
message UpdateIntervalCommand {
  int32 interval_seconds = 1;
}
// Carries no parameters.
message RestartCollectionCommand {}
// Shutdown request with a flag selecting graceful shutdown.
message ShutdownCommand {
  bool graceful = 1;
}
// Result of one ControlCommand, matched to it by command_id.
message ControlResponse {
  string command_id = 1;
  bool success = 2;
  string message = 3;
}
// Configuration messages

// Identifies the collector whose configuration is requested.
message ConfigRequest {
  string machine_id = 1;
}
// Configuration returned by GetConfig and streamed by WatchConfig.
message CollectorConfig {
  int32 collection_interval_seconds = 1;
  repeated MetricType enabled_metrics = 2;
  map<string, string> labels = 3;
  repeated ThresholdConfig thresholds = 4;
}
// Warning and critical thresholds for one metric type.
message ThresholdConfig {
  MetricType metric_type = 1;
  double warning_threshold = 2;
  double critical_threshold = 3;
}

22
scripts/generate-diagrams.sh Executable file
View File

@@ -0,0 +1,22 @@
#!/bin/bash
# Render each Graphviz .dot file in docs/architecture to an .svg beside it.
# Requires: graphviz (apt install graphviz)
set -e

# Resolve docs/architecture relative to this script, not the caller's cwd.
here="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$here/../docs/architecture"

echo "Generating architecture diagrams..."
for src in *.dot; do
    # With no matches the glob stays literal ("*.dot"); skip anything that
    # is not a regular file.
    [ -f "$src" ] || continue
    out="${src%.dot}.svg"
    echo " $src -> $out"
    dot -Tsvg "$src" -o "$out"
done
echo "Done! Open docs/architecture/index.html in a browser."

158
scripts/init-db.sql Normal file
View File

@@ -0,0 +1,158 @@
-- TimescaleDB initialization script
-- Creates hypertables for time-series metrics storage
-- Every statement in this script is guarded (IF NOT EXISTS, if_not_exists,
-- ON CONFLICT DO NOTHING), so running it again is harmless.
-- Enable TimescaleDB extension (needed by the create_hypertable calls below)
CREATE EXTENSION IF NOT EXISTS timescaledb;
-- Raw metrics table (high resolution, short retention)
-- One row per sample; metric_type is stored as text.
CREATE TABLE IF NOT EXISTS metrics_raw (
    time TIMESTAMPTZ NOT NULL,
    machine_id TEXT NOT NULL,
    hostname TEXT NOT NULL,
    metric_type TEXT NOT NULL,
    value DOUBLE PRECISION NOT NULL,
    labels JSONB DEFAULT '{}'::jsonb
);
-- Convert to hypertable with 1-hour chunks
SELECT create_hypertable('metrics_raw', 'time',
    chunk_time_interval => INTERVAL '1 hour',
    if_not_exists => TRUE
);
-- Create indexes for common queries: newest-first per machine and per
-- metric type
CREATE INDEX IF NOT EXISTS idx_metrics_raw_machine
    ON metrics_raw (machine_id, time DESC);
CREATE INDEX IF NOT EXISTS idx_metrics_raw_type
    ON metrics_raw (metric_type, time DESC);
-- Aggregated metrics table (1-minute resolution, longer retention)
-- NOTE(review): nothing in this script writes to this table; the continuous
-- aggregate defined further down is a separate view (metrics_1m_agg).
-- Confirm which one the services use.
CREATE TABLE IF NOT EXISTS metrics_1m (
    time TIMESTAMPTZ NOT NULL,
    machine_id TEXT NOT NULL,
    hostname TEXT NOT NULL,
    metric_type TEXT NOT NULL,
    avg_value DOUBLE PRECISION NOT NULL,
    min_value DOUBLE PRECISION NOT NULL,
    max_value DOUBLE PRECISION NOT NULL,
    sample_count INTEGER NOT NULL
);
-- Hypertable with 1-day chunks
SELECT create_hypertable('metrics_1m', 'time',
    chunk_time_interval => INTERVAL '1 day',
    if_not_exists => TRUE
);
-- Newest-first lookup per machine
CREATE INDEX IF NOT EXISTS idx_metrics_1m_machine
    ON metrics_1m (machine_id, time DESC);
-- Aggregated metrics table (1-hour resolution, long retention)
-- Same columns as metrics_1m.
CREATE TABLE IF NOT EXISTS metrics_1h (
    time TIMESTAMPTZ NOT NULL,
    machine_id TEXT NOT NULL,
    hostname TEXT NOT NULL,
    metric_type TEXT NOT NULL,
    avg_value DOUBLE PRECISION NOT NULL,
    min_value DOUBLE PRECISION NOT NULL,
    max_value DOUBLE PRECISION NOT NULL,
    sample_count INTEGER NOT NULL
);
-- Hypertable with 1-week chunks
SELECT create_hypertable('metrics_1h', 'time',
    chunk_time_interval => INTERVAL '1 week',
    if_not_exists => TRUE
);
-- Newest-first lookup per machine
CREATE INDEX IF NOT EXISTS idx_metrics_1h_machine
    ON metrics_1h (machine_id, time DESC);
-- Machines registry
-- One row per machine_id; a plain table, not a hypertable.
CREATE TABLE IF NOT EXISTS machines (
    machine_id TEXT PRIMARY KEY,
    hostname TEXT NOT NULL,
    first_seen TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    last_seen TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    metadata JSONB DEFAULT '{}'::jsonb,
    -- Stored as free text with no CHECK constraint.
    health TEXT NOT NULL DEFAULT 'UNKNOWN'
);
-- Alert rules configuration
-- Rule names are unique, which is what the default-rules insert below
-- relies on for its ON CONFLICT clause.
CREATE TABLE IF NOT EXISTS alert_rules (
    id SERIAL PRIMARY KEY,
    name TEXT NOT NULL UNIQUE,
    metric_type TEXT NOT NULL,
    -- Comparison applied between the metric value and the threshold.
    operator TEXT NOT NULL CHECK (operator IN ('gt', 'lt', 'gte', 'lte', 'eq')),
    threshold DOUBLE PRECISION NOT NULL,
    severity TEXT NOT NULL CHECK (severity IN ('warning', 'critical')),
    enabled BOOLEAN NOT NULL DEFAULT TRUE,
    created_at TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    -- Only defaulted at insert; no trigger here keeps it current.
    updated_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- Alert history
-- Rule name, metric type, threshold and severity are stored on each row
-- alongside the rule_id reference.
CREATE TABLE IF NOT EXISTS alerts (
    id SERIAL,
    time TIMESTAMPTZ NOT NULL DEFAULT NOW(),
    machine_id TEXT NOT NULL,
    rule_id INTEGER REFERENCES alert_rules(id),
    rule_name TEXT NOT NULL,
    metric_type TEXT NOT NULL,
    value DOUBLE PRECISION NOT NULL,
    threshold DOUBLE PRECISION NOT NULL,
    severity TEXT NOT NULL,
    -- NULL while the alert is unresolved.
    resolved_at TIMESTAMPTZ,
    -- The key includes the time column the table is partitioned on below.
    PRIMARY KEY (id, time)
);
-- Hypertable with 1-day chunks
SELECT create_hypertable('alerts', 'time',
    chunk_time_interval => INTERVAL '1 day',
    if_not_exists => TRUE
);
-- Retention policies
-- Data older than the given interval is dropped automatically.
-- Raw data: 24 hours
SELECT add_retention_policy('metrics_raw', INTERVAL '24 hours', if_not_exists => TRUE);
-- 1-minute aggregates: 7 days
SELECT add_retention_policy('metrics_1m', INTERVAL '7 days', if_not_exists => TRUE);
-- 1-hour aggregates: 90 days
SELECT add_retention_policy('metrics_1h', INTERVAL '90 days', if_not_exists => TRUE);
-- Alerts: 30 days
SELECT add_retention_policy('alerts', INTERVAL '30 days', if_not_exists => TRUE);
-- Continuous aggregates for automatic downsampling
-- Rolls metrics_raw up into 1-minute buckets per machine, hostname and
-- metric type. Created WITH NO DATA, so it starts empty and is filled by
-- the refresh policy below.
CREATE MATERIALIZED VIEW IF NOT EXISTS metrics_1m_agg
WITH (timescaledb.continuous) AS
SELECT
    time_bucket('1 minute', time) AS time,
    machine_id,
    hostname,
    metric_type,
    AVG(value) AS avg_value,
    MIN(value) AS min_value,
    MAX(value) AS max_value,
    COUNT(*) AS sample_count
FROM metrics_raw
GROUP BY time_bucket('1 minute', time), machine_id, hostname, metric_type
WITH NO DATA;
-- Refresh policy for continuous aggregate: every minute, refresh the window
-- from 1 hour ago up to 1 minute ago.
SELECT add_continuous_aggregate_policy('metrics_1m_agg',
    start_offset => INTERVAL '1 hour',
    end_offset => INTERVAL '1 minute',
    schedule_interval => INTERVAL '1 minute',
    if_not_exists => TRUE
);
-- Insert default alert rules
-- Warning and critical thresholds for CPU, memory and disk usage. Rules whose
-- name already exists are left untouched.
INSERT INTO alert_rules (name, metric_type, operator, threshold, severity)
VALUES
    ('High CPU Usage', 'CPU_PERCENT', 'gt', 80.0, 'warning'),
    ('Critical CPU Usage', 'CPU_PERCENT', 'gt', 95.0, 'critical'),
    ('High Memory Usage', 'MEMORY_PERCENT', 'gt', 85.0, 'warning'),
    ('Critical Memory Usage', 'MEMORY_PERCENT', 'gt', 95.0, 'critical'),
    ('High Disk Usage', 'DISK_PERCENT', 'gt', 80.0, 'warning'),
    ('Critical Disk Usage', 'DISK_PERCENT', 'gt', 90.0, 'critical')
ON CONFLICT (name) DO NOTHING;

View File

@@ -0,0 +1,47 @@
# Multi-stage Dockerfile for Aggregator service
# Stages: base -> development, base -> production
# Paths are relative to the repository root, so build with the root as context.
FROM python:3.11-slim AS base

WORKDIR /app

# Install system dependencies including grpc_health_probe
# (the probe binary is the linux-amd64 build)
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
    && curl -fsSL https://github.com/grpc-ecosystem/grpc-health-probe/releases/download/v0.4.24/grpc_health_probe-linux-amd64 \
        -o /bin/grpc_health_probe \
    && chmod +x /bin/grpc_health_probe \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying code so this layer stays cached.
COPY services/aggregator/requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

COPY shared /app/shared
COPY proto /app/proto

# Generate gRPC code from proto into the shared package directory.
# NOTE(review): confirm the generated modules import correctly with
# PYTHONPATH=/app, since they are written into /app/shared.
RUN python -m grpc_tools.protoc \
    -I/app/proto \
    --python_out=/app/shared \
    --grpc_python_out=/app/shared \
    /app/proto/metrics.proto

COPY services/aggregator /app/services/aggregator

ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# =============================================================================
# Development stage - restarts the service when files under its directory change
FROM base AS development
RUN pip install --no-cache-dir watchfiles
CMD ["python", "-m", "watchfiles", "python services/aggregator/main.py", "/app/services/aggregator"]

# =============================================================================
# Production stage - runs as a non-root user
FROM base AS production
RUN useradd --create-home --shell /bin/bash appuser
USER appuser
EXPOSE 50051
CMD ["python", "services/aggregator/main.py"]

View File

@@ -0,0 +1,9 @@
grpcio>=1.60.0
grpcio-tools>=1.60.0
grpcio-health-checking>=1.60.0
redis>=5.0.0
asyncpg>=0.29.0
structlog>=23.2.0
python-json-logger>=2.0.7
pydantic>=2.5.0
pydantic-settings>=2.1.0

View File

@@ -0,0 +1,35 @@
# Multi-stage Dockerfile for Alerts service
# Stages: base -> development, base -> production
# Paths are relative to the repository root, so build with the root as context.
FROM python:3.11-slim AS base

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies before copying code so this layer stays cached.
COPY services/alerts/requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# No protoc step here: the proto directory is copied but not compiled.
COPY shared /app/shared
COPY proto /app/proto
COPY services/alerts /app/services/alerts

ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# =============================================================================
# Development stage - restarts the service when files under its directory change
FROM base AS development
RUN pip install --no-cache-dir watchfiles
CMD ["python", "-m", "watchfiles", "python services/alerts/main.py", "/app/services/alerts"]

# =============================================================================
# Production stage - runs as a non-root user
FROM base AS production
RUN useradd --create-home --shell /bin/bash appuser
USER appuser
CMD ["python", "services/alerts/main.py"]

View File

@@ -0,0 +1,6 @@
redis>=5.0.0
asyncpg>=0.29.0
structlog>=23.2.0
python-json-logger>=2.0.7
pydantic>=2.5.0
pydantic-settings>=2.1.0

View File

@@ -0,0 +1,55 @@
# Multi-stage Dockerfile for Collector service
# Stages: base -> development, base -> production
# Paths are relative to the repository root, so build with the root as context.

# =============================================================================
# Base stage - common dependencies
# =============================================================================
FROM python:3.11-slim AS base

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
        curl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies
COPY services/collector/requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy shared code and proto
COPY shared /app/shared
COPY proto /app/proto

# Generate gRPC code from proto
# NOTE(review): confirm the generated modules import correctly with
# PYTHONPATH=/app, since they are written into /app/shared.
RUN python -m grpc_tools.protoc \
    -I/app/proto \
    --python_out=/app/shared \
    --grpc_python_out=/app/shared \
    /app/proto/metrics.proto

# Copy service code
COPY services/collector /app/services/collector

ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# =============================================================================
# Development stage - with hot reload
# =============================================================================
FROM base AS development
RUN pip install --no-cache-dir watchfiles
CMD ["python", "-m", "watchfiles", "python services/collector/main.py", "/app/services/collector"]

# =============================================================================
# Production stage - optimized
# =============================================================================
FROM base AS production
# Run as non-root user
RUN useradd --create-home --shell /bin/bash appuser
USER appuser
CMD ["python", "services/collector/main.py"]

View File

@@ -0,0 +1,7 @@
grpcio>=1.60.0
grpcio-tools>=1.60.0
psutil>=5.9.0
structlog>=23.2.0
python-json-logger>=2.0.7
pydantic>=2.5.0
pydantic-settings>=2.1.0

View File

@@ -0,0 +1,44 @@
# Multi-stage Dockerfile for Gateway service (FastAPI)
# Stages: base -> development, base -> production
#
# The COPY instructions use paths relative to the build context
# (services/, shared/, proto/, web/), so the context must be the directory
# that contains those folders, e.g.:
#   docker build -f services/gateway/Dockerfile --target production .

# =============================================================================
# Base stage - common dependencies
# =============================================================================
# "AS" is upper-cased to match "FROM"; BuildKit's FromAsCasing check warns on
# mixed keyword casing.
FROM python:3.11-slim AS base

WORKDIR /app

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies (copied before the source so this layer stays
# cached when only application code changes)
COPY services/gateway/requirements.txt /app/requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy shared code and proto
COPY shared /app/shared
COPY proto /app/proto

# Generate gRPC code from proto
# NOTE(review): the generated stubs are written into /app/shared while
# PYTHONPATH is /app - confirm the services import them in a way that resolves
# from there (grpc_tools typically emits a top-level `import metrics_pb2`).
RUN python -m grpc_tools.protoc \
    -I/app/proto \
    --python_out=/app/shared \
    --grpc_python_out=/app/shared \
    /app/proto/metrics.proto

# Copy service code and the web assets
COPY services/gateway /app/services/gateway
COPY web /app/web

ENV PYTHONPATH=/app
ENV PYTHONUNBUFFERED=1

# =============================================================================
# Development stage - uvicorn with --reload
# =============================================================================
FROM base AS development

RUN pip install --no-cache-dir watchfiles

CMD ["uvicorn", "services.gateway.main:app", "--host", "0.0.0.0", "--port", "8000", "--reload"]

# =============================================================================
# Production stage - non-root, two uvicorn workers
# =============================================================================
FROM base AS production

# Run as non-root user
RUN useradd --create-home --shell /bin/bash appuser
USER appuser

EXPOSE 8000

CMD ["uvicorn", "services.gateway.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]

View File

@@ -0,0 +1,13 @@
fastapi>=0.109.0
uvicorn[standard]>=0.27.0
grpcio>=1.60.0
grpcio-tools>=1.60.0
redis>=5.0.0
asyncpg>=0.29.0
websockets>=12.0
jinja2>=3.1.2
structlog>=23.2.0
python-json-logger>=2.0.7
pydantic>=2.5.0
pydantic-settings>=2.1.0
httpx>=0.26.0

34
shared/events/__init__.py Normal file
View File

@@ -0,0 +1,34 @@
"""
Event publishing/subscribing abstraction layer.
Supports:
- Redis Pub/Sub (default, simple)
- Redis Streams (with consumer groups, persistence)
- Kafka (future, for high-throughput)
Usage:
from shared.events import get_publisher, get_subscriber
# Publishing
async with get_publisher() as pub:
await pub.publish("metrics.raw", {"machine_id": "m1", ...})
# Subscribing
async with get_subscriber(["metrics.raw", "alerts.*"]) as sub:
async for topic, message in sub.consume():
process(topic, message)
"""
from .base import EventPublisher, EventSubscriber, Event
from .redis_pubsub import RedisPubSubPublisher, RedisPubSubSubscriber
from .factory import get_publisher, get_subscriber
__all__ = [
"EventPublisher",
"EventSubscriber",
"Event",
"RedisPubSubPublisher",
"RedisPubSubSubscriber",
"get_publisher",
"get_subscriber",
]

117
shared/events/base.py Normal file
View File

@@ -0,0 +1,117 @@
"""Abstract base classes for event publishing and subscribing."""
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime
from typing import Any, AsyncIterator
import uuid
@dataclass
class Event:
    """Envelope that wraps a payload with routing and tracing metadata.

    Attributes:
        topic: Topic/channel the event belongs to.
        payload: The event data.
        event_id: Unique identifier; defaults to a freshly generated UUID4
            string.
        timestamp: Creation time; defaults to ``datetime.utcnow()`` (a naive
            datetime expressed in UTC).
        source: Identifier of the producer; empty string when not provided.
    """

    topic: str
    payload: dict[str, Any]
    event_id: str = field(default_factory=lambda: str(uuid.uuid4()))
    timestamp: datetime = field(default_factory=datetime.utcnow)
    source: str = ""

    def to_dict(self) -> dict[str, Any]:
        """Return the event as a plain dict.

        The timestamp is rendered as an ISO 8601 string; every other field is
        passed through unchanged.
        """
        serialized: dict[str, Any] = {}
        serialized["event_id"] = self.event_id
        serialized["topic"] = self.topic
        serialized["timestamp"] = self.timestamp.isoformat()
        serialized["source"] = self.source
        serialized["payload"] = self.payload
        return serialized

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "Event":
        """Rebuild an event from the dict shape produced by ``to_dict``.

        Only ``topic`` is mandatory. A missing ``event_id`` is replaced by a
        new UUID4, a missing ``timestamp`` by the current UTC time, a missing
        ``source`` by ``""`` and a missing ``payload`` by ``{}``.

        Raises:
            KeyError: If ``data`` has no ``"topic"`` key.
            ValueError: If ``timestamp`` is present but is not a valid ISO
                format string.
        """
        identifier = data.get("event_id", str(uuid.uuid4()))
        topic_name = data["topic"]

        if "timestamp" in data:
            created_at = datetime.fromisoformat(data["timestamp"])
        else:
            created_at = datetime.utcnow()

        return cls(
            topic=topic_name,
            payload=data.get("payload", {}),
            event_id=identifier,
            timestamp=created_at,
            source=data.get("source", ""),
        )
class EventPublisher(ABC):
    """Interface every event publisher backend implements.

    Instances work as async context managers: entering calls ``connect()``
    and leaving calls ``disconnect()``.
    """

    @abstractmethod
    async def connect(self) -> None:
        """Open the connection to the message broker."""

    @abstractmethod
    async def disconnect(self) -> None:
        """Tear down the connection to the message broker."""

    @abstractmethod
    async def publish(self, topic: str, payload: dict[str, Any], **kwargs) -> str:
        """Send one event to ``topic``.

        Args:
            topic: The topic/channel to publish to.
            payload: The event data.
            **kwargs: Additional options (e.g., headers, partition key).

        Returns:
            The ID of the published event.
        """

    async def publish_event(self, event: Event) -> str:
        """Publish an already constructed ``Event``.

        Only the event's topic, payload and ID are forwarded to ``publish``.

        Returns:
            The event ID reported by ``publish``.
        """
        published_id = await self.publish(
            event.topic,
            event.payload,
            event_id=event.event_id,
        )
        return published_id

    async def __aenter__(self) -> "EventPublisher":
        """Connect and hand back the publisher itself."""
        await self.connect()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Disconnect on exit; exceptions from the body are not suppressed."""
        await self.disconnect()
class EventSubscriber(ABC):
    """Abstract base for event subscribers.

    Instances work as async context managers: entering calls ``connect()``
    and leaving calls ``disconnect()``.
    """

    @abstractmethod
    async def connect(self) -> None:
        """Establish connection to the message broker."""

    @abstractmethod
    async def disconnect(self) -> None:
        """Close connection and unsubscribe."""

    @abstractmethod
    async def subscribe(self, topics: list[str]) -> None:
        """
        Subscribe to one or more topics.

        Args:
            topics: List of topics/patterns to subscribe to
        """

    @abstractmethod
    async def consume(self) -> AsyncIterator[Event]:
        """
        Async generator that yields events as they arrive.

        Implementations are expected to be async generators, used as
        ``async for event in subscriber.consume(): ...``.

        Yields:
            Event objects

        Raises:
            NotImplementedError: If the base implementation is iterated.
        """
        raise NotImplementedError
        # The unreachable ``yield`` makes this an async generator function, so
        # its type matches ``async def ... yield`` overrides. Without it the
        # method would be a coroutine function that *returns* an iterator, a
        # mismatch type checkers report.
        if False:  # pragma: no cover
            yield Event(topic="", payload={})

    async def __aenter__(self) -> "EventSubscriber":
        """Connect and return the subscriber itself."""
        await self.connect()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        """Disconnect on exit; exceptions from the body are not suppressed."""
        await self.disconnect()

101
shared/events/factory.py Normal file
View File

@@ -0,0 +1,101 @@
"""Factory functions for creating event publishers and subscribers."""
import os
from enum import Enum
from .base import EventPublisher, EventSubscriber
from .redis_pubsub import RedisPubSubPublisher, RedisPubSubSubscriber
class EventBackend(str, Enum):
    """Supported event backends.

    The enum also subclasses ``str``; each member's value is the identifier
    accepted by the factory functions and by the ``EVENTS_BACKEND``
    environment variable.
    """

    # Implemented by RedisPubSubPublisher / RedisPubSubSubscriber.
    REDIS_PUBSUB = "redis_pubsub"
    REDIS_STREAMS = "redis_streams"  # Future: factories raise NotImplementedError
    KAFKA = "kafka"  # Future: factories raise NotImplementedError
def _resolve_backend(backend: EventBackend | str | None) -> EventBackend:
    """Turn an explicit or environment-provided backend into an EventBackend.

    ``None`` falls back to the ``EVENTS_BACKEND`` environment variable; an
    unset or empty variable selects Redis Pub/Sub. String values are matched
    case-insensitively, ignoring surrounding whitespace.

    Raises:
        ValueError: If the value does not name a known backend.
    """
    if backend is None:
        # ``or`` (not a getenv default) so EVENTS_BACKEND="" also falls back.
        backend = os.getenv("EVENTS_BACKEND") or EventBackend.REDIS_PUBSUB

    if isinstance(backend, EventBackend):
        return backend

    if isinstance(backend, str):
        try:
            return EventBackend(backend.strip().lower())
        except ValueError:
            pass  # reported below with the list of valid names

    valid = ", ".join(member.value for member in EventBackend)
    raise ValueError(
        f"Unknown event backend: {backend!r} (expected one of: {valid})"
    )


def _resolve_redis_url(options: dict) -> str:
    """Pick the Redis URL: ``redis_url`` option, then REDIS_URL, then localhost.

    Empty values are skipped at every step, so a blank ``REDIS_URL`` does not
    override the built-in default.
    """
    return (
        options.get("redis_url")
        or os.getenv("REDIS_URL")
        or "redis://localhost:6379"
    )


def get_publisher(
    backend: EventBackend | str | None = None,
    source: str = "",
    **kwargs,
) -> EventPublisher:
    """
    Factory function to get an event publisher.

    The returned publisher is not connected yet; use it as an async context
    manager or call ``connect()``.

    Args:
        backend: The event backend to use (default: from EVENTS_BACKEND env var or redis_pubsub)
        source: Identifier for the source service
        **kwargs: Backend-specific options (``redis_url`` for Redis Pub/Sub)

    Returns:
        An EventPublisher instance

    Raises:
        ValueError: If the backend name is not recognised.
        NotImplementedError: For backends that are declared but not built yet.

    Environment variables:
        EVENTS_BACKEND: Default backend (redis_pubsub, redis_streams, kafka)
        REDIS_URL: Redis connection URL
        KAFKA_BOOTSTRAP_SERVERS: Kafka bootstrap servers (future)
    """
    resolved = _resolve_backend(backend)

    if resolved == EventBackend.REDIS_PUBSUB:
        return RedisPubSubPublisher(
            redis_url=_resolve_redis_url(kwargs),
            source=source,
        )
    if resolved == EventBackend.REDIS_STREAMS:
        raise NotImplementedError("Redis Streams backend not yet implemented")
    if resolved == EventBackend.KAFKA:
        raise NotImplementedError("Kafka backend not yet implemented")
    # Only reachable if a member is added to EventBackend without a branch here.
    raise ValueError(f"Unknown event backend: {resolved}")


def get_subscriber(
    topics: list[str] | None = None,
    backend: EventBackend | str | None = None,
    **kwargs,
) -> EventSubscriber:
    """
    Factory function to get an event subscriber.

    The returned subscriber is not connected yet; use it as an async context
    manager or call ``connect()``.

    Args:
        topics: Topics to subscribe to
        backend: The event backend to use (default: from EVENTS_BACKEND env var or redis_pubsub)
        **kwargs: Backend-specific options (``redis_url`` for Redis Pub/Sub)

    Returns:
        An EventSubscriber instance

    Raises:
        ValueError: If the backend name is not recognised.
        NotImplementedError: For backends that are declared but not built yet.

    Environment variables:
        EVENTS_BACKEND: Default backend (redis_pubsub, redis_streams, kafka)
        REDIS_URL: Redis connection URL
        KAFKA_BOOTSTRAP_SERVERS: Kafka bootstrap servers (future)
    """
    resolved = _resolve_backend(backend)

    if resolved == EventBackend.REDIS_PUBSUB:
        return RedisPubSubSubscriber(
            redis_url=_resolve_redis_url(kwargs),
            topics=topics,
        )
    if resolved == EventBackend.REDIS_STREAMS:
        raise NotImplementedError("Redis Streams backend not yet implemented")
    if resolved == EventBackend.KAFKA:
        raise NotImplementedError("Kafka backend not yet implemented")
    # Only reachable if a member is added to EventBackend without a branch here.
    raise ValueError(f"Unknown event backend: {resolved}")

View File

@@ -0,0 +1,142 @@
"""Redis Pub/Sub implementation of event publishing/subscribing."""
import asyncio
import json
import logging
from typing import Any, AsyncIterator
import redis.asyncio as redis
from .base import Event, EventPublisher, EventSubscriber
logger = logging.getLogger(__name__)
class RedisPubSubPublisher(EventPublisher):
    """Redis Pub/Sub based event publisher.

    Each event is serialised to JSON via ``Event.to_dict`` and published on
    the Redis channel named after the topic.
    """

    def __init__(
        self,
        redis_url: str = "redis://localhost:6379",
        source: str = "",
    ):
        """
        Args:
            redis_url: Connection URL handed to ``redis.from_url``.
            source: Value stored in the ``source`` field of every event.
        """
        self.redis_url = redis_url
        self.source = source
        self._client: redis.Redis | None = None

    async def connect(self) -> None:
        """Create the client and verify the server answers a PING.

        Raises:
            Exception: Whatever ``ping()`` raised. The client is closed first
                and the publisher stays in the "not connected" state.
        """
        client = redis.from_url(self.redis_url, decode_responses=True)
        try:
            await client.ping()
        except Exception:
            # Never keep a client that failed its first round-trip: otherwise
            # publish() would treat the publisher as connected and the
            # client's resources would never be released.
            await client.close()
            raise
        self._client = client
        logger.info(f"Connected to Redis at {self.redis_url}")

    async def disconnect(self) -> None:
        """Close the client if one is open; safe to call repeatedly."""
        if self._client:
            await self._client.close()
            self._client = None
            logger.info("Disconnected from Redis")

    async def publish(self, topic: str, payload: dict[str, Any], **kwargs) -> str:
        """Publish ``payload`` on ``topic`` wrapped in an event envelope.

        Args:
            topic: Redis channel to publish on.
            payload: The event data; must be JSON serialisable.
            **kwargs: ``event_id`` may be given to reuse an existing ID;
                other options are ignored by this backend.

        Returns:
            The ID of the published event.

        Raises:
            RuntimeError: If ``connect()`` has not been called successfully.
        """
        if not self._client:
            raise RuntimeError("Publisher not connected")

        envelope: dict[str, Any] = {
            "topic": topic,
            "payload": payload,
            "source": self.source,
        }
        # Reuse a caller-supplied ID when present; otherwise Event's own
        # default factory generates a fresh UUID.
        supplied_id = kwargs.get("event_id")
        if supplied_id:
            envelope["event_id"] = supplied_id
        event = Event(**envelope)

        message = json.dumps(event.to_dict())
        await self._client.publish(topic, message)
        logger.debug(f"Published event {event.event_id} to {topic}")
        return event.event_id
class RedisPubSubSubscriber(EventSubscriber):
    """Redis Pub/Sub based event subscriber.

    Topics containing ``*`` are registered as pattern subscriptions; all
    others as plain channel subscriptions.
    """

    def __init__(
        self,
        redis_url: str = "redis://localhost:6379",
        topics: list[str] | None = None,
    ):
        """
        Args:
            redis_url: Connection URL handed to ``redis.from_url``.
            topics: Topics/patterns subscribed automatically by ``connect()``.
        """
        self.redis_url = redis_url
        # Copy, so bookkeeping in subscribe() never mutates the caller's list.
        self._topics: list[str] = list(topics or [])
        self._client: redis.Redis | None = None
        self._pubsub: redis.client.PubSub | None = None
        self._running = False

    async def connect(self) -> None:
        """Connect, verify the server with PING, and subscribe to the initial topics.

        Raises:
            Exception: Whatever ``ping()`` raised. The client is closed first
                and the subscriber stays in the "not connected" state.
        """
        client = redis.from_url(self.redis_url, decode_responses=True)
        try:
            await client.ping()
        except Exception:
            # Do not keep (or leak) a client that failed its first round-trip.
            await client.close()
            raise
        self._client = client
        self._pubsub = client.pubsub()
        logger.info(f"Connected to Redis at {self.redis_url}")

        if self._topics:
            await self.subscribe(self._topics)

    async def disconnect(self) -> None:
        """Stop ``consume()``, unsubscribe and close the connection."""
        self._running = False
        if self._pubsub:
            await self._pubsub.unsubscribe()
            await self._pubsub.close()
            self._pubsub = None
        if self._client:
            await self._client.close()
            self._client = None
            logger.info("Disconnected from Redis")

    async def subscribe(self, topics: list[str]) -> None:
        """Subscribe to ``topics`` and remember them.

        Args:
            topics: Channel names, or patterns containing ``*``.

        Raises:
            RuntimeError: If the subscriber is not connected.
        """
        if not self._pubsub:
            raise RuntimeError("Subscriber not connected")

        # Snapshot first: connect() passes self._topics itself, which is
        # updated further down.
        requested = list(topics)

        # Separate pattern subscriptions from regular ones
        patterns = [t for t in requested if "*" in t]
        channels = [t for t in requested if "*" not in t]

        if channels:
            await self._pubsub.subscribe(*channels)
            logger.info(f"Subscribed to channels: {channels}")
        if patterns:
            await self._pubsub.psubscribe(*patterns)
            logger.info(f"Subscribed to patterns: {patterns}")

        # Record each topic once; a plain extend() duplicated the whole list
        # whenever connect() re-submitted the stored topics.
        for topic in requested:
            if topic not in self._topics:
                self._topics.append(topic)

    @staticmethod
    def _parse_event(raw: Any) -> Event | None:
        """Decode one raw message body into an Event, or None if it is malformed."""
        try:
            return Event.from_dict(json.loads(raw))
        except (ValueError, KeyError, TypeError, AttributeError) as e:
            # ValueError covers invalid JSON and invalid ISO timestamps;
            # TypeError/AttributeError cover bodies that are not JSON objects.
            logger.warning(f"Failed to parse event: {e}")
            return None

    async def consume(self) -> AsyncIterator[Event]:
        """Yield events as they arrive until ``disconnect()`` or cancellation.

        Malformed messages are logged and skipped. Errors while reading are
        logged and retried after one second. Cancellation while waiting for a
        message ends the generator without re-raising.

        Raises:
            RuntimeError: If the subscriber is not connected.
        """
        if not self._pubsub:
            raise RuntimeError("Subscriber not connected")

        self._running = True
        while self._running:
            try:
                message = await self._pubsub.get_message(
                    ignore_subscribe_messages=True,
                    timeout=1.0,
                )
                if message is None:
                    await asyncio.sleep(0.01)
                    continue
            except asyncio.CancelledError:
                self._running = False
                break
            except Exception as e:
                logger.error(f"Error consuming events: {e}")
                await asyncio.sleep(1.0)
                continue

            if message["type"] not in ("message", "pmessage"):
                continue

            # Parse outside the read-error handler, so a bad message is just
            # skipped instead of triggering the one-second back-off.
            event = self._parse_event(message["data"])
            if event is not None:
                yield event