claude final draft

This commit is contained in:
buenosairesam
2025-12-29 23:44:30 -03:00
parent 116d4032e2
commit e5aafd5097
22 changed files with 2815 additions and 32 deletions

View File

@@ -1,32 +0,0 @@
# ctlptl configuration for Kind cluster
# Usage: ctlptl apply -f ctlptl.yaml
#
# NOTE: indentation below was reconstructed - the nested maps (cluster spec,
# node list, port mappings, kubeadm patches) were flattened to column 0,
# which is not valid YAML for this structure.
apiVersion: ctlptl.dev/v1alpha1
kind: Registry
name: sysmonstm-registry
port: 5005
---
apiVersion: ctlptl.dev/v1alpha1
kind: Cluster
product: kind
registry: sysmonstm-registry
kindV1Alpha4Cluster:
  name: sysmonstm
  nodes:
    - role: control-plane
      extraPortMappings:
        # Gateway HTTP
        - containerPort: 30080
          hostPort: 8080
          protocol: TCP
        # Aggregator gRPC
        - containerPort: 30051
          hostPort: 50051
          protocol: TCP
      # Resource limits for t2.small compatibility
      kubeadmConfigPatches:
        - |
          kind: InitConfiguration
          nodeRegistration:
            kubeletExtraArgs:
              system-reserved: memory=256Mi

24
scripts/generate-proto.sh Executable file
View File

@@ -0,0 +1,24 @@
#!/bin/bash
# Generate Python gRPC code from proto definitions
# Usage: scripts/generate-proto.sh (no arguments; run from anywhere)
set -e

# Resolve the project root relative to this script so the generator works
# regardless of the caller's current working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$SCRIPT_DIR/.."
cd "$PROJECT_ROOT"

echo "Generating Python gRPC code from proto/metrics.proto..."
# Emit both the message module (metrics_pb2.py) and the service stubs
# (metrics_pb2_grpc.py) into the shared/ package.
python -m grpc_tools.protoc \
    -I./proto \
    --python_out=./shared \
    --grpc_python_out=./shared \
    ./proto/metrics.proto

# Fix imports in generated files (grpc_tools generates incorrect imports)
# The stub module imports "metrics_pb2" as a top-level name; rewrite it to a
# package-qualified import so "from shared import metrics_pb2_grpc" works.
sed -i 's/import metrics_pb2/from shared import metrics_pb2/' shared/metrics_pb2_grpc.py

echo "Generated:"
echo " - shared/metrics_pb2.py"
echo " - shared/metrics_pb2_grpc.py"

View File

@@ -0,0 +1 @@
"""Aggregator service."""

361
services/aggregator/main.py Normal file
View File

@@ -0,0 +1,361 @@
"""Aggregator service - gRPC server that receives metrics and stores them."""
import asyncio
import signal
import sys
from pathlib import Path
import grpc
from grpc_health.v1 import health, health_pb2, health_pb2_grpc
# Add project root to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from services.aggregator.storage import RedisStorage, TimescaleStorage
from shared import metrics_pb2, metrics_pb2_grpc
from shared.config import get_aggregator_config
from shared.events import get_publisher
from shared.logging import setup_logging
class MetricsServicer(metrics_pb2_grpc.MetricsServiceServicer):
    """gRPC servicer for metrics ingestion.

    Receives client-streamed metrics, groups them into per-machine batches,
    and fans each batch out to Redis (current state), TimescaleDB
    (historical, best-effort) and the "metrics.raw" event topic.
    """

    def __init__(
        self,
        redis_storage: RedisStorage,
        timescale_storage: TimescaleStorage,
        event_publisher,
        logger,
    ):
        # Collaborators are injected; this class performs no connection
        # management of its own.
        self.redis = redis_storage
        self.timescale = timescale_storage
        self.publisher = event_publisher
        self.logger = logger

    async def StreamMetrics(self, request_iterator, context):
        """Receive streaming metrics from a collector.

        Returns a StreamAck carrying a success flag, the total number of
        metrics consumed, and "OK" or the stringified exception.
        """
        metrics_received = 0
        current_machine = None
        # Pending (metric_type, value, labels) tuples for the current machine.
        current_batch: list[tuple[str, float, dict]] = []
        batch_timestamp = 0
        batch_hostname = ""
        try:
            async for metric in request_iterator:
                metrics_received += 1
                # Track current machine
                if current_machine != metric.machine_id:
                    # Flush previous batch if switching machines
                    if current_machine and current_batch:
                        await self._flush_batch(
                            current_machine,
                            batch_hostname,
                            batch_timestamp,
                            current_batch,
                        )
                        current_batch = []
                    current_machine = metric.machine_id
                    self.logger.info(
                        "collector_connected",
                        machine_id=metric.machine_id,
                        hostname=metric.hostname,
                    )
                # Get metric type name
                metric_type = metrics_pb2.MetricType.Name(metric.type)
                # Add to batch
                current_batch.append(
                    (
                        metric_type,
                        metric.value,
                        dict(metric.labels),
                    )
                )
                # The batch carries the most recent timestamp/hostname seen.
                batch_timestamp = metric.timestamp_ms
                batch_hostname = metric.hostname
                # Flush batch every 20 metrics or if timestamp changes significantly
                if len(current_batch) >= 20:
                    await self._flush_batch(
                        current_machine, batch_hostname, batch_timestamp, current_batch
                    )
                    current_batch = []
            # Flush remaining
            if current_machine and current_batch:
                await self._flush_batch(
                    current_machine, batch_hostname, batch_timestamp, current_batch
                )
            self.logger.info(
                "stream_completed",
                machine_id=current_machine,
                metrics_received=metrics_received,
            )
            return metrics_pb2.StreamAck(
                success=True,
                metrics_received=metrics_received,
                message="OK",
            )
        except Exception as e:
            # Report the failure in the ack rather than aborting the RPC.
            self.logger.error(
                "stream_error",
                error=str(e),
                machine_id=current_machine,
                metrics_received=metrics_received,
            )
            return metrics_pb2.StreamAck(
                success=False,
                metrics_received=metrics_received,
                message=str(e),
            )

    async def _flush_batch(
        self,
        machine_id: str,
        hostname: str,
        timestamp_ms: int,
        batch: list[tuple[str, float, dict]],
    ) -> None:
        """Flush a batch of metrics to storage and events."""
        # Aggregate metrics for Redis state
        metrics_dict = {}
        for metric_type, value, labels in batch:
            key = metric_type
            if labels:
                # Labels are encoded into the key, e.g.
                # "CPU_PERCENT_PER_CORE:core=0"; GetCurrentState decodes this.
                key = f"{metric_type}:{','.join(f'{k}={v}' for k, v in labels.items())}"
            metrics_dict[key] = value
        # Update Redis (current state)
        await self.redis.update_machine_state(
            machine_id=machine_id,
            hostname=hostname,
            metrics=metrics_dict,
            timestamp_ms=timestamp_ms,
        )
        # Insert into TimescaleDB (historical). Best-effort: a failure is
        # logged but must not fail the stream.
        try:
            await self.timescale.insert_metrics(
                machine_id=machine_id,
                hostname=hostname,
                timestamp_ms=timestamp_ms,
                metrics=batch,
            )
        except Exception as e:
            self.logger.warning("timescale_insert_failed", error=str(e))
        # Update machine registry (also best-effort)
        try:
            await self.timescale.update_machine_registry(
                machine_id=machine_id,
                hostname=hostname,
            )
        except Exception as e:
            self.logger.warning("machine_registry_update_failed", error=str(e))
        # Publish event for subscribers (alerts, gateway)
        await self.publisher.publish(
            topic="metrics.raw",
            payload={
                "machine_id": machine_id,
                "hostname": hostname,
                "timestamp_ms": timestamp_ms,
                "metrics": metrics_dict,
            },
        )
        self.logger.debug(
            "batch_flushed",
            machine_id=machine_id,
            count=len(batch),
        )

    async def GetCurrentState(self, request, context):
        """Get current state for a single machine.

        Responds NOT_FOUND (with an empty MachineState) when the machine has
        no state in Redis.
        """
        state = await self.redis.get_machine_state(request.machine_id)
        if not state:
            context.set_code(grpc.StatusCode.NOT_FOUND)
            context.set_details(f"Machine {request.machine_id} not found")
            return metrics_pb2.MachineState()
        # Convert state to proto. Keys were encoded by _flush_batch as
        # "TYPE" or "TYPE:k=v,k=v".
        metrics = []
        for key, value in state.get("metrics", {}).items():
            parts = key.split(":")
            metric_type_str = parts[0]
            labels = {}
            if len(parts) > 1:
                for pair in parts[1].split(","):
                    # NOTE(review): raises ValueError if a label value itself
                    # contains "=" or "," - assumes label values never do.
                    k, v = pair.split("=")
                    labels[k] = v
            # Unknown type names fall back to 0 (the unspecified enum value).
            metric_type = getattr(metrics_pb2, metric_type_str, 0)
            metrics.append(
                metrics_pb2.Metric(
                    machine_id=state["machine_id"],
                    hostname=state["hostname"],
                    timestamp_ms=state["last_seen_ms"],
                    type=metric_type,
                    value=value,
                    labels=labels,
                )
            )
        return metrics_pb2.MachineState(
            machine_id=state["machine_id"],
            hostname=state["hostname"],
            last_seen_ms=state["last_seen_ms"],
            current_metrics=metrics,
            health=metrics_pb2.HEALTHY,
        )

    async def GetAllStates(self, request, context):
        """Get current state for all machines.

        Unlike GetCurrentState, labels are not decoded into the Metric
        messages here.
        """
        states = await self.redis.get_all_machines()
        machine_states = []
        for state in states:
            metrics = []
            for key, value in state.get("metrics", {}).items():
                parts = key.split(":")
                metric_type_str = parts[0]
                metric_type = getattr(metrics_pb2, metric_type_str, 0)
                metrics.append(
                    metrics_pb2.Metric(
                        machine_id=state["machine_id"],
                        hostname=state["hostname"],
                        timestamp_ms=state["last_seen_ms"],
                        type=metric_type,
                        value=value,
                    )
                )
            machine_states.append(
                metrics_pb2.MachineState(
                    machine_id=state["machine_id"],
                    hostname=state["hostname"],
                    last_seen_ms=state["last_seen_ms"],
                    current_metrics=metrics,
                    health=metrics_pb2.HEALTHY,
                )
            )
        return metrics_pb2.AllMachinesState(machines=machine_states)
class AggregatorService:
    """Main aggregator service: owns the gRPC server and its backends."""

    def __init__(self):
        self.config = get_aggregator_config()
        self.logger = setup_logging(
            service_name=self.config.service_name,
            log_level=self.config.log_level,
            log_format=self.config.log_format,
        )
        self.redis = RedisStorage(self.config.redis_url)
        self.timescale = TimescaleStorage(self.config.timescale_url)
        self.publisher = get_publisher(source="aggregator")
        # Created in start(); None while the server is not running.
        self.server: grpc.aio.Server | None = None
        self.running = False

    async def start(self) -> None:
        """Start the gRPC server.

        Redis is required (failures propagate); TimescaleDB is optional -
        when unreachable the service keeps serving live state only.
        """
        self.running = True
        # Connect to storage
        await self.redis.connect()
        try:
            await self.timescale.connect()
        except Exception as e:
            self.logger.warning(
                "timescale_connection_failed",
                error=str(e),
                message="Continuing without TimescaleDB - metrics won't be persisted",
            )
        # Connect to event publisher
        await self.publisher.connect()
        # Create gRPC server
        self.server = grpc.aio.server()
        # Add metrics servicer
        servicer = MetricsServicer(
            redis_storage=self.redis,
            timescale_storage=self.timescale,
            event_publisher=self.publisher,
            logger=self.logger,
        )
        metrics_pb2_grpc.add_MetricsServiceServicer_to_server(servicer, self.server)
        # Add health check servicer so orchestrators can probe the server.
        # NOTE(review): health.HealthServicer is the threading-based servicer;
        # grpc_health also ships an asyncio variant - confirm this one is
        # compatible with grpc.aio on the pinned grpcio-health-checking version.
        health_servicer = health.HealthServicer()
        health_servicer.set("", health_pb2.HealthCheckResponse.SERVING)
        health_servicer.set("MetricsService", health_pb2.HealthCheckResponse.SERVING)
        health_pb2_grpc.add_HealthServicer_to_server(health_servicer, self.server)
        # Start server on all interfaces (IPv6 wildcard also accepts IPv4).
        listen_addr = f"[::]:{self.config.grpc_port}"
        self.server.add_insecure_port(listen_addr)
        await self.server.start()
        self.logger.info(
            "aggregator_started",
            port=self.config.grpc_port,
            listen_addr=listen_addr,
        )

    async def stop(self) -> None:
        """Stop the gRPC server and disconnect all backends."""
        self.running = False
        if self.server:
            # Give in-flight RPCs up to 5 seconds to complete.
            await self.server.stop(grace=5)
            self.server = None
        await self.publisher.disconnect()
        # The storage disconnect helpers tolerate never-connected backends.
        await self.timescale.disconnect()
        await self.redis.disconnect()
        self.logger.info("aggregator_stopped")

    async def wait(self) -> None:
        """Block until the server terminates."""
        if self.server:
            await self.server.wait_for_termination()
async def main():
    """Main entry point: start the aggregator and block until it terminates."""
    service = AggregatorService()
    # Handle shutdown signals. Use get_running_loop(): calling
    # asyncio.get_event_loop() from inside a coroutine is deprecated since
    # Python 3.10 and can return a loop other than the one running us.
    loop = asyncio.get_running_loop()

    async def shutdown():
        service.logger.info("shutdown_signal_received")
        await service.stop()

    def _schedule_shutdown():
        # add_signal_handler requires a synchronous callback; schedule the
        # async shutdown as a task on the running loop.
        asyncio.create_task(shutdown())

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, _schedule_shutdown)
    await service.start()
    await service.wait()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,245 @@
"""Storage layer for metrics - Redis (current state) and TimescaleDB (historical)."""
import json
import time
from datetime import datetime
from typing import Any
import asyncpg
import redis.asyncio as redis
from shared.logging import get_logger
logger = get_logger("storage")
class RedisStorage:
    """Redis storage for current machine state.

    Stores one hash per machine under "machine:{machine_id}" (fields "state"
    and "last_seen") plus a "machines:active" set used to enumerate machines.
    """

    def __init__(self, redis_url: str):
        self.redis_url = redis_url
        # Created in connect(); None while disconnected.
        self._client: redis.Redis | None = None

    async def connect(self) -> None:
        """Open the client and verify connectivity with a PING."""
        self._client = redis.from_url(self.redis_url, decode_responses=True)
        await self._client.ping()
        logger.info("redis_connected", url=self.redis_url)

    async def disconnect(self) -> None:
        """Close the client; safe to call when never connected."""
        if self._client:
            # NOTE(review): redis-py >= 5 prefers aclose(); close() still
            # works but may emit a deprecation warning - confirm against the
            # pinned redis version.
            await self._client.close()
            self._client = None
            logger.info("redis_disconnected")

    async def update_machine_state(
        self,
        machine_id: str,
        hostname: str,
        metrics: dict[str, float],
        timestamp_ms: int,
    ) -> None:
        """Update the current state for a machine.

        Raises:
            RuntimeError: if connect() has not been called.
        """
        if not self._client:
            raise RuntimeError("Not connected to Redis")
        state = {
            "machine_id": machine_id,
            "hostname": hostname,
            "last_seen_ms": timestamp_ms,
            "metrics": metrics,
            # Naive UTC timestamp, informational only.
            "updated_at": datetime.utcnow().isoformat(),
        }
        # Store as hash for efficient partial reads
        key = f"machine:{machine_id}"
        await self._client.hset(
            key,
            mapping={
                "state": json.dumps(state),
                "last_seen": str(timestamp_ms),
            },
        )
        # Set expiry - if no updates for 5 minutes, consider stale
        await self._client.expire(key, 300)
        # Add to active machines set
        await self._client.sadd("machines:active", machine_id)

    async def get_machine_state(self, machine_id: str) -> dict[str, Any] | None:
        """Get current state for a machine, or None if unknown or expired.

        Raises:
            RuntimeError: if connect() has not been called.
        """
        if not self._client:
            raise RuntimeError("Not connected to Redis")
        key = f"machine:{machine_id}"
        data = await self._client.hget(key, "state")
        if data:
            return json.loads(data)
        return None

    async def get_all_machines(self) -> list[dict[str, Any]]:
        """Get current state for all active machines.

        Side effect: machines whose state hash has expired are pruned from
        the "machines:active" set.

        Raises:
            RuntimeError: if connect() has not been called.
        """
        if not self._client:
            raise RuntimeError("Not connected to Redis")
        machine_ids = await self._client.smembers("machines:active")
        states = []
        for machine_id in machine_ids:
            state = await self.get_machine_state(machine_id)
            if state:
                states.append(state)
            else:
                # Remove stale machine from active set
                await self._client.srem("machines:active", machine_id)
        return states
class TimescaleStorage:
    """TimescaleDB storage for historical metrics, backed by an asyncpg pool."""

    def __init__(self, connection_url: str):
        self.connection_url = connection_url
        # Created in connect(); None while disconnected.
        self._pool: asyncpg.Pool | None = None

    async def connect(self) -> None:
        """Create the connection pool (2-10 connections)."""
        self._pool = await asyncpg.create_pool(
            self.connection_url,
            min_size=2,
            max_size=10,
        )
        logger.info("timescaledb_connected")

    async def disconnect(self) -> None:
        """Close the pool; safe to call when never connected."""
        if self._pool:
            await self._pool.close()
            self._pool = None
            logger.info("timescaledb_disconnected")

    async def insert_metrics(
        self,
        machine_id: str,
        hostname: str,
        timestamp_ms: int,
        metrics: list[tuple[str, float, dict[str, str]]],
    ) -> int:
        """
        Insert a batch of metrics.

        Args:
            machine_id: Machine identifier
            hostname: Machine hostname
            timestamp_ms: Timestamp in milliseconds
            metrics: List of (metric_type, value, labels) tuples

        Returns:
            Number of rows inserted

        Raises:
            RuntimeError: if connect() has not been called.
        """
        if not self._pool:
            raise RuntimeError("Not connected to TimescaleDB")
        # Naive datetime in UTC - assumes the "time" column is timezone-less.
        timestamp = datetime.utcfromtimestamp(timestamp_ms / 1000)
        # Prepare batch insert; labels are serialized to JSON text.
        rows = [
            (timestamp, machine_id, hostname, metric_type, value, json.dumps(labels))
            for metric_type, value, labels in metrics
        ]
        async with self._pool.acquire() as conn:
            await conn.executemany(
                """
                INSERT INTO metrics_raw (time, machine_id, hostname, metric_type, value, labels)
                VALUES ($1, $2, $3, $4, $5, $6)
                """,
                rows,
            )
        return len(rows)

    async def update_machine_registry(
        self,
        machine_id: str,
        hostname: str,
        health: str = "HEALTHY",
    ) -> None:
        """Upsert the machines registry row, refreshing last_seen to NOW().

        Raises:
            RuntimeError: if connect() has not been called.
        """
        if not self._pool:
            raise RuntimeError("Not connected to TimescaleDB")
        async with self._pool.acquire() as conn:
            await conn.execute(
                """
                INSERT INTO machines (machine_id, hostname, last_seen, health)
                VALUES ($1, $2, NOW(), $3)
                ON CONFLICT (machine_id) DO UPDATE
                SET hostname = $2, last_seen = NOW(), health = $3
                """,
                machine_id,
                hostname,
                health,
            )

    async def get_metrics(
        self,
        machine_id: str | None = None,
        metric_type: str | None = None,
        start_time: datetime | None = None,
        end_time: datetime | None = None,
        limit: int = 1000,
    ) -> list[dict[str, Any]]:
        """Query historical metrics, newest first.

        All filters are optional and combined with AND. Filter values are
        passed as numbered query parameters; only the fixed column predicates
        are interpolated into the SQL text, so user input never reaches the
        query string.

        Raises:
            RuntimeError: if connect() has not been called.
        """
        if not self._pool:
            raise RuntimeError("Not connected to TimescaleDB")
        conditions = []
        params = []
        param_idx = 1
        if machine_id:
            conditions.append(f"machine_id = ${param_idx}")
            params.append(machine_id)
            param_idx += 1
        if metric_type:
            conditions.append(f"metric_type = ${param_idx}")
            params.append(metric_type)
            param_idx += 1
        if start_time:
            conditions.append(f"time >= ${param_idx}")
            params.append(start_time)
            param_idx += 1
        if end_time:
            conditions.append(f"time <= ${param_idx}")
            params.append(end_time)
            param_idx += 1
        where_clause = " AND ".join(conditions) if conditions else "TRUE"
        # param_idx now points at the LIMIT placeholder.
        query = f"""
            SELECT time, machine_id, hostname, metric_type, value, labels
            FROM metrics_raw
            WHERE {where_clause}
            ORDER BY time DESC
            LIMIT ${param_idx}
        """
        params.append(limit)
        async with self._pool.acquire() as conn:
            rows = await conn.fetch(query, *params)
        return [
            {
                "time": row["time"].isoformat(),
                "machine_id": row["machine_id"],
                "hostname": row["hostname"],
                "metric_type": row["metric_type"],
                "value": row["value"],
                "labels": json.loads(row["labels"]) if row["labels"] else {},
            }
            for row in rows
        ]

View File

@@ -14,6 +14,12 @@ RUN pip install --no-cache-dir -r requirements.txt
COPY shared /app/shared
COPY proto /app/proto
RUN python -m grpc_tools.protoc \
    -I/app/proto \
    --python_out=/app/shared \
    --grpc_python_out=/app/shared \
    /app/proto/metrics.proto
COPY services/alerts /app/services/alerts
ENV PYTHONPATH=/app

View File

@@ -0,0 +1 @@
"""Alerts service."""

317
services/alerts/main.py Normal file
View File

@@ -0,0 +1,317 @@
"""Alerts service - subscribes to metrics events and evaluates thresholds."""
import asyncio
import signal
import sys
from dataclasses import dataclass
from datetime import datetime
from pathlib import Path
from typing import Any
import asyncpg
# Add project root to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from shared.config import get_alerts_config
from shared.events import get_publisher, get_subscriber
from shared.logging import setup_logging
@dataclass
class AlertRule:
    """An alert rule configuration."""

    id: int
    name: str
    metric_type: str
    operator: str  # gt, lt, gte, lte, eq
    threshold: float
    severity: str  # warning, critical
    enabled: bool


@dataclass
class Alert:
    """A triggered alert."""

    rule: AlertRule
    machine_id: str
    value: float
    triggered_at: datetime


class AlertEvaluator:
    """Evaluates metrics against alert rules.

    Several rules may target the same metric type (e.g. a "warning" rule at
    80% and a "critical" rule at 95% for CPU_PERCENT); every matching rule is
    evaluated independently.
    """

    OPERATORS = {
        "gt": lambda v, t: v > t,
        "lt": lambda v, t: v < t,
        "gte": lambda v, t: v >= t,
        "lte": lambda v, t: v <= t,
        "eq": lambda v, t: v == t,
    }

    def __init__(self, rules: list[AlertRule]):
        # BUG FIX: the previous implementation stored rules in a dict keyed
        # by metric_type, which silently dropped all but the last rule per
        # metric - e.g. the default "High CPU Usage" (warning, 80%) rule was
        # shadowed by "Critical CPU Usage" (critical, 95%). Rules are now
        # grouped per metric type so all of them fire.
        self.rules: dict[str, list[AlertRule]] = {}
        self.update_rules(rules)
        # Track active alerts to avoid duplicates
        self.active_alerts: dict[str, Alert] = {}  # key: f"{machine_id}:{rule_name}"

    def evaluate(self, machine_id: str, metrics: dict[str, float]) -> list[Alert]:
        """Evaluate metrics against rules and return newly triggered alerts.

        Each alert fires once when its threshold is first exceeded and is
        resolved (dropped from the active set) once the metric is back within
        the threshold.
        """
        new_alerts: list[Alert] = []
        for metric_type, value in metrics.items():
            for rule in self.rules.get(metric_type, []):
                op_func = self.OPERATORS.get(rule.operator)
                if not op_func:
                    # Unknown operator: skip the rule rather than crash.
                    continue
                alert_key = f"{machine_id}:{rule.name}"
                if op_func(value, rule.threshold):
                    # Threshold exceeded - trigger only if not already active
                    if alert_key not in self.active_alerts:
                        alert = Alert(
                            rule=rule,
                            machine_id=machine_id,
                            value=value,
                            triggered_at=datetime.utcnow(),
                        )
                        self.active_alerts[alert_key] = alert
                        new_alerts.append(alert)
                else:
                    # Threshold no longer exceeded - resolve alert (no-op if
                    # the alert was never active).
                    self.active_alerts.pop(alert_key, None)
        return new_alerts

    def update_rules(self, rules: list[AlertRule]) -> None:
        """Replace the rule set; disabled rules are ignored."""
        grouped: dict[str, list[AlertRule]] = {}
        for rule in rules:
            if rule.enabled:
                grouped.setdefault(rule.metric_type, []).append(rule)
        self.rules = grouped
class AlertsService:
    """Main alerts service.

    Subscribes to the "metrics.raw" topic, evaluates each event against the
    configured rules, and stores/publishes any alerts that fire.
    """

    def __init__(self):
        self.config = get_alerts_config()
        self.logger = setup_logging(
            service_name=self.config.service_name,
            log_level=self.config.log_level,
            log_format=self.config.log_format,
        )
        self.running = False
        # Both populated in run(); None until then.
        self.db_pool: asyncpg.Pool | None = None
        self.evaluator: AlertEvaluator | None = None
        self.subscriber = get_subscriber(topics=["metrics.raw"])
        self.publisher = get_publisher(source="alerts")

    async def connect_db(self) -> None:
        """Connect to TimescaleDB for rules and alert storage.

        The database is optional: on failure the pool stays None, the service
        falls back to default rules and skips alert persistence.
        """
        try:
            self.db_pool = await asyncpg.create_pool(
                self.config.timescale_url,
                min_size=1,
                max_size=5,
            )
            self.logger.info("database_connected")
        except Exception as e:
            self.logger.warning("database_connection_failed", error=str(e))
            self.db_pool = None

    async def load_rules(self) -> list[AlertRule]:
        """Load alert rules from database, or built-in defaults when offline."""
        if not self.db_pool:
            # Return default rules if no database
            return [
                AlertRule(
                    1, "High CPU Usage", "CPU_PERCENT", "gt", 80.0, "warning", True
                ),
                AlertRule(
                    2, "Critical CPU Usage", "CPU_PERCENT", "gt", 95.0, "critical", True
                ),
                AlertRule(
                    3,
                    "High Memory Usage",
                    "MEMORY_PERCENT",
                    "gt",
                    85.0,
                    "warning",
                    True,
                ),
                AlertRule(
                    4,
                    "Critical Memory Usage",
                    "MEMORY_PERCENT",
                    "gt",
                    95.0,
                    "critical",
                    True,
                ),
                AlertRule(
                    5, "High Disk Usage", "DISK_PERCENT", "gt", 80.0, "warning", True
                ),
                AlertRule(
                    6,
                    "Critical Disk Usage",
                    "DISK_PERCENT",
                    "gt",
                    90.0,
                    "critical",
                    True,
                ),
            ]
        async with self.db_pool.acquire() as conn:
            rows = await conn.fetch(
                "SELECT id, name, metric_type, operator, threshold, severity, enabled FROM alert_rules"
            )
        return [
            AlertRule(
                id=row["id"],
                name=row["name"],
                metric_type=row["metric_type"],
                operator=row["operator"],
                threshold=row["threshold"],
                severity=row["severity"],
                enabled=row["enabled"],
            )
            for row in rows
        ]

    async def store_alert(self, alert: Alert) -> None:
        """Store triggered alert in database (no-op when offline)."""
        if not self.db_pool:
            return
        try:
            async with self.db_pool.acquire() as conn:
                await conn.execute(
                    """
                    INSERT INTO alerts (time, machine_id, rule_id, rule_name, metric_type, value, threshold, severity)
                    VALUES ($1, $2, $3, $4, $5, $6, $7, $8)
                    """,
                    alert.triggered_at,
                    alert.machine_id,
                    alert.rule.id,
                    alert.rule.name,
                    alert.rule.metric_type,
                    alert.value,
                    alert.rule.threshold,
                    alert.rule.severity,
                )
        except Exception as e:
            # Persistence is best-effort; the alert was already published/logged.
            self.logger.warning("alert_storage_failed", error=str(e))

    async def publish_alert(self, alert: Alert) -> None:
        """Publish alert event for other services (e.g., notifications).

        Topic is severity-scoped: "alerts.warning" / "alerts.critical".
        """
        await self.publisher.publish(
            topic=f"alerts.{alert.rule.severity}",
            payload={
                "rule_name": alert.rule.name,
                "machine_id": alert.machine_id,
                "metric_type": alert.rule.metric_type,
                "value": alert.value,
                "threshold": alert.rule.threshold,
                "severity": alert.rule.severity,
                "triggered_at": alert.triggered_at.isoformat(),
            },
        )

    async def process_metrics(self, event_data: dict[str, Any]) -> None:
        """Process one incoming metrics event and handle any new alerts."""
        if not self.evaluator:
            return
        machine_id = event_data.get("machine_id", "unknown")
        metrics = event_data.get("metrics", {})
        alerts = self.evaluator.evaluate(machine_id, metrics)
        for alert in alerts:
            self.logger.warning(
                "alert_triggered",
                rule=alert.rule.name,
                machine_id=alert.machine_id,
                value=alert.value,
                threshold=alert.rule.threshold,
                severity=alert.rule.severity,
            )
            await self.store_alert(alert)
            await self.publish_alert(alert)

    async def run(self) -> None:
        """Main service loop: load rules, then consume metrics events."""
        self.running = True
        self.logger.info("alerts_service_starting")
        # Connect to database (optional; see connect_db)
        await self.connect_db()
        # Load rules
        rules = await self.load_rules()
        self.evaluator = AlertEvaluator(rules)
        self.logger.info("rules_loaded", count=len(rules))
        # Connect to event bus
        await self.subscriber.connect()
        await self.publisher.connect()
        self.logger.info("alerts_service_started")
        try:
            # Process events
            # NOTE(review): stop() only clears the flag, so this loop exits
            # after the *next* event arrives (or via task cancellation).
            async for event in self.subscriber.consume():
                if not self.running:
                    break
                try:
                    await self.process_metrics(event.payload)
                except Exception as e:
                    # One bad event must not kill the consumer loop.
                    self.logger.error("event_processing_error", error=str(e))
        except asyncio.CancelledError:
            self.logger.info("alerts_service_cancelled")
        finally:
            # Always release bus and database connections.
            await self.subscriber.disconnect()
            await self.publisher.disconnect()
            if self.db_pool:
                await self.db_pool.close()
            self.logger.info("alerts_service_stopped")

    def stop(self) -> None:
        """Signal the service to stop (takes effect on the next event)."""
        self.running = False
async def main():
    """Main entry point: run the alerts service until a shutdown signal."""
    service = AlertsService()
    # Handle shutdown signals. Use get_running_loop(): calling
    # asyncio.get_event_loop() from inside a coroutine is deprecated since
    # Python 3.10 and can return a loop other than the one running us.
    loop = asyncio.get_running_loop()

    def signal_handler():
        service.logger.info("shutdown_signal_received")
        service.stop()

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)
    await service.run()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -1,3 +1,5 @@
grpcio>=1.60.0
grpcio-tools>=1.60.0
redis>=5.0.0
asyncpg>=0.29.0
structlog>=23.2.0

View File

@@ -0,0 +1 @@
"""Collector service."""

209
services/collector/main.py Normal file
View File

@@ -0,0 +1,209 @@
"""Collector service - streams system metrics to the aggregator via gRPC."""
import asyncio
import signal
import sys
from pathlib import Path
import grpc
# Add project root to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from services.collector.metrics import MetricsCollector
from shared import metrics_pb2, metrics_pb2_grpc
from shared.config import get_collector_config
from shared.logging import setup_logging
class CollectorService:
    """Main collector service that streams metrics to the aggregator."""

    def __init__(self):
        self.config = get_collector_config()
        self.logger = setup_logging(
            service_name=self.config.service_name,
            log_level=self.config.log_level,
            log_format=self.config.log_format,
        )
        self.running = False
        # Populated by connect(); None while disconnected.
        self.channel: grpc.aio.Channel | None = None
        self.stub: metrics_pb2_grpc.MetricsServiceStub | None = None
        self.collector = MetricsCollector(
            machine_id=self.config.machine_id,
            collect_cpu=self.config.collect_cpu,
            collect_memory=self.config.collect_memory,
            collect_disk=self.config.collect_disk,
            collect_network=self.config.collect_network,
            collect_load=self.config.collect_load,
        )

    async def connect(self) -> None:
        """Establish connection to the aggregator.

        Raises:
            asyncio.TimeoutError: if the channel is not ready within 10s.
        """
        self.logger.info(
            "connecting_to_aggregator",
            aggregator_url=self.config.aggregator_url,
        )
        # Keepalives let us detect a dead connection even while idle.
        self.channel = grpc.aio.insecure_channel(
            self.config.aggregator_url,
            options=[
                ("grpc.keepalive_time_ms", 10000),
                ("grpc.keepalive_timeout_ms", 5000),
                ("grpc.keepalive_permit_without_calls", True),
            ],
        )
        self.stub = metrics_pb2_grpc.MetricsServiceStub(self.channel)
        # Wait for channel to be ready
        try:
            await asyncio.wait_for(
                self.channel.channel_ready(),
                timeout=10.0,
            )
            self.logger.info("connected_to_aggregator")
        except asyncio.TimeoutError:
            self.logger.error("connection_timeout")
            raise

    async def disconnect(self) -> None:
        """Close connection to the aggregator; safe when not connected."""
        if self.channel:
            await self.channel.close()
            self.channel = None
            self.stub = None
            self.logger.info("disconnected_from_aggregator")

    def _batch_to_proto(self, batch) -> list[metrics_pb2.Metric]:
        """Convert a MetricsBatch to protobuf messages (one per point)."""
        protos = []
        for metric in batch.metrics:
            proto = metrics_pb2.Metric(
                machine_id=batch.machine_id,
                hostname=batch.hostname,
                timestamp_ms=batch.timestamp_ms,
                # Metric-type name -> enum value; unknown names fall back to
                # 0 (the unspecified enum value).
                type=getattr(metrics_pb2, metric.metric_type, 0),
                value=metric.value,
                labels=metric.labels,
            )
            protos.append(proto)
        return protos

    async def _metric_generator(self):
        """Async generator yielding metrics at the configured interval.

        Terminates when self.running is cleared, which ends the client
        stream gracefully.
        """
        while self.running:
            batch = self.collector.collect()
            protos = self._batch_to_proto(batch)
            for proto in protos:
                yield proto
            self.logger.debug(
                "collected_metrics",
                count=len(protos),
                machine_id=batch.machine_id,
            )
            await asyncio.sleep(self.config.collection_interval)

    async def stream_metrics(self) -> None:
        """Stream metrics to the aggregator.

        Retries on RPC errors with exponential backoff (capped at 60s) and
        reconnects the channel between attempts.

        Raises:
            RuntimeError: if connect() was never called.
            grpc.aio.AioRpcError: re-raised after max_retries consecutive failures.
        """
        if not self.stub:
            raise RuntimeError("Not connected to aggregator")
        retry_count = 0
        max_retries = 10
        base_delay = 1.0
        while self.running:
            try:
                self.logger.info("starting_metric_stream")
                # Blocks until the generator ends, i.e. stop() was called.
                response = await self.stub.StreamMetrics(self._metric_generator())
                self.logger.info(
                    "stream_completed",
                    success=response.success,
                    metrics_received=response.metrics_received,
                    message=response.message,
                )
                # A clean completion resets the backoff.
                retry_count = 0
            except grpc.aio.AioRpcError as e:
                retry_count += 1
                delay = min(base_delay * (2**retry_count), 60.0)
                self.logger.warning(
                    "stream_error",
                    code=e.code().name,
                    details=e.details(),
                    retry_count=retry_count,
                    retry_delay=delay,
                )
                if retry_count >= max_retries:
                    self.logger.error("max_retries_exceeded")
                    raise
                await asyncio.sleep(delay)
                # Reconnect (best-effort; the next loop iteration retries)
                try:
                    await self.disconnect()
                    await self.connect()
                except Exception as conn_err:
                    self.logger.error("reconnect_failed", error=str(conn_err))
            except asyncio.CancelledError:
                self.logger.info("stream_cancelled")
                break

    async def run(self) -> None:
        """Main entry point for the collector service."""
        self.running = True
        self.logger.info(
            "collector_starting",
            machine_id=self.config.machine_id,
            interval=self.config.collection_interval,
        )
        # Initial CPU percent call to initialize (first call always returns 0)
        import psutil

        psutil.cpu_percent()
        await self.connect()
        try:
            await self.stream_metrics()
        finally:
            await self.disconnect()
            self.logger.info("collector_stopped")

    def stop(self) -> None:
        """Signal the collector to stop streaming."""
        self.running = False
async def main():
    """Main entry point: run the collector until a shutdown signal arrives."""
    service = CollectorService()
    # Handle shutdown signals. Use get_running_loop(): calling
    # asyncio.get_event_loop() from inside a coroutine is deprecated since
    # Python 3.10 and can return a loop other than the one running us.
    loop = asyncio.get_running_loop()

    def signal_handler():
        service.logger.info("shutdown_signal_received")
        service.stop()

    for sig in (signal.SIGTERM, signal.SIGINT):
        loop.add_signal_handler(sig, signal_handler)
    await service.run()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,233 @@
"""System metrics collection using psutil."""
import socket
import time
from dataclasses import dataclass, field
import psutil
@dataclass
class MetricPoint:
    """A single metric data point."""

    # Name of a metric-type enum value, e.g. "CPU_PERCENT".
    metric_type: str
    value: float
    # Optional dimension labels, e.g. {"core": "0"} or {"mount": "/"}.
    labels: dict[str, str] = field(default_factory=dict)


@dataclass
class MetricsBatch:
    """A batch of metrics from a single collection cycle."""

    machine_id: str
    hostname: str
    # Collection wall-clock time, in milliseconds since the Unix epoch.
    timestamp_ms: int
    metrics: list[MetricPoint]
class MetricsCollector:
"""Collects system metrics using psutil."""
def __init__(
self,
machine_id: str,
collect_cpu: bool = True,
collect_memory: bool = True,
collect_disk: bool = True,
collect_network: bool = True,
collect_load: bool = True,
):
self.machine_id = machine_id
self.hostname = socket.gethostname()
self.collect_cpu = collect_cpu
self.collect_memory = collect_memory
self.collect_disk = collect_disk
self.collect_network = collect_network
self.collect_load = collect_load
# Track previous network counters for rate calculation
self._prev_net_io: psutil._common.snetio | None = None
self._prev_net_time: float | None = None
def collect(self) -> MetricsBatch:
"""Collect all enabled metrics and return as a batch."""
metrics: list[MetricPoint] = []
if self.collect_cpu:
metrics.extend(self._collect_cpu())
if self.collect_memory:
metrics.extend(self._collect_memory())
if self.collect_disk:
metrics.extend(self._collect_disk())
if self.collect_network:
metrics.extend(self._collect_network())
if self.collect_load:
metrics.extend(self._collect_load())
return MetricsBatch(
machine_id=self.machine_id,
hostname=self.hostname,
timestamp_ms=int(time.time() * 1000),
metrics=metrics,
)
def _collect_cpu(self) -> list[MetricPoint]:
"""Collect CPU metrics."""
metrics = []
# Overall CPU percent
cpu_percent = psutil.cpu_percent(interval=None)
metrics.append(
MetricPoint(
metric_type="CPU_PERCENT",
value=cpu_percent,
)
)
# Per-core CPU percent
per_cpu = psutil.cpu_percent(interval=None, percpu=True)
for i, pct in enumerate(per_cpu):
metrics.append(
MetricPoint(
metric_type="CPU_PERCENT_PER_CORE",
value=pct,
labels={"core": str(i)},
)
)
return metrics
def _collect_memory(self) -> list[MetricPoint]:
"""Collect memory metrics."""
mem = psutil.virtual_memory()
return [
MetricPoint(metric_type="MEMORY_PERCENT", value=mem.percent),
MetricPoint(metric_type="MEMORY_USED_BYTES", value=float(mem.used)),
MetricPoint(
metric_type="MEMORY_AVAILABLE_BYTES", value=float(mem.available)
),
]
def _collect_disk(self) -> list[MetricPoint]:
"""Collect disk metrics."""
metrics = []
# Disk usage for root partition
try:
disk = psutil.disk_usage("/")
metrics.append(
MetricPoint(
metric_type="DISK_PERCENT",
value=disk.percent,
labels={"mount": "/"},
)
)
metrics.append(
MetricPoint(
metric_type="DISK_USED_BYTES",
value=float(disk.used),
labels={"mount": "/"},
)
)
except (PermissionError, FileNotFoundError):
pass
# Disk I/O rates
try:
io = psutil.disk_io_counters()
if io:
metrics.append(
MetricPoint(
metric_type="DISK_READ_BYTES_SEC",
value=float(
io.read_bytes
), # Will be converted to rate by aggregator
)
)
metrics.append(
MetricPoint(
metric_type="DISK_WRITE_BYTES_SEC",
value=float(io.write_bytes),
)
)
except (PermissionError, AttributeError):
pass
return metrics
def _collect_network(self) -> list[MetricPoint]:
"""Collect network metrics with rate calculation."""
metrics = []
try:
net_io = psutil.net_io_counters()
current_time = time.time()
if self._prev_net_io is not None and self._prev_net_time is not None:
time_delta = current_time - self._prev_net_time
if time_delta > 0:
bytes_sent_rate = (
net_io.bytes_sent - self._prev_net_io.bytes_sent
) / time_delta
bytes_recv_rate = (
net_io.bytes_recv - self._prev_net_io.bytes_recv
) / time_delta
metrics.append(
MetricPoint(
metric_type="NETWORK_SENT_BYTES_SEC",
value=bytes_sent_rate,
)
)
metrics.append(
MetricPoint(
metric_type="NETWORK_RECV_BYTES_SEC",
value=bytes_recv_rate,
)
)
self._prev_net_io = net_io
self._prev_net_time = current_time
# Connection count
connections = len(psutil.net_connections(kind="inet"))
metrics.append(
MetricPoint(
metric_type="NETWORK_CONNECTIONS",
value=float(connections),
)
)
except (PermissionError, psutil.AccessDenied):
pass
return metrics
def _collect_load(self) -> list[MetricPoint]:
    """Collect load averages (Unix only) plus the process count."""
    points: list[MetricPoint] = []
    try:
        one, five, fifteen = psutil.getloadavg()
    except (AttributeError, OSError):
        # getloadavg is unavailable on Windows; skip the load metrics.
        pass
    else:
        points.append(MetricPoint(metric_type="LOAD_AVG_1M", value=one))
        points.append(MetricPoint(metric_type="LOAD_AVG_5M", value=five))
        points.append(MetricPoint(metric_type="LOAD_AVG_15M", value=fifteen))
    # Process count is available on every platform psutil supports.
    points.append(
        MetricPoint(
            metric_type="PROCESS_COUNT",
            value=float(len(psutil.pids())),
        )
    )
    return points

View File

@@ -21,6 +21,8 @@ RUN python -m grpc_tools.protoc \
/app/proto/metrics.proto /app/proto/metrics.proto
COPY services/gateway /app/services/gateway
COPY services/aggregator/__init__.py /app/services/aggregator/__init__.py
COPY services/aggregator/storage.py /app/services/aggregator/storage.py
COPY web /app/web
ENV PYTHONPATH=/app

View File

@@ -0,0 +1 @@
"""Gateway service."""

393
services/gateway/main.py Normal file
View File

@@ -0,0 +1,393 @@
"""Gateway service - FastAPI with WebSocket for real-time dashboard."""
import asyncio
import json
import sys
from contextlib import asynccontextmanager
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any
import grpc
from fastapi import FastAPI, HTTPException, Query, WebSocket, WebSocketDisconnect
from fastapi.requests import Request
from fastapi.responses import HTMLResponse
from fastapi.staticfiles import StaticFiles
from fastapi.templating import Jinja2Templates
# Add project root to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
from services.aggregator.storage import TimescaleStorage
from shared import metrics_pb2, metrics_pb2_grpc
from shared.config import get_gateway_config
from shared.events import get_subscriber
from shared.logging import setup_logging
# Global state
config = get_gateway_config()
logger = setup_logging(
service_name=config.service_name,
log_level=config.log_level,
log_format=config.log_format,
)
# WebSocket connection manager
class ConnectionManager:
    """Tracks active WebSocket clients and fans broadcasts out to them."""

    def __init__(self):
        # Connections that have been accepted and not yet dropped.
        self.active_connections: list[WebSocket] = []

    async def connect(self, websocket: WebSocket) -> None:
        """Accept a new client and start tracking it."""
        await websocket.accept()
        self.active_connections.append(websocket)
        logger.info("websocket_connected", total=len(self.active_connections))

    def disconnect(self, websocket: WebSocket) -> None:
        """Stop tracking a client that has gone away."""
        self.active_connections.remove(websocket)
        logger.info("websocket_disconnected", total=len(self.active_connections))

    async def broadcast(self, message: dict) -> None:
        """JSON-encode *message* and send it to every connected client.

        Clients whose send fails are pruned from the pool.
        """
        if not self.active_connections:
            return
        payload = json.dumps(message)
        dead: list[WebSocket] = []
        for ws in self.active_connections:
            try:
                await ws.send_text(payload)
            except Exception:
                dead.append(ws)
        for ws in dead:
            try:
                self.active_connections.remove(ws)
            except ValueError:
                # Already removed by a concurrent disconnect.
                pass
manager = ConnectionManager()
timescale: TimescaleStorage | None = None
grpc_channel: grpc.aio.Channel | None = None
grpc_stub: metrics_pb2_grpc.MetricsServiceStub | None = None
async def event_listener():
    """Relay raw metric events from the event bus to WebSocket clients.

    Runs for the application's lifetime; individual broadcast failures are
    logged and do not stop consumption.
    """
    logger.info("event_listener_starting")
    async with get_subscriber(topics=["metrics.raw"]) as subscriber:
        async for event in subscriber.consume():
            try:
                await manager.broadcast(
                    {
                        "type": "metrics",
                        "data": event.payload,
                        "timestamp": event.timestamp.isoformat(),
                    }
                )
            except Exception as exc:
                logger.warning("broadcast_error", error=str(exc))
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan manager.

    Startup: connects TimescaleDB (optional — degrades to None on failure),
    opens the aggregator gRPC channel, and starts the event-listener task.
    Shutdown: cancels the listener, then closes the channel and the pool.
    """
    global timescale, grpc_channel, grpc_stub
    logger.info("gateway_starting", port=config.http_port)
    # Connect to TimescaleDB for historical queries; a failure here only
    # disables /api/metrics, it does not prevent startup.
    timescale = TimescaleStorage(config.timescale_url)
    try:
        await timescale.connect()
    except Exception as e:
        logger.warning("timescale_connection_failed", error=str(e))
        timescale = None
    # Connect to aggregator via gRPC (channel connects lazily on first RPC).
    grpc_channel = grpc.aio.insecure_channel(config.aggregator_url)
    grpc_stub = metrics_pb2_grpc.MetricsServiceStub(grpc_channel)
    # Start event listener in background for the app's whole lifetime.
    listener_task = asyncio.create_task(event_listener())
    logger.info("gateway_started")
    yield
    # Cleanup: stop the listener first so nothing broadcasts during teardown.
    listener_task.cancel()
    try:
        await listener_task
    except asyncio.CancelledError:
        pass
    if grpc_channel:
        await grpc_channel.close()
    if timescale:
        await timescale.disconnect()
    logger.info("gateway_stopped")
# Create FastAPI app
app = FastAPI(
title="System Monitor Gateway",
description="Real-time system monitoring dashboard",
version="0.1.0",
lifespan=lifespan,
)
# Mount static files
static_path = Path(__file__).parent.parent.parent / "web" / "static"
if static_path.exists():
app.mount("/static", StaticFiles(directory=str(static_path)), name="static")
# Templates
templates_path = Path(__file__).parent.parent.parent / "web" / "templates"
templates = (
Jinja2Templates(directory=str(templates_path)) if templates_path.exists() else None
)
# ============================================================================
# Health endpoints
# ============================================================================
@app.get("/health")
async def health_check():
    """Liveness probe: reports healthy whenever the process is running."""
    payload = {"status": "healthy", "service": "gateway"}
    return payload
@app.get("/ready")
async def readiness_check():
    """Readiness probe reporting per-dependency status.

    Returns a ``checks`` map with one entry per dependency so operators can
    see exactly which backend is unhealthy; the endpoint itself always
    responds 200.
    """
    checks = {"gateway": "ok"}
    # Aggregator: issue a cheap RPC to prove the gRPC channel actually works.
    if grpc_stub:
        try:
            await grpc_stub.GetAllStates(metrics_pb2.Empty(), timeout=2.0)
            checks["aggregator"] = "ok"
        except Exception as e:
            checks["aggregator"] = f"error: {str(e)}"
    else:
        # Fix: previously this case was silently omitted from the report,
        # unlike the TimescaleDB branch below.
        checks["aggregator"] = "not connected"
    # TimescaleDB: presence of the connection pool is the readiness signal.
    if timescale and timescale._pool:
        checks["timescaledb"] = "ok"
    else:
        checks["timescaledb"] = "not connected"
    return {"status": "ready", "checks": checks}
# ============================================================================
# REST API endpoints
# ============================================================================
@app.get("/api/machines")
async def get_machines():
    """Return the current state of all machines.

    Proxies the aggregator's GetAllStates RPC and flattens each machine's
    repeated metric messages into a ``{metric_name: value}`` map.

    Raises:
        HTTPException: 503 when the aggregator is not connected or the
            RPC fails.
    """
    if not grpc_stub:
        raise HTTPException(status_code=503, detail="Aggregator not connected")
    try:
        response = await grpc_stub.GetAllStates(metrics_pb2.Empty(), timeout=5.0)
    except grpc.aio.AioRpcError as e:
        # Chain the gRPC error so tracebacks show the root cause (B904).
        raise HTTPException(
            status_code=503, detail=f"Aggregator error: {e.details()}"
        ) from e
    machines = []
    for state in response.machines:
        metrics = {
            metrics_pb2.MetricType.Name(m.type): m.value
            for m in state.current_metrics
        }
        machines.append(
            {
                "machine_id": state.machine_id,
                "hostname": state.hostname,
                "last_seen_ms": state.last_seen_ms,
                "health": metrics_pb2.HealthStatus.Name(state.health),
                "metrics": metrics,
            }
        )
    return {"machines": machines}
@app.get("/api/machines/{machine_id}")
async def get_machine(machine_id: str):
    """Return the current state of a specific machine.

    Raises:
        HTTPException: 404 when the machine is unknown, 503 when the
            aggregator is not connected or the RPC fails.
    """
    if not grpc_stub:
        raise HTTPException(status_code=503, detail="Aggregator not connected")
    try:
        response = await grpc_stub.GetCurrentState(
            metrics_pb2.StateRequest(machine_id=machine_id),
            timeout=5.0,
        )
    except grpc.aio.AioRpcError as e:
        # Chain the gRPC error so tracebacks show the root cause (B904).
        if e.code() == grpc.StatusCode.NOT_FOUND:
            raise HTTPException(status_code=404, detail="Machine not found") from e
        raise HTTPException(
            status_code=503, detail=f"Aggregator error: {e.details()}"
        ) from e
    # The aggregator signals an unknown machine with an empty machine_id.
    if not response.machine_id:
        raise HTTPException(status_code=404, detail="Machine not found")
    metrics = {
        metrics_pb2.MetricType.Name(m.type): m.value
        for m in response.current_metrics
    }
    return {
        "machine_id": response.machine_id,
        "hostname": response.hostname,
        "last_seen_ms": response.last_seen_ms,
        "health": metrics_pb2.HealthStatus.Name(response.health),
        "metrics": metrics,
    }
@app.get("/api/metrics")
async def get_metrics(
    machine_id: str | None = Query(None),
    metric_type: str | None = Query(None),
    minutes: int = Query(60, ge=1, le=1440),
    limit: int = Query(1000, ge=1, le=10000),
):
    """Return historical metrics from TimescaleDB.

    Args:
        machine_id: Optional filter restricting results to one machine.
        metric_type: Optional filter restricting results to one metric.
        minutes: Look-back window ending now (1 minute to 24 hours).
        limit: Maximum number of rows returned.

    Raises:
        HTTPException: 503 when TimescaleDB is not connected, 500 when the
            query fails.
    """
    if not timescale:
        raise HTTPException(status_code=503, detail="TimescaleDB not connected")
    # NOTE(review): naive UTC timestamps — assumes the storage layer also
    # uses naive UTC. datetime.utcnow() is deprecated since 3.12; migrate to
    # datetime.now(timezone.utc) together with TimescaleStorage.
    end_time = datetime.utcnow()
    start_time = end_time - timedelta(minutes=minutes)
    try:
        metrics = await timescale.get_metrics(
            machine_id=machine_id,
            metric_type=metric_type,
            start_time=start_time,
            end_time=end_time,
            limit=limit,
        )
    except Exception as e:
        # Chain so the original failure appears in the traceback (B904).
        raise HTTPException(status_code=500, detail=str(e)) from e
    return {"metrics": metrics, "count": len(metrics)}
# ============================================================================
# WebSocket endpoint
# ============================================================================
@app.websocket("/ws")
async def websocket_endpoint(websocket: WebSocket):
    """WebSocket endpoint for real-time metric updates.

    On connect, pushes one "initial" message per known machine (best
    effort), then keeps the socket open receiving client frames; a "ping"
    text frame is answered with "pong". Broadcast updates arrive via the
    ConnectionManager from the background event listener.
    """
    await manager.connect(websocket)
    try:
        # Send initial state so the client need not wait for a broadcast.
        if grpc_stub:
            try:
                response = await grpc_stub.GetAllStates(
                    metrics_pb2.Empty(), timeout=5.0
                )
                for state in response.machines:
                    metrics = {}
                    for m in state.current_metrics:
                        metric_type = metrics_pb2.MetricType.Name(m.type)
                        metrics[metric_type] = m.value
                    await websocket.send_json(
                        {
                            "type": "initial",
                            "data": {
                                "machine_id": state.machine_id,
                                "hostname": state.hostname,
                                "metrics": metrics,
                            },
                        }
                    )
            except Exception as e:
                # Best effort: a failed snapshot must not kill the socket.
                logger.warning("initial_state_error", error=str(e))
        # Keep connection alive and handle incoming messages
        while True:
            try:
                data = await websocket.receive_text()
                # Handle ping/pong or commands from client
                if data == "ping":
                    await websocket.send_text("pong")
            except WebSocketDisconnect:
                break
    finally:
        # Always unregister, regardless of how the session ended.
        manager.disconnect(websocket)
# ============================================================================
# Dashboard (HTML)
# ============================================================================
@app.get("/", response_class=HTMLResponse)
async def dashboard(request: Request):
    """Serve the dashboard HTML.

    Renders the Jinja2 template when web/templates exists; otherwise
    returns a minimal built-in page listing the API endpoints with a
    live-metrics WebSocket viewer.
    """
    if templates:
        return templates.TemplateResponse("dashboard.html", {"request": request})
    # Fallback if templates not found
    return HTMLResponse("""
<!DOCTYPE html>
<html>
<head>
<title>System Monitor</title>
<style>
body { font-family: system-ui; background: #1a1a2e; color: #eee; padding: 2rem; }
h1 { color: #e94560; }
pre { background: #16213e; padding: 1rem; border-radius: 8px; overflow: auto; }
</style>
</head>
<body>
<h1>System Monitor</h1>
<p>Dashboard template not found. API endpoints:</p>
<ul>
<li><a href="/api/machines">/api/machines</a> - Current state of all machines</li>
<li><a href="/api/metrics">/api/metrics</a> - Historical metrics</li>
<li><a href="/docs">/docs</a> - API documentation</li>
</ul>
<h2>Live Metrics</h2>
<pre id="output">Connecting...</pre>
<script>
const ws = new WebSocket(`ws://${location.host}/ws`);
const output = document.getElementById('output');
ws.onmessage = (e) => {
output.textContent = JSON.stringify(JSON.parse(e.data), null, 2);
};
ws.onclose = () => { output.textContent = 'Disconnected'; };
</script>
</body>
</html>
""")
if __name__ == "__main__":
    # Direct-execution entry point: run a uvicorn server on the configured port.
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=config.http_port)

5
shared/__init__.py Normal file
View File

@@ -0,0 +1,5 @@
"""Shared utilities and generated protobuf modules."""
from . import metrics_pb2, metrics_pb2_grpc
__all__ = ["metrics_pb2", "metrics_pb2_grpc"]

104
shared/config.py Normal file
View File

@@ -0,0 +1,104 @@
"""Shared configuration management using Pydantic Settings."""
from functools import lru_cache
from pydantic_settings import BaseSettings, SettingsConfigDict
class BaseConfig(BaseSettings):
    """Base configuration shared across all services.

    Values are read from the environment and an optional ``.env`` file;
    unrecognized environment variables are ignored.
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # Service identification
    service_name: str = "unknown"
    machine_id: str = "local"

    # Logging
    log_level: str = "INFO"
    log_format: str = "json"  # "json" or "console"

    # Redis connection URL (also used by the event bus)
    redis_url: str = "redis://localhost:6379"

    # Event bus backend selector
    events_backend: str = "redis_pubsub"
class CollectorConfig(BaseConfig):
    """Collector service configuration."""

    service_name: str = "collector"

    # Aggregator gRPC endpoint (host:port)
    aggregator_url: str = "localhost:50051"

    # Seconds between collection passes
    collection_interval: int = 5  # seconds

    # Per-category toggles for which metric groups to collect
    collect_cpu: bool = True
    collect_memory: bool = True
    collect_disk: bool = True
    collect_network: bool = True
    collect_load: bool = True
class AggregatorConfig(BaseConfig):
    """Aggregator service configuration."""

    service_name: str = "aggregator"

    # Port the gRPC server listens on
    grpc_port: int = 50051

    # TimescaleDB DSN - can be set directly via TIMESCALE_URL
    timescale_url: str = "postgresql://monitor:monitor@localhost:5432/monitor"
class GatewayConfig(BaseConfig):
    """Gateway service configuration."""

    service_name: str = "gateway"

    # Port the HTTP/WebSocket server listens on
    http_port: int = 8000

    # Aggregator gRPC endpoint (host:port)
    aggregator_url: str = "localhost:50051"

    # TimescaleDB DSN - can be set directly via TIMESCALE_URL
    timescale_url: str = "postgresql://monitor:monitor@localhost:5432/monitor"
class AlertsConfig(BaseConfig):
    """Alerts service configuration."""

    service_name: str = "alerts"

    # TimescaleDB DSN - can be set directly via TIMESCALE_URL
    timescale_url: str = "postgresql://monitor:monitor@localhost:5432/monitor"
@lru_cache
def get_collector_config() -> CollectorConfig:
    """Return the process-wide CollectorConfig (constructed once, cached)."""
    return CollectorConfig()
@lru_cache
def get_aggregator_config() -> AggregatorConfig:
    """Return the process-wide AggregatorConfig (constructed once, cached)."""
    return AggregatorConfig()
@lru_cache
def get_gateway_config() -> GatewayConfig:
    """Return the process-wide GatewayConfig (constructed once, cached)."""
    return GatewayConfig()
@lru_cache
def get_alerts_config() -> AlertsConfig:
    """Return the process-wide AlertsConfig (constructed once, cached)."""
    return AlertsConfig()

74
shared/logging.py Normal file
View File

@@ -0,0 +1,74 @@
"""Structured logging configuration."""
import logging
import sys
from typing import Any
import structlog
def setup_logging(
    service_name: str,
    log_level: str = "INFO",
    log_format: str = "json",
) -> structlog.BoundLogger:
    """Configure structlog plus stdlib logging for a service.

    Args:
        service_name: Bound to every log line as the ``service`` key.
        log_level: Minimum level name (DEBUG, INFO, WARNING, ERROR).
        log_format: ``"json"`` for machine-readable output; any other
            value selects a colored console renderer.

    Returns:
        A structlog logger bound with the service name.
    """
    level = getattr(logging, log_level.upper(), logging.INFO)

    # Processors common to both output formats.
    common: list[Any] = [
        structlog.contextvars.merge_contextvars,
        structlog.processors.add_log_level,
        structlog.processors.TimeStamper(fmt="iso"),
        structlog.processors.StackInfoRenderer(),
    ]
    if log_format == "json":
        # Production: structured JSON lines.
        tail: list[Any] = [
            structlog.processors.format_exc_info,
            structlog.processors.JSONRenderer(),
        ]
    else:
        # Development: human-friendly colored output.
        tail = [structlog.dev.ConsoleRenderer(colors=True)]

    structlog.configure(
        processors=common + tail,
        wrapper_class=structlog.make_filtering_bound_logger(level),
        context_class=dict,
        logger_factory=structlog.PrintLoggerFactory(),
        cache_logger_on_first_use=True,
    )
    # Route stdlib logging to stdout at the same threshold.
    logging.basicConfig(
        format="%(message)s",
        stream=sys.stdout,
        level=level,
    )
    return structlog.get_logger(service=service_name)
def get_logger(name: str | None = None) -> structlog.BoundLogger:
    """Return a structlog logger, bound with a ``component`` key when named."""
    if not name:
        return structlog.get_logger()
    return structlog.get_logger(component=name)

93
shared/metrics_pb2.py Normal file
View File

@@ -0,0 +1,93 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# NO CHECKED-IN PROTOBUF GENCODE
# source: metrics.proto
# Protobuf Python Version: 6.31.1
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import descriptor_pool as _descriptor_pool
from google.protobuf import runtime_version as _runtime_version
from google.protobuf import symbol_database as _symbol_database
from google.protobuf.internal import builder as _builder
_runtime_version.ValidateProtobufRuntimeVersion(
_runtime_version.Domain.PUBLIC,
6,
31,
1,
'',
'metrics.proto'
)
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(b'\n\rmetrics.proto\x12\nmonitoring\"\x07\n\x05\x45mpty\"\xd8\x01\n\x06Metric\x12\x12\n\nmachine_id\x18\x01 \x01(\t\x12\x10\n\x08hostname\x18\x02 \x01(\t\x12\x14\n\x0ctimestamp_ms\x18\x03 \x01(\x03\x12$\n\x04type\x18\x04 \x01(\x0e\x32\x16.monitoring.MetricType\x12\r\n\x05value\x18\x05 \x01(\x01\x12.\n\x06labels\x18\x06 \x03(\x0b\x32\x1e.monitoring.Metric.LabelsEntry\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"s\n\x0bMetricBatch\x12\x12\n\nmachine_id\x18\x01 \x01(\t\x12\x10\n\x08hostname\x18\x02 \x01(\t\x12\x14\n\x0ctimestamp_ms\x18\x03 \x01(\x03\x12(\n\x07metrics\x18\x04 \x03(\x0b\x32\x17.monitoring.MetricPoint\"\xa6\x01\n\x0bMetricPoint\x12$\n\x04type\x18\x01 \x01(\x0e\x32\x16.monitoring.MetricType\x12\r\n\x05value\x18\x02 \x01(\x01\x12\x33\n\x06labels\x18\x03 \x03(\x0b\x32#.monitoring.MetricPoint.LabelsEntry\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"G\n\tStreamAck\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x18\n\x10metrics_received\x18\x02 \x01(\x03\x12\x0f\n\x07message\x18\x03 \x01(\t\"\"\n\x0cStateRequest\x12\x12\n\nmachine_id\x18\x01 \x01(\t\"\x8c\x02\n\x0cMachineState\x12\x12\n\nmachine_id\x18\x01 \x01(\t\x12\x10\n\x08hostname\x18\x02 \x01(\t\x12\x14\n\x0clast_seen_ms\x18\x03 \x01(\x03\x12+\n\x0f\x63urrent_metrics\x18\x04 \x03(\x0b\x32\x12.monitoring.Metric\x12(\n\x06health\x18\x05 \x01(\x0e\x32\x18.monitoring.HealthStatus\x12\x38\n\x08metadata\x18\x06 \x03(\x0b\x32&.monitoring.MachineState.MetadataEntry\x1a/\n\rMetadataEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\">\n\x10\x41llMachinesState\x12*\n\x08machines\x18\x01 \x03(\x0b\x32\x18.monitoring.MachineState\"\xd7\x01\n\x0e\x43ontrolCommand\x12\x12\n\ncommand_id\x18\x01 \x01(\t\x12<\n\x0fupdate_interval\x18\x02 \x01(\x0b\x32!.monitoring.UpdateIntervalCommandH\x00\x12\x37\n\x07restart\x18\x03 
\x01(\x0b\x32$.monitoring.RestartCollectionCommandH\x00\x12/\n\x08shutdown\x18\x04 \x01(\x0b\x32\x1b.monitoring.ShutdownCommandH\x00\x42\t\n\x07\x63ommand\"1\n\x15UpdateIntervalCommand\x12\x18\n\x10interval_seconds\x18\x01 \x01(\x05\"\x1a\n\x18RestartCollectionCommand\"#\n\x0fShutdownCommand\x12\x10\n\x08graceful\x18\x01 \x01(\x08\"G\n\x0f\x43ontrolResponse\x12\x12\n\ncommand_id\x18\x01 \x01(\t\x12\x0f\n\x07success\x18\x02 \x01(\x08\x12\x0f\n\x07message\x18\x03 \x01(\t\"#\n\rConfigRequest\x12\x12\n\nmachine_id\x18\x01 \x01(\t\"\x80\x02\n\x0f\x43ollectorConfig\x12#\n\x1b\x63ollection_interval_seconds\x18\x01 \x01(\x05\x12/\n\x0f\x65nabled_metrics\x18\x02 \x03(\x0e\x32\x16.monitoring.MetricType\x12\x37\n\x06labels\x18\x03 \x03(\x0b\x32\'.monitoring.CollectorConfig.LabelsEntry\x12/\n\nthresholds\x18\x04 \x03(\x0b\x32\x1b.monitoring.ThresholdConfig\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\"u\n\x0fThresholdConfig\x12+\n\x0bmetric_type\x18\x01 \x01(\x0e\x32\x16.monitoring.MetricType\x12\x19\n\x11warning_threshold\x18\x02 \x01(\x01\x12\x1a\n\x12\x63ritical_threshold\x18\x03 
\x01(\x01*\x8d\x03\n\nMetricType\x12\x1b\n\x17METRIC_TYPE_UNSPECIFIED\x10\x00\x12\x0f\n\x0b\x43PU_PERCENT\x10\x01\x12\x18\n\x14\x43PU_PERCENT_PER_CORE\x10\x02\x12\x12\n\x0eMEMORY_PERCENT\x10\x03\x12\x15\n\x11MEMORY_USED_BYTES\x10\x04\x12\x1a\n\x16MEMORY_AVAILABLE_BYTES\x10\x05\x12\x10\n\x0c\x44ISK_PERCENT\x10\x06\x12\x13\n\x0f\x44ISK_USED_BYTES\x10\x07\x12\x17\n\x13\x44ISK_READ_BYTES_SEC\x10\x08\x12\x18\n\x14\x44ISK_WRITE_BYTES_SEC\x10\t\x12\x1a\n\x16NETWORK_SENT_BYTES_SEC\x10\n\x12\x1a\n\x16NETWORK_RECV_BYTES_SEC\x10\x0b\x12\x17\n\x13NETWORK_CONNECTIONS\x10\x0c\x12\x11\n\rPROCESS_COUNT\x10\r\x12\x0f\n\x0bLOAD_AVG_1M\x10\x0e\x12\x0f\n\x0bLOAD_AVG_5M\x10\x0f\x12\x10\n\x0cLOAD_AVG_15M\x10\x10*o\n\x0cHealthStatus\x12\x1d\n\x19HEALTH_STATUS_UNSPECIFIED\x10\x00\x12\x0b\n\x07HEALTHY\x10\x01\x12\x0b\n\x07WARNING\x10\x02\x12\x0c\n\x08\x43RITICAL\x10\x03\x12\x0b\n\x07UNKNOWN\x10\x04\x12\x0b\n\x07OFFLINE\x10\x05\x32\xdc\x01\n\x0eMetricsService\x12>\n\rStreamMetrics\x12\x12.monitoring.Metric\x1a\x15.monitoring.StreamAck\"\x00(\x01\x12G\n\x0fGetCurrentState\x12\x18.monitoring.StateRequest\x1a\x18.monitoring.MachineState\"\x00\x12\x41\n\x0cGetAllStates\x12\x11.monitoring.Empty\x1a\x1c.monitoring.AllMachinesState\"\x00\x32Z\n\x0e\x43ontrolService\x12H\n\x07\x43ontrol\x12\x1a.monitoring.ControlCommand\x1a\x1b.monitoring.ControlResponse\"\x00(\x01\x30\x01\x32\xa1\x01\n\rConfigService\x12\x45\n\tGetConfig\x12\x19.monitoring.ConfigRequest\x1a\x1b.monitoring.CollectorConfig\"\x00\x12I\n\x0bWatchConfig\x12\x19.monitoring.ConfigRequest\x1a\x1b.monitoring.CollectorConfig\"\x00\x30\x01\x42%Z#github.com/your-org/sysmonstm/protob\x06proto3')
_globals = globals()
_builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
_builder.BuildTopDescriptorsAndMessages(DESCRIPTOR, 'metrics_pb2', _globals)
if not _descriptor._USE_C_DESCRIPTORS:
_globals['DESCRIPTOR']._loaded_options = None
_globals['DESCRIPTOR']._serialized_options = b'Z#github.com/your-org/sysmonstm/proto'
_globals['_METRIC_LABELSENTRY']._loaded_options = None
_globals['_METRIC_LABELSENTRY']._serialized_options = b'8\001'
_globals['_METRICPOINT_LABELSENTRY']._loaded_options = None
_globals['_METRICPOINT_LABELSENTRY']._serialized_options = b'8\001'
_globals['_MACHINESTATE_METADATAENTRY']._loaded_options = None
_globals['_MACHINESTATE_METADATAENTRY']._serialized_options = b'8\001'
_globals['_COLLECTORCONFIG_LABELSENTRY']._loaded_options = None
_globals['_COLLECTORCONFIG_LABELSENTRY']._serialized_options = b'8\001'
_globals['_METRICTYPE']._serialized_start=1810
_globals['_METRICTYPE']._serialized_end=2207
_globals['_HEALTHSTATUS']._serialized_start=2209
_globals['_HEALTHSTATUS']._serialized_end=2320
_globals['_EMPTY']._serialized_start=29
_globals['_EMPTY']._serialized_end=36
_globals['_METRIC']._serialized_start=39
_globals['_METRIC']._serialized_end=255
_globals['_METRIC_LABELSENTRY']._serialized_start=210
_globals['_METRIC_LABELSENTRY']._serialized_end=255
_globals['_METRICBATCH']._serialized_start=257
_globals['_METRICBATCH']._serialized_end=372
_globals['_METRICPOINT']._serialized_start=375
_globals['_METRICPOINT']._serialized_end=541
_globals['_METRICPOINT_LABELSENTRY']._serialized_start=210
_globals['_METRICPOINT_LABELSENTRY']._serialized_end=255
_globals['_STREAMACK']._serialized_start=543
_globals['_STREAMACK']._serialized_end=614
_globals['_STATEREQUEST']._serialized_start=616
_globals['_STATEREQUEST']._serialized_end=650
_globals['_MACHINESTATE']._serialized_start=653
_globals['_MACHINESTATE']._serialized_end=921
_globals['_MACHINESTATE_METADATAENTRY']._serialized_start=874
_globals['_MACHINESTATE_METADATAENTRY']._serialized_end=921
_globals['_ALLMACHINESSTATE']._serialized_start=923
_globals['_ALLMACHINESSTATE']._serialized_end=985
_globals['_CONTROLCOMMAND']._serialized_start=988
_globals['_CONTROLCOMMAND']._serialized_end=1203
_globals['_UPDATEINTERVALCOMMAND']._serialized_start=1205
_globals['_UPDATEINTERVALCOMMAND']._serialized_end=1254
_globals['_RESTARTCOLLECTIONCOMMAND']._serialized_start=1256
_globals['_RESTARTCOLLECTIONCOMMAND']._serialized_end=1282
_globals['_SHUTDOWNCOMMAND']._serialized_start=1284
_globals['_SHUTDOWNCOMMAND']._serialized_end=1319
_globals['_CONTROLRESPONSE']._serialized_start=1321
_globals['_CONTROLRESPONSE']._serialized_end=1392
_globals['_CONFIGREQUEST']._serialized_start=1394
_globals['_CONFIGREQUEST']._serialized_end=1429
_globals['_COLLECTORCONFIG']._serialized_start=1432
_globals['_COLLECTORCONFIG']._serialized_end=1688
_globals['_COLLECTORCONFIG_LABELSENTRY']._serialized_start=210
_globals['_COLLECTORCONFIG_LABELSENTRY']._serialized_end=255
_globals['_THRESHOLDCONFIG']._serialized_start=1690
_globals['_THRESHOLDCONFIG']._serialized_end=1807
_globals['_METRICSSERVICE']._serialized_start=2323
_globals['_METRICSSERVICE']._serialized_end=2543
_globals['_CONTROLSERVICE']._serialized_start=2545
_globals['_CONTROLSERVICE']._serialized_end=2635
_globals['_CONFIGSERVICE']._serialized_start=2638
_globals['_CONFIGSERVICE']._serialized_end=2799
# @@protoc_insertion_point(module_scope)

385
shared/metrics_pb2_grpc.py Normal file
View File

@@ -0,0 +1,385 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import warnings
from shared import metrics_pb2 as metrics__pb2
GRPC_GENERATED_VERSION = '1.76.0'
GRPC_VERSION = grpc.__version__
_version_not_supported = False
try:
from grpc._utilities import first_version_is_lower
_version_not_supported = first_version_is_lower(GRPC_VERSION, GRPC_GENERATED_VERSION)
except ImportError:
_version_not_supported = True
if _version_not_supported:
raise RuntimeError(
f'The grpc package installed is at version {GRPC_VERSION},'
+ ' but the generated code in metrics_pb2_grpc.py depends on'
+ f' grpcio>={GRPC_GENERATED_VERSION}.'
+ f' Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}'
+ f' or downgrade your generated code using grpcio-tools<={GRPC_VERSION}.'
)
class MetricsServiceStub(object):
"""MetricsService handles streaming metrics from collectors to aggregator
"""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.StreamMetrics = channel.stream_unary(
'/monitoring.MetricsService/StreamMetrics',
request_serializer=metrics__pb2.Metric.SerializeToString,
response_deserializer=metrics__pb2.StreamAck.FromString,
_registered_method=True)
self.GetCurrentState = channel.unary_unary(
'/monitoring.MetricsService/GetCurrentState',
request_serializer=metrics__pb2.StateRequest.SerializeToString,
response_deserializer=metrics__pb2.MachineState.FromString,
_registered_method=True)
self.GetAllStates = channel.unary_unary(
'/monitoring.MetricsService/GetAllStates',
request_serializer=metrics__pb2.Empty.SerializeToString,
response_deserializer=metrics__pb2.AllMachinesState.FromString,
_registered_method=True)
class MetricsServiceServicer(object):
"""MetricsService handles streaming metrics from collectors to aggregator
"""
def StreamMetrics(self, request_iterator, context):
"""Client-side streaming: collector streams metrics to aggregator
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def GetCurrentState(self, request, context):
"""Get current state of a machine
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def GetAllStates(self, request, context):
"""Get current state of all machines
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_MetricsServiceServicer_to_server(servicer, server):
rpc_method_handlers = {
'StreamMetrics': grpc.stream_unary_rpc_method_handler(
servicer.StreamMetrics,
request_deserializer=metrics__pb2.Metric.FromString,
response_serializer=metrics__pb2.StreamAck.SerializeToString,
),
'GetCurrentState': grpc.unary_unary_rpc_method_handler(
servicer.GetCurrentState,
request_deserializer=metrics__pb2.StateRequest.FromString,
response_serializer=metrics__pb2.MachineState.SerializeToString,
),
'GetAllStates': grpc.unary_unary_rpc_method_handler(
servicer.GetAllStates,
request_deserializer=metrics__pb2.Empty.FromString,
response_serializer=metrics__pb2.AllMachinesState.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'monitoring.MetricsService', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
server.add_registered_method_handlers('monitoring.MetricsService', rpc_method_handlers)
# This class is part of an EXPERIMENTAL API.
class MetricsService(object):
"""MetricsService handles streaming metrics from collectors to aggregator
"""
@staticmethod
def StreamMetrics(request_iterator,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.stream_unary(
request_iterator,
target,
'/monitoring.MetricsService/StreamMetrics',
metrics__pb2.Metric.SerializeToString,
metrics__pb2.StreamAck.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
_registered_method=True)
@staticmethod
def GetCurrentState(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(
request,
target,
'/monitoring.MetricsService/GetCurrentState',
metrics__pb2.StateRequest.SerializeToString,
metrics__pb2.MachineState.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
_registered_method=True)
@staticmethod
def GetAllStates(request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.unary_unary(
request,
target,
'/monitoring.MetricsService/GetAllStates',
metrics__pb2.Empty.SerializeToString,
metrics__pb2.AllMachinesState.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
_registered_method=True)
class ControlServiceStub(object):
"""ControlService handles bidirectional control commands
"""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.Control = channel.stream_stream(
'/monitoring.ControlService/Control',
request_serializer=metrics__pb2.ControlCommand.SerializeToString,
response_deserializer=metrics__pb2.ControlResponse.FromString,
_registered_method=True)
class ControlServiceServicer(object):
"""ControlService handles bidirectional control commands
"""
def Control(self, request_iterator, context):
"""Bidirectional streaming for commands and responses
"""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
def add_ControlServiceServicer_to_server(servicer, server):
rpc_method_handlers = {
'Control': grpc.stream_stream_rpc_method_handler(
servicer.Control,
request_deserializer=metrics__pb2.ControlCommand.FromString,
response_serializer=metrics__pb2.ControlResponse.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'monitoring.ControlService', rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
server.add_registered_method_handlers('monitoring.ControlService', rpc_method_handlers)
# This class is part of an EXPERIMENTAL API.
class ControlService(object):
"""ControlService handles bidirectional control commands
"""
@staticmethod
def Control(request_iterator,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.stream_stream(
request_iterator,
target,
'/monitoring.ControlService/Control',
metrics__pb2.ControlCommand.SerializeToString,
metrics__pb2.ControlResponse.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
_registered_method=True)
class ConfigServiceStub(object):
"""ConfigService handles dynamic configuration
"""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.GetConfig = channel.unary_unary(
'/monitoring.ConfigService/GetConfig',
request_serializer=metrics__pb2.ConfigRequest.SerializeToString,
response_deserializer=metrics__pb2.CollectorConfig.FromString,
_registered_method=True)
self.WatchConfig = channel.unary_stream(
'/monitoring.ConfigService/WatchConfig',
request_serializer=metrics__pb2.ConfigRequest.SerializeToString,
response_deserializer=metrics__pb2.CollectorConfig.FromString,
_registered_method=True)
class ConfigServiceServicer(object):
    """ConfigService handles dynamic configuration

    Server-side base class; concrete servicers override these methods.
    """
    def GetConfig(self, request, context):
        """Get current configuration for a collector
        """
        # Default stub behavior: report UNIMPLEMENTED to the caller.
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
    def WatchConfig(self, request, context):
        """Stream configuration updates
        """
        context.set_code(grpc.StatusCode.UNIMPLEMENTED)
        context.set_details('Method not implemented!')
        raise NotImplementedError('Method not implemented!')
def add_ConfigServiceServicer_to_server(servicer, server):
    """Register a ConfigServiceServicer implementation with a grpc.Server.

    NOTE(review): this appears to be protoc-generated code (see
    scripts/generate-proto.sh); prefer regenerating over hand-editing.
    """
    rpc_method_handlers = {
        # Unary RPC: single ConfigRequest in, single CollectorConfig out.
        'GetConfig': grpc.unary_unary_rpc_method_handler(
            servicer.GetConfig,
            request_deserializer=metrics__pb2.ConfigRequest.FromString,
            response_serializer=metrics__pb2.CollectorConfig.SerializeToString,
        ),
        # Server-streaming RPC: one request, a stream of CollectorConfig updates.
        'WatchConfig': grpc.unary_stream_rpc_method_handler(
            servicer.WatchConfig,
            request_deserializer=metrics__pb2.ConfigRequest.FromString,
            response_serializer=metrics__pb2.CollectorConfig.SerializeToString,
        ),
    }
    generic_handler = grpc.method_handlers_generic_handler(
        'monitoring.ConfigService', rpc_method_handlers)
    server.add_generic_rpc_handlers((generic_handler,))
    # Also register under the fully-qualified service name so clients using
    # _registered_method=True can take the registered-method fast path.
    server.add_registered_method_handlers('monitoring.ConfigService', rpc_method_handlers)
# This class is part of an EXPERIMENTAL API.
class ConfigService(object):
    """ConfigService handles dynamic configuration

    Static convenience wrapper around grpc.experimental for one-off calls
    without explicitly creating a channel and stub.
    """
    @staticmethod
    def GetConfig(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        # Unary call: returns a single CollectorConfig for the given request.
        return grpc.experimental.unary_unary(
            request,
            target,
            '/monitoring.ConfigService/GetConfig',
            metrics__pb2.ConfigRequest.SerializeToString,
            metrics__pb2.CollectorConfig.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)
    @staticmethod
    def WatchConfig(request,
            target,
            options=(),
            channel_credentials=None,
            call_credentials=None,
            insecure=False,
            compression=None,
            wait_for_ready=None,
            timeout=None,
            metadata=None):
        # Server-streaming call: yields CollectorConfig updates as they occur.
        return grpc.experimental.unary_stream(
            request,
            target,
            '/monitoring.ConfigService/WatchConfig',
            metrics__pb2.ConfigRequest.SerializeToString,
            metrics__pb2.CollectorConfig.FromString,
            options,
            channel_credentials,
            insecure,
            call_credentials,
            compression,
            wait_for_ready,
            timeout,
            metadata,
            _registered_method=True)

0
web/static/.gitkeep Normal file
View File

View File

@@ -0,0 +1,358 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>System Monitor Dashboard</title>
    <!-- Self-contained dashboard page: all styles inline, no external assets. -->
    <style>
        /* Dark theme palette shared across the page. */
        :root {
            --bg-primary: #1a1a2e;
            --bg-secondary: #16213e;
            --bg-card: #0f3460;
            --text-primary: #eee;
            --text-secondary: #a0a0a0;
            --accent: #e94560;
            --success: #4ade80;
            --warning: #fbbf24;
            --danger: #ef4444;
            --border: #2a2a4a;
        }
        * { box-sizing: border-box; margin: 0; padding: 0; }
        body {
            font-family: system-ui, -apple-system, sans-serif;
            background: var(--bg-primary);
            color: var(--text-primary);
            min-height: 100vh;
        }
        header {
            background: var(--bg-secondary);
            padding: 1rem 2rem;
            border-bottom: 2px solid var(--accent);
            display: flex;
            justify-content: space-between;
            align-items: center;
        }
        header h1 { font-size: 1.5rem; }
        /* WebSocket connection indicator: red dot by default, green when
           the 'connected' class is set from the JS handlers. */
        .status {
            display: flex;
            align-items: center;
            gap: 0.5rem;
            font-size: 0.875rem;
        }
        .status-dot {
            width: 10px;
            height: 10px;
            border-radius: 50%;
            background: var(--danger);
        }
        .status-dot.connected { background: var(--success); }
        main {
            padding: 1.5rem;
            max-width: 1600px;
            margin: 0 auto;
        }
        /* Responsive card grid: as many ~400px columns as fit. */
        .machines-grid {
            display: grid;
            grid-template-columns: repeat(auto-fill, minmax(400px, 1fr));
            gap: 1.5rem;
        }
        .machine-card {
            background: var(--bg-secondary);
            border-radius: 8px;
            padding: 1.25rem;
            border: 1px solid var(--border);
        }
        .machine-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 1rem;
            padding-bottom: 0.75rem;
            border-bottom: 1px solid var(--border);
        }
        .machine-name {
            font-weight: 600;
            color: var(--accent);
        }
        .machine-id {
            font-size: 0.75rem;
            color: var(--text-secondary);
        }
        /* Health badge; the warning/critical modifiers mirror the class
           names produced by getStatusClass() in the script below. */
        .machine-status {
            font-size: 0.75rem;
            padding: 0.25rem 0.5rem;
            border-radius: 4px;
            background: var(--success);
            color: #000;
        }
        .machine-status.warning { background: var(--warning); }
        .machine-status.critical { background: var(--danger); color: #fff; }
        .machine-status.offline { background: var(--text-secondary); }
        .metrics-grid {
            display: grid;
            grid-template-columns: repeat(2, 1fr);
            gap: 0.75rem;
        }
        .metric {
            background: var(--bg-card);
            padding: 0.75rem;
            border-radius: 6px;
        }
        .metric-label {
            font-size: 0.75rem;
            color: var(--text-secondary);
            margin-bottom: 0.25rem;
        }
        .metric-value {
            font-size: 1.5rem;
            font-weight: 600;
        }
        /* Thin usage bar rendered under each percentage metric. */
        .metric-bar {
            height: 4px;
            background: var(--border);
            border-radius: 2px;
            margin-top: 0.5rem;
            overflow: hidden;
        }
        .metric-bar-fill {
            height: 100%;
            background: var(--success);
            transition: width 0.3s ease;
        }
        .metric-bar-fill.warning { background: var(--warning); }
        .metric-bar-fill.critical { background: var(--danger); }
        .last-seen {
            font-size: 0.75rem;
            color: var(--text-secondary);
            margin-top: 1rem;
            text-align: right;
        }
        /* Placeholder shown until the first metrics message arrives. */
        .no-machines {
            text-align: center;
            padding: 3rem;
            color: var(--text-secondary);
        }
        .no-machines h2 {
            color: var(--text-primary);
            margin-bottom: 0.5rem;
        }
        /* Collapse both grids to a single column on narrow screens. */
        @media (max-width: 600px) {
            .machines-grid {
                grid-template-columns: 1fr;
            }
            .metrics-grid {
                grid-template-columns: 1fr;
            }
        }
    </style>
</head>
<body>
    <header>
        <h1>System Monitor</h1>
        <!-- Connection state; updated by the WebSocket handlers below. -->
        <div class="status">
            <span class="status-dot" id="status-dot"></span>
            <span id="status-text">Connecting...</span>
        </div>
    </header>
    <main>
        <!-- Populated by updateUI(); starts with an empty-state placeholder. -->
        <div class="machines-grid" id="machines-grid">
            <div class="no-machines">
                <h2>No machines connected</h2>
                <p>Waiting for collectors to send metrics...</p>
            </div>
        </div>
    </main>
    <script>
        // Cached DOM handles plus the live client-side state:
        // machines maps machine_id -> latest metrics payload for that machine.
        const machinesGrid = document.getElementById('machines-grid');
        const statusDot = document.getElementById('status-dot');
        const statusText = document.getElementById('status-text');
        const machines = new Map();
function formatBytes(bytes) {
if (bytes === 0) return '0 B';
const k = 1024;
const sizes = ['B', 'KB', 'MB', 'GB', 'TB'];
const i = Math.floor(Math.log(bytes) / Math.log(k));
return parseFloat((bytes / Math.pow(k, i)).toFixed(1)) + ' ' + sizes[i];
}
function formatRate(bytesPerSec) {
return formatBytes(bytesPerSec) + '/s';
}
function getBarClass(value, warning = 80, critical = 95) {
if (value >= critical) return 'critical';
if (value >= warning) return 'warning';
return '';
}
function getStatusClass(metrics) {
const cpu = metrics.CPU_PERCENT || 0;
const mem = metrics.MEMORY_PERCENT || 0;
const disk = metrics.DISK_PERCENT || 0;
if (cpu > 95 || mem > 95 || disk > 90) return 'critical';
if (cpu > 80 || mem > 85 || disk > 80) return 'warning';
return '';
}
function timeSince(timestampMs) {
const seconds = Math.floor((Date.now() - timestampMs) / 1000);
if (seconds < 5) return 'just now';
if (seconds < 60) return `${seconds}s ago`;
const minutes = Math.floor(seconds / 60);
if (minutes < 60) return `${minutes}m ago`;
return `${Math.floor(minutes / 60)}h ago`;
}
function renderMachine(data) {
const m = data.metrics || {};
const statusClass = getStatusClass(m);
return `
<div class="machine-card" data-machine="${data.machine_id}">
<div class="machine-header">
<div>
<div class="machine-name">${data.hostname || data.machine_id}</div>
<div class="machine-id">${data.machine_id}</div>
</div>
<span class="machine-status ${statusClass}">${statusClass || 'healthy'}</span>
</div>
<div class="metrics-grid">
<div class="metric">
<div class="metric-label">CPU</div>
<div class="metric-value">${(m.CPU_PERCENT || 0).toFixed(1)}%</div>
<div class="metric-bar">
<div class="metric-bar-fill ${getBarClass(m.CPU_PERCENT || 0)}"
style="width: ${m.CPU_PERCENT || 0}%"></div>
</div>
</div>
<div class="metric">
<div class="metric-label">Memory</div>
<div class="metric-value">${(m.MEMORY_PERCENT || 0).toFixed(1)}%</div>
<div class="metric-bar">
<div class="metric-bar-fill ${getBarClass(m.MEMORY_PERCENT || 0, 85, 95)}"
style="width: ${m.MEMORY_PERCENT || 0}%"></div>
</div>
</div>
<div class="metric">
<div class="metric-label">Disk</div>
<div class="metric-value">${(m.DISK_PERCENT || 0).toFixed(1)}%</div>
<div class="metric-bar">
<div class="metric-bar-fill ${getBarClass(m.DISK_PERCENT || 0, 80, 90)}"
style="width: ${m.DISK_PERCENT || 0}%"></div>
</div>
</div>
<div class="metric">
<div class="metric-label">Load (1m)</div>
<div class="metric-value">${(m.LOAD_AVG_1M || 0).toFixed(2)}</div>
</div>
<div class="metric">
<div class="metric-label">Network In</div>
<div class="metric-value">${formatRate(m.NETWORK_RECV_BYTES_SEC || 0)}</div>
</div>
<div class="metric">
<div class="metric-label">Network Out</div>
<div class="metric-value">${formatRate(m.NETWORK_SENT_BYTES_SEC || 0)}</div>
</div>
</div>
<div class="last-seen">Last seen: ${timeSince(data.timestamp_ms || Date.now())}</div>
</div>
`;
}
function updateUI() {
if (machines.size === 0) {
machinesGrid.innerHTML = `
<div class="no-machines">
<h2>No machines connected</h2>
<p>Waiting for collectors to send metrics...</p>
</div>
`;
return;
}
machinesGrid.innerHTML = Array.from(machines.values())
.map(renderMachine)
.join('');
}
function connect() {
const ws = new WebSocket(`ws://${location.host}/ws`);
ws.onopen = () => {
statusDot.classList.add('connected');
statusText.textContent = 'Connected';
};
ws.onclose = () => {
statusDot.classList.remove('connected');
statusText.textContent = 'Disconnected - Reconnecting...';
setTimeout(connect, 3000);
};
ws.onerror = () => {
statusDot.classList.remove('connected');
statusText.textContent = 'Connection error';
};
ws.onmessage = (event) => {
try {
const msg = JSON.parse(event.data);
if (msg.type === 'initial' || msg.type === 'metrics') {
const data = msg.data;
data.timestamp_ms = data.timestamp_ms || Date.now();
machines.set(data.machine_id, data);
updateUI();
}
} catch (e) {
console.error('Failed to parse message:', e);
}
};
// Send periodic pings
setInterval(() => {
if (ws.readyState === WebSocket.OPEN) {
ws.send('ping');
}
}, 30000);
}
        // Refresh "last seen" labels even when no new data is arriving.
        setInterval(updateUI, 5000);
        // Kick off the initial WebSocket connection.
        connect();
    </script>
</body>
</html>